2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPL_X86_SSE4_1_H
37 #define GMX_SIMD_IMPL_X86_SSE4_1_H
40 #include <smmintrin.h>
/* x86 SSE4.1 SIMD instruction wrappers
 *
 * Please see documentation in gromacs/simd/simd.h for the available
 * defines.
 */
49 /* Inherit most of SSE4.1 from SSE2 */
50 #include "gromacs/simd/impl_x86_sse2/impl_x86_sse2.h"
51 /* Increment over SSE2 capabilities */
52 #define GMX_SIMD_X86_SSE4_1_OR_HIGHER
55 /* Override capability definitions from SSE2 */
56 #define GMX_SIMD4_HAVE_FLOAT_DOTPRODUCT3
58 /* Almost all SSE4.1 instructions already exist in SSE2, but a few of them
59 * can be implemented more efficiently in SSE4.1.
61 #undef gmx_simd_round_f
62 #define gmx_simd_round_f(x) _mm_round_ps(x, _MM_FROUND_NINT)
63 #undef gmx_simd_trunc_f
64 #define gmx_simd_trunc_f(x) _mm_round_ps(x, _MM_FROUND_TRUNC)
65 #undef gmx_simd_round_d
66 #define gmx_simd_round_d(x) _mm_round_pd(x, _MM_FROUND_NINT)
67 #undef gmx_simd_trunc_d
68 #define gmx_simd_trunc_d(x) _mm_round_pd(x, _MM_FROUND_TRUNC)
70 #undef gmx_simd_extract_fi
71 #define gmx_simd_extract_fi _mm_extract_epi32
72 #undef gmx_simd_mul_fi
73 #define gmx_simd_mul_fi _mm_mullo_epi32
75 #undef gmx_simd_extract_di
76 #define gmx_simd_extract_di _mm_extract_epi32
77 #undef gmx_simd_mul_di
78 #define gmx_simd_mul_di _mm_mullo_epi32
80 #undef gmx_simd_blendv_f
81 #define gmx_simd_blendv_f _mm_blendv_ps
82 #undef gmx_simd_blendv_d
83 #define gmx_simd_blendv_d _mm_blendv_pd
85 #undef gmx_simd_reduce_f
86 #define gmx_simd_reduce_f(a) gmx_simd_reduce_f_sse4_1(a)
87 #undef gmx_simd_reduce_d
88 #define gmx_simd_reduce_d(a) gmx_simd_reduce_d_sse4_1(a)
90 #undef gmx_simd_blendv_fi
91 #define gmx_simd_blendv_fi _mm_blendv_epi8
92 #undef gmx_simd_blendv_di
93 #define gmx_simd_blendv_di _mm_blendv_epi8
95 #undef gmx_simd4_dotproduct3_f
96 #define gmx_simd4_dotproduct3_f gmx_simd4_dotproduct3_f_sse4_1
98 /* SIMD reduction function */
99 static gmx_inline float gmx_simdcall
100 gmx_simd_reduce_f_sse4_1(__m128 a)
104 a = _mm_hadd_ps(a, a);
105 a = _mm_hadd_ps(a, a);
110 /* SIMD4 Dotproduct helper function */
111 static gmx_inline float gmx_simdcall
112 gmx_simd4_dotproduct3_f_sse4_1(__m128 a, __m128 b)
115 _MM_EXTRACT_FLOAT(f, _mm_dp_ps(a, b, 0x71), 0);
119 static gmx_inline double gmx_simdcall
120 gmx_simd_reduce_d_sse4_1(__m128d a)
124 a = _mm_hadd_pd(a, a);
129 #endif /* GMX_SIMD_IMPL_X86_SSE4_1_H */