/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
36 #ifndef GMX_SIMD_IMPL_X86_SSE2_H
37 #define GMX_SIMD_IMPL_X86_SSE2_H
#include <emmintrin.h>
/* Set capabilities that can be inherited */
#define GMX_SIMD_X86_SSE2_OR_HIGHER

/* x86 SSE2 SIMD instruction wrappers
 *
 * Please see documentation in gromacs/simd/simd.h for defines.
 */

/* Capability definitions for SSE2 */
#define GMX_SIMD_HAVE_FLOAT
#define GMX_SIMD_HAVE_DOUBLE
#define GMX_SIMD_HAVE_HARDWARE
#define GMX_SIMD_HAVE_LOADU
#define GMX_SIMD_HAVE_STOREU
#define GMX_SIMD_HAVE_LOGICAL
#undef  GMX_SIMD_HAVE_FMA
#undef  GMX_SIMD_HAVE_FRACTION
#define GMX_SIMD_HAVE_FINT32
#define GMX_SIMD_HAVE_FINT32_EXTRACT     /* No SSE2 instruction, but use shifts */
#define GMX_SIMD_HAVE_FINT32_LOGICAL
#define GMX_SIMD_HAVE_FINT32_ARITHMETICS
#define GMX_SIMD_HAVE_DINT32
#define GMX_SIMD_HAVE_DINT32_EXTRACT     /* No SSE2 instruction, but use shifts */
#define GMX_SIMD_HAVE_DINT32_LOGICAL
#define GMX_SIMD_HAVE_DINT32_ARITHMETICS
#define GMX_SIMD4_HAVE_FLOAT
#undef  GMX_SIMD4_HAVE_DOUBLE

/* Implementation details: number of elements per SIMD variable, and the
 * settings used for the approximate rsqrt/rcp operations.
 */
#define GMX_SIMD_FLOAT_WIDTH         4
#define GMX_SIMD_DOUBLE_WIDTH        2
#define GMX_SIMD_FINT32_WIDTH        4
#define GMX_SIMD_DINT32_WIDTH        2
#define GMX_SIMD_RSQRT_BITS         11
#define GMX_SIMD_RCP_BITS           11
/****************************************************
 *      SINGLE PRECISION SIMD IMPLEMENTATION        *
 ****************************************************/
#define gmx_simd_float_t          __m128
#define gmx_simd_load_f           _mm_load_ps
#define gmx_simd_load1_f          _mm_load1_ps
#define gmx_simd_set1_f           _mm_set1_ps
#define gmx_simd_store_f          _mm_store_ps
#define gmx_simd_loadu_f          _mm_loadu_ps
#define gmx_simd_storeu_f         _mm_storeu_ps
#define gmx_simd_setzero_f        _mm_setzero_ps
#define gmx_simd_add_f            _mm_add_ps
#define gmx_simd_sub_f            _mm_sub_ps
#define gmx_simd_mul_f            _mm_mul_ps
/* No FMA here (GMX_SIMD_HAVE_FMA is undefined): the fused operations are
 * emulated with a separate multiply followed by an add/subtract.
 *   fmadd = a*b+c, fmsub = a*b-c, fnmadd = -a*b+c, fnmsub = -a*b-c
 */
#define gmx_simd_fmadd_f(a, b, c)  _mm_add_ps(_mm_mul_ps(a, b), c)
#define gmx_simd_fmsub_f(a, b, c)  _mm_sub_ps(_mm_mul_ps(a, b), c)
#define gmx_simd_fnmadd_f(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b))
#define gmx_simd_fnmsub_f(a, b, c) _mm_sub_ps(_mm_setzero_ps(), gmx_simd_fmadd_f(a, b, c))
#define gmx_simd_and_f            _mm_and_ps
#define gmx_simd_andnot_f         _mm_andnot_ps
#define gmx_simd_or_f             _mm_or_ps
#define gmx_simd_xor_f            _mm_xor_ps
#define gmx_simd_rsqrt_f          _mm_rsqrt_ps
#define gmx_simd_rcp_f            _mm_rcp_ps
/* fabs/fneg clear or flip the bits set in GMX_FLOAT_NEGZERO (defined outside this file) */
#define gmx_simd_fabs_f(x)        _mm_andnot_ps(_mm_set1_ps(GMX_FLOAT_NEGZERO), x)
#define gmx_simd_fneg_f(x)        _mm_xor_ps(x, _mm_set1_ps(GMX_FLOAT_NEGZERO))
#define gmx_simd_max_f            _mm_max_ps
#define gmx_simd_min_f            _mm_min_ps
/* round/trunc go through a 32-bit integer conversion and back */
#define gmx_simd_round_f(x)       _mm_cvtepi32_ps(_mm_cvtps_epi32(x))
#define gmx_simd_trunc_f(x)       _mm_cvtepi32_ps(_mm_cvttps_epi32(x))
#define gmx_simd_fraction_f(x)    _mm_sub_ps(x, gmx_simd_trunc_f(x))
#define gmx_simd_get_exponent_f   gmx_simd_get_exponent_f_sse2
#define gmx_simd_get_mantissa_f   gmx_simd_get_mantissa_f_sse2
#define gmx_simd_set_exponent_f   gmx_simd_set_exponent_f_sse2
/* integer datatype corresponding to float: gmx_simd_fint32_t */
#define gmx_simd_fint32_t         __m128i
/* The pointer argument is parenthesized before the cast so that an
 * expression such as "p + 4" is evaluated in the caller's element type
 * rather than in units of __m128i.
 */
#define gmx_simd_load_fi(m)       _mm_load_si128((const __m128i *)(m))
#define gmx_simd_set1_fi          _mm_set1_epi32
#define gmx_simd_store_fi(m, x)   _mm_store_si128((__m128i *)(m), x)
#define gmx_simd_loadu_fi(m)      _mm_loadu_si128((const __m128i *)(m))
#define gmx_simd_storeu_fi(m, x)  _mm_storeu_si128((__m128i *)(m), x)
#define gmx_simd_setzero_fi       _mm_setzero_si128
#define gmx_simd_cvt_f2i          _mm_cvtps_epi32
#define gmx_simd_cvtt_f2i         _mm_cvttps_epi32
#define gmx_simd_cvt_i2f          _mm_cvtepi32_ps
/* Shift element i down to position 0 (4 bytes per element), then read it out */
#define gmx_simd_extract_fi(x, i) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (i)))
/* Integer logical ops on gmx_simd_fint32_t */
#define gmx_simd_slli_fi          _mm_slli_epi32
#define gmx_simd_srli_fi          _mm_srli_epi32
#define gmx_simd_and_fi           _mm_and_si128
#define gmx_simd_andnot_fi        _mm_andnot_si128
#define gmx_simd_or_fi            _mm_or_si128
#define gmx_simd_xor_fi           _mm_xor_si128
/* Integer arithmetic ops on gmx_simd_fint32_t */
#define gmx_simd_add_fi           _mm_add_epi32
#define gmx_simd_sub_fi           _mm_sub_epi32
#define gmx_simd_mul_fi           gmx_simd_mul_fi_sse2
/* Boolean & comparison operations on gmx_simd_float_t */
#define gmx_simd_fbool_t          __m128
#define gmx_simd_cmpeq_f          _mm_cmpeq_ps
#define gmx_simd_cmplt_f          _mm_cmplt_ps
#define gmx_simd_cmple_f          _mm_cmple_ps
#define gmx_simd_and_fb           _mm_and_ps
#define gmx_simd_or_fb            _mm_or_ps
#define gmx_simd_anytrue_fb       _mm_movemask_ps
#define gmx_simd_blendzero_f      _mm_and_ps
#define gmx_simd_blendnotzero_f(a, sel)   _mm_andnot_ps(sel, a)
#define gmx_simd_blendv_f(a, b, s)        _mm_or_ps(_mm_andnot_ps(s, a), _mm_and_ps(s, b))
#define gmx_simd_reduce_f(a)      gmx_simd_reduce_f_sse2(a)
/* Boolean & comparison operations on gmx_simd_fint32_t */
#define gmx_simd_fibool_t         __m128i
#define gmx_simd_cmpeq_fi         _mm_cmpeq_epi32
#define gmx_simd_cmplt_fi         _mm_cmplt_epi32
#define gmx_simd_and_fib          _mm_and_si128
#define gmx_simd_or_fib           _mm_or_si128
#define gmx_simd_anytrue_fib      _mm_movemask_epi8
#define gmx_simd_blendzero_fi     _mm_and_si128
#define gmx_simd_blendnotzero_fi(a, sel)  _mm_andnot_si128(sel, a)
#define gmx_simd_blendv_fi(a, b, s)       _mm_or_si128(_mm_andnot_si128(s, a), _mm_and_si128(s, b))
/* Conversions between different booleans */
#define gmx_simd_cvt_fb2fib       _mm_castps_si128
#define gmx_simd_cvt_fib2fb       _mm_castsi128_ps
/****************************************************
 *      DOUBLE PRECISION SIMD IMPLEMENTATION        *
 ****************************************************/
#define gmx_simd_double_t          __m128d
#define gmx_simd_load_d            _mm_load_pd
#define gmx_simd_load1_d           _mm_load1_pd
#define gmx_simd_set1_d            _mm_set1_pd
#define gmx_simd_store_d           _mm_store_pd
#define gmx_simd_loadu_d           _mm_loadu_pd
#define gmx_simd_storeu_d          _mm_storeu_pd
#define gmx_simd_setzero_d         _mm_setzero_pd
#define gmx_simd_add_d             _mm_add_pd
#define gmx_simd_sub_d             _mm_sub_pd
#define gmx_simd_mul_d             _mm_mul_pd
/* No FMA here (GMX_SIMD_HAVE_FMA is undefined): the fused operations are
 * emulated with a separate multiply followed by an add/subtract.
 *   fmadd = a*b+c, fmsub = a*b-c, fnmadd = -a*b+c, fnmsub = -a*b-c
 */
#define gmx_simd_fmadd_d(a, b, c)  _mm_add_pd(_mm_mul_pd(a, b), c)
#define gmx_simd_fmsub_d(a, b, c)  _mm_sub_pd(_mm_mul_pd(a, b), c)
#define gmx_simd_fnmadd_d(a, b, c) _mm_sub_pd(c, _mm_mul_pd(a, b))
#define gmx_simd_fnmsub_d(a, b, c) _mm_sub_pd(_mm_setzero_pd(), gmx_simd_fmadd_d(a, b, c))
#define gmx_simd_and_d             _mm_and_pd
#define gmx_simd_andnot_d          _mm_andnot_pd
#define gmx_simd_or_d              _mm_or_pd
#define gmx_simd_xor_d             _mm_xor_pd
/* The rsqrt/rcp approximations are computed in single precision: the
 * argument is converted to float, looked up, and converted back.
 */
#define gmx_simd_rsqrt_d(x)        _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(x)))
/* Don't use FMA for sqrt N-R iterations - this saves 1 instruction without FMA hardware */
#define gmx_simd_rcp_d(x)          _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(x)))
/* fabs/fneg clear or flip the bits set in GMX_DOUBLE_NEGZERO (defined outside this file) */
#define gmx_simd_fabs_d(x)         _mm_andnot_pd(_mm_set1_pd(GMX_DOUBLE_NEGZERO), x)
#define gmx_simd_fneg_d(x)         _mm_xor_pd(x, _mm_set1_pd(GMX_DOUBLE_NEGZERO))
#define gmx_simd_max_d             _mm_max_pd
#define gmx_simd_min_d             _mm_min_pd
/* round/trunc go through a 32-bit integer conversion and back */
#define gmx_simd_round_d(x)        _mm_cvtepi32_pd(_mm_cvtpd_epi32(x))
#define gmx_simd_trunc_d(x)        _mm_cvtepi32_pd(_mm_cvttpd_epi32(x))
#define gmx_simd_fraction_d(x)     _mm_sub_pd(x, gmx_simd_trunc_d(x))
#define gmx_simd_get_exponent_d    gmx_simd_get_exponent_d_sse2
#define gmx_simd_get_mantissa_d    gmx_simd_get_mantissa_d_sse2
#define gmx_simd_set_exponent_d    gmx_simd_set_exponent_d_sse2
/* integer datatype corresponding to double: gmx_simd_dint32_t */
#define gmx_simd_dint32_t          __m128i
/* Only the low 64 bits (two 32-bit integers) are loaded or stored. The
 * pointer argument is parenthesized before the cast so that an expression
 * such as "p + 2" is evaluated in the caller's element type rather than in
 * units of __m128i.
 */
#define gmx_simd_load_di(m)        _mm_loadl_epi64((const __m128i *)(m))
#define gmx_simd_set1_di           _mm_set1_epi32
#define gmx_simd_store_di(m, x)    _mm_storel_epi64((__m128i *)(m), x)
#define gmx_simd_loadu_di(m)       _mm_loadl_epi64((const __m128i *)(m))
#define gmx_simd_storeu_di(m, x)   _mm_storel_epi64((__m128i *)(m), x)
#define gmx_simd_setzero_di        _mm_setzero_si128
#define gmx_simd_cvt_d2i           _mm_cvtpd_epi32
#define gmx_simd_cvtt_d2i          _mm_cvttpd_epi32
#define gmx_simd_cvt_i2d           _mm_cvtepi32_pd
/* Shift element i down to position 0 (4 bytes per element), then read it out */
#define gmx_simd_extract_di(x, i)  _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (i)))
/* Integer logical ops on gmx_simd_dint32_t */
#define gmx_simd_slli_di           _mm_slli_epi32
#define gmx_simd_srli_di           _mm_srli_epi32
#define gmx_simd_and_di            _mm_and_si128
#define gmx_simd_andnot_di         _mm_andnot_si128
#define gmx_simd_or_di             _mm_or_si128
#define gmx_simd_xor_di            _mm_xor_si128
/* Integer arithmetic ops on integer datatype corresponding to double */
#define gmx_simd_add_di            _mm_add_epi32
#define gmx_simd_sub_di            _mm_sub_epi32
#define gmx_simd_mul_di            gmx_simd_mul_di_sse2
/* Boolean & comparison operations on gmx_simd_double_t */
#define gmx_simd_dbool_t           __m128d
#define gmx_simd_cmpeq_d           _mm_cmpeq_pd
#define gmx_simd_cmplt_d           _mm_cmplt_pd
#define gmx_simd_cmple_d           _mm_cmple_pd
#define gmx_simd_and_db            _mm_and_pd
#define gmx_simd_or_db             _mm_or_pd
#define gmx_simd_anytrue_db        _mm_movemask_pd
#define gmx_simd_blendzero_d       _mm_and_pd
#define gmx_simd_blendnotzero_d(a, sel)   _mm_andnot_pd(sel, a)
#define gmx_simd_blendv_d(a, b, sel)      _mm_or_pd(_mm_andnot_pd(sel, a), _mm_and_pd(sel, b))
#define gmx_simd_reduce_d(a)       gmx_simd_reduce_d_sse2(a)

/* Boolean & comparison operations on gmx_simd_dint32_t */
#define gmx_simd_dibool_t          __m128i
#define gmx_simd_cmpeq_di          _mm_cmpeq_epi32
#define gmx_simd_cmplt_di          _mm_cmplt_epi32
#define gmx_simd_and_dib           _mm_and_si128
#define gmx_simd_or_dib            _mm_or_si128
/* Only elements 0 and 1 are valid: copy them over 2 and 3 before testing */
#define gmx_simd_anytrue_dib(x)    _mm_movemask_epi8(_mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 1, 0)))
#define gmx_simd_blendzero_di      _mm_and_si128
#define gmx_simd_blendnotzero_di(a, sel)  _mm_andnot_si128(sel, a)
#define gmx_simd_blendv_di(a, b, sel)     _mm_or_si128(_mm_andnot_si128(sel, a), _mm_and_si128(sel, b))
/* A double boolean is 64 bits per element and an integer boolean 32 bits,
 * so the conversions pick every other 32-bit word or duplicate each word.
 */
#define gmx_simd_cvt_db2dib(x)     _mm_shuffle_epi32(_mm_castpd_si128(x), _MM_SHUFFLE(2, 0, 2, 0))
#define gmx_simd_cvt_dib2db(x)     _mm_castsi128_pd(_mm_shuffle_epi32(x, _MM_SHUFFLE(1, 1, 0, 0)))
/* Float/double conversion.
 * cvt_f2dd writes elements 0,1 of f to *d0 and elements 2,3 to *d1; note
 * that f is evaluated twice. cvt_dd2f packs d0 into elements 0,1 and d1
 * into elements 2,3 of the result.
 */
#define gmx_simd_cvt_f2dd(f, d0, d1)  { *(d0) = _mm_cvtps_pd(f); *(d1) = _mm_cvtps_pd(_mm_movehl_ps(f, f)); }
#define gmx_simd_cvt_dd2f(d0, d1)     _mm_movelh_ps(_mm_cvtpd_ps(d0), _mm_cvtpd_ps(d1))
/****************************************************
 * SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
 ****************************************************/
254 static gmx_inline __m128 gmx_simdcall
255 gmx_simd_get_exponent_f_sse2(__m128 x)
257 const __m128 expmask = _mm_castsi128_ps(_mm_set1_epi32(0x7F800000));
258 const __m128i expbias = _mm_set1_epi32(127);
261 iexp = _mm_castps_si128(_mm_and_ps(x, expmask));
262 iexp = _mm_sub_epi32(_mm_srli_epi32(iexp, 23), expbias);
263 return _mm_cvtepi32_ps(iexp);
266 static gmx_inline __m128 gmx_simdcall
267 gmx_simd_get_mantissa_f_sse2(__m128 x)
269 const __m128 mantmask = _mm_castsi128_ps(_mm_set1_epi32(0x007FFFFF));
270 const __m128 one = _mm_set1_ps(1.0f);
272 x = _mm_and_ps(x, mantmask);
273 return _mm_or_ps(x, one);
276 static gmx_inline __m128 gmx_simdcall
277 gmx_simd_set_exponent_f_sse2(__m128 x)
279 const __m128i expbias = _mm_set1_epi32(127);
280 __m128i iexp = _mm_cvtps_epi32(x);
282 iexp = _mm_slli_epi32(_mm_add_epi32(iexp, expbias), 23);
283 return _mm_castsi128_ps(iexp);
286 static gmx_inline __m128i gmx_simdcall
287 gmx_simd_mul_fi_sse2(__m128i a, __m128i b)
289 __m128i a1 = _mm_srli_si128(a, 4); /* - a[3] a[2] a[1] */
290 __m128i b1 = _mm_srli_si128(b, 4); /* - b[3] b[2] b[1] */
291 __m128i c = _mm_mul_epu32(a, b);
292 __m128i c1 = _mm_mul_epu32(a1, b1);
294 c = _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 1, 2, 0)); /* - - a[2]*b[2] a[0]*b[0] */
295 c1 = _mm_shuffle_epi32(c1, _MM_SHUFFLE(3, 1, 2, 0)); /* - - a[3]*b[3] a[1]*b[1] */
297 return _mm_unpacklo_epi32(c, c1);
300 static gmx_inline float gmx_simdcall
301 gmx_simd_reduce_f_sse2(__m128 a)
305 b = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)));
306 b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 3, 2, 1)));
/****************************************************
 * DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
 ****************************************************/
314 static gmx_inline __m128d gmx_simdcall
315 gmx_simd_get_exponent_d_sse2(__m128d x)
317 /* Don't use _mm_set1_epi64x() - on MSVC it is only supported for 64-bit builds */
318 const __m128d expmask = _mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
319 const __m128i expbias = _mm_set1_epi32(1023);
322 iexp = _mm_castpd_si128(_mm_and_pd(x, expmask));
323 iexp = _mm_sub_epi32(_mm_srli_epi64(iexp, 52), expbias);
324 iexp = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(3, 1, 2, 0) );
325 return _mm_cvtepi32_pd(iexp);
328 static gmx_inline __m128d gmx_simdcall
329 gmx_simd_get_mantissa_d_sse2(__m128d x)
331 /* Don't use _mm_set1_epi64x() - on MSVC it is only supported for 64-bit builds */
332 const __m128d mantmask = _mm_castsi128_pd( _mm_set_epi32(0x000FFFFF, 0xFFFFFFFF, 0x000FFFFF, 0xFFFFFFFF) );
333 const __m128d one = _mm_set1_pd(1.0);
335 x = _mm_and_pd(x, mantmask);
336 return _mm_or_pd(x, one);
339 static gmx_inline __m128d gmx_simdcall
340 gmx_simd_set_exponent_d_sse2(__m128d x)
342 const __m128i expbias = _mm_set1_epi32(1023);
343 __m128i iexp = _mm_cvtpd_epi32(x);
345 /* After conversion integers will be in slot 0,1. Move them to 0,2 so
346 * we can do a 64-bit shift and get them to the dp exponents. */
347 iexp = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(3, 1, 2, 0));
348 iexp = _mm_slli_epi64(_mm_add_epi32(iexp, expbias), 52);
349 return _mm_castsi128_pd(iexp);
352 static gmx_inline __m128i gmx_simdcall
353 gmx_simd_mul_di_sse2(__m128i a, __m128i b)
357 a = _mm_unpacklo_epi32(a, _mm_setzero_si128()); /* 0 a[1] 0 a[0] */
358 b = _mm_unpacklo_epi32(b, _mm_setzero_si128()); /* 0 b[1] 0 b[0] */
360 c = _mm_mul_epu32(a, b); /* 0 a[1]*b[1] 0 a[0]*b[0] */
361 return _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 1, 2, 0)); /* 0 0 a[1]*b[1] a[0]*b[0] */
364 static gmx_inline double gmx_simdcall
365 gmx_simd_reduce_d_sse2(__m128d a)
370 b = _mm_add_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(1, 1)));
/* Function to check whether SIMD operations have resulted in overflow.
 *
 * Reads the MXCSR register and tests the overflow flag. If the flag is set
 * it is cleared again, so the next call only reports overflows that
 * happened after this one.
 *
 * Returns 1 if the overflow flag was set, 0 otherwise.
 */
static int
gmx_simd_check_and_reset_overflow(void)
{
    /* The overflow flag is bit 3 in the register */
    const unsigned int overflow_flag = 0x0008u;
    unsigned int       mxcsr         = _mm_getcsr();

    if ((mxcsr & overflow_flag) == 0)
    {
        return 0;
    }

    /* Set the overflow flag to zero. Only that one bit is masked out, so
     * every other bit is written back exactly as it was read (a 0xFFF7
     * mask would also zero all bits above bit 15).
     */
    _mm_setcsr(mxcsr & ~overflow_flag);
    return 1;
}
/* SSE2 is already 4-wide in single, so we just reuse float datatype for SIMD4.
 * SSE2 cannot do double-precision SIMD4.
 */
#define gmx_simd4_float_t                gmx_simd_float_t
#define gmx_simd4_load_f                 gmx_simd_load_f
#define gmx_simd4_load1_f                gmx_simd_load1_f
#define gmx_simd4_set1_f                 gmx_simd_set1_f
#define gmx_simd4_store_f                gmx_simd_store_f
#define gmx_simd4_loadu_f                gmx_simd_loadu_f
#define gmx_simd4_storeu_f               gmx_simd_storeu_f
#define gmx_simd4_setzero_f              gmx_simd_setzero_f
#define gmx_simd4_add_f                  gmx_simd_add_f
#define gmx_simd4_sub_f                  gmx_simd_sub_f
#define gmx_simd4_mul_f                  gmx_simd_mul_f
#define gmx_simd4_fmadd_f                gmx_simd_fmadd_f
#define gmx_simd4_fmsub_f                gmx_simd_fmsub_f
#define gmx_simd4_fnmadd_f               gmx_simd_fnmadd_f
#define gmx_simd4_fnmsub_f               gmx_simd_fnmsub_f
#define gmx_simd4_and_f                  gmx_simd_and_f
#define gmx_simd4_andnot_f               gmx_simd_andnot_f
#define gmx_simd4_or_f                   gmx_simd_or_f
#define gmx_simd4_xor_f                  gmx_simd_xor_f
#define gmx_simd4_rsqrt_f                gmx_simd_rsqrt_f
#define gmx_simd4_fabs_f                 gmx_simd_fabs_f
#define gmx_simd4_fneg_f                 gmx_simd_fneg_f
#define gmx_simd4_max_f                  gmx_simd_max_f
#define gmx_simd4_min_f                  gmx_simd_min_f
#define gmx_simd4_round_f                gmx_simd_round_f
#define gmx_simd4_trunc_f                gmx_simd_trunc_f
/* The 3-element dot product has no plain-SIMD counterpart to alias; it has its own helper */
#define gmx_simd4_dotproduct3_f          gmx_simd4_dotproduct3_f_sse2
#define gmx_simd4_fbool_t                gmx_simd_fbool_t
#define gmx_simd4_cmpeq_f                gmx_simd_cmpeq_f
#define gmx_simd4_cmplt_f                gmx_simd_cmplt_f
#define gmx_simd4_cmple_f                gmx_simd_cmple_f
#define gmx_simd4_and_fb                 gmx_simd_and_fb
#define gmx_simd4_or_fb                  gmx_simd_or_fb
#define gmx_simd4_anytrue_fb             gmx_simd_anytrue_fb
#define gmx_simd4_blendzero_f            gmx_simd_blendzero_f
#define gmx_simd4_blendnotzero_f         gmx_simd_blendnotzero_f
#define gmx_simd4_blendv_f               gmx_simd_blendv_f
#define gmx_simd4_reduce_f               gmx_simd_reduce_f
440 /* SIMD4 Dotproduct helper function */
441 static gmx_inline float gmx_simdcall
442 gmx_simd4_dotproduct3_f_sse2(__m128 a, __m128 b)
446 a = _mm_mul_ps(a, b);
447 c = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 2, 1)));
448 c = _mm_add_ps(c, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 3, 2)));
453 #endif /* GMX_SIMD_IMPL_X86_SSE2_H */