/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
36 #ifndef GMX_SIMD_IMPL_X86_AVX_256_H
37 #define GMX_SIMD_IMPL_X86_AVX_256_H
40 #include <immintrin.h>
/* It is cleaner to start the AVX implementation from scratch rather than
 * first inheriting from SSE4.1, which in turn inherits from SSE2. However,
 * the capabilities still form a superset.
 */
#define GMX_SIMD_X86_SSE2_OR_HIGHER
#define GMX_SIMD_X86_SSE4_1_OR_HIGHER
#define GMX_SIMD_X86_AVX_256_OR_HIGHER

/* x86 256-bit AVX SIMD instruction wrappers
 *
 * Please see documentation in gromacs/simd/simd.h for defines.
 */

/* Capability definitions for 256-bit AVX - no inheritance from SSE */
#define GMX_SIMD_HAVE_FLOAT
#define GMX_SIMD_HAVE_DOUBLE
#define GMX_SIMD_HAVE_SIMD_HARDWARE
#define GMX_SIMD_HAVE_LOADU
#define GMX_SIMD_HAVE_STOREU
#define GMX_SIMD_HAVE_LOGICAL
#undef  GMX_SIMD_HAVE_FMA
#undef  GMX_SIMD_HAVE_FRACTION
#define GMX_SIMD_HAVE_FINT32
#define GMX_SIMD_HAVE_FINT32_EXTRACT     /* Emulated */
#undef  GMX_SIMD_HAVE_FINT32_LOGICAL     /* AVX1 cannot do 256-bit int shifts */
#undef  GMX_SIMD_HAVE_FINT32_ARITHMETICS /* AVX1 cannot do 256-bit int +,-,*  */
#define GMX_SIMD_HAVE_DINT32
#define GMX_SIMD_HAVE_DINT32_EXTRACT     /* Native, dint uses 128-bit SIMD */
#define GMX_SIMD_HAVE_DINT32_LOGICAL
#define GMX_SIMD_HAVE_DINT32_ARITHMETICS
#define GMX_SIMD4_HAVE_FLOAT
#define GMX_SIMD4_HAVE_DOUBLE

/* Implementation details: number of elements per SIMD register for each
 * data type, and the number of bits the rsqrt/rcp wrappers are declared
 * to deliver.
 */
#define GMX_SIMD_FLOAT_WIDTH         8
#define GMX_SIMD_DOUBLE_WIDTH        4
#define GMX_SIMD_FINT32_WIDTH        8
#define GMX_SIMD_DINT32_WIDTH        4
#define GMX_SIMD_RSQRT_BITS         11
#define GMX_SIMD_RCP_BITS           11
/****************************************************
 *      SINGLE PRECISION SIMD IMPLEMENTATION        *
 ****************************************************/
#define gmx_simd_float_t           __m256
#define gmx_simd_load_f            _mm256_load_ps
#define gmx_simd_load1_f           _mm256_broadcast_ss
#define gmx_simd_set1_f            _mm256_set1_ps
#define gmx_simd_store_f           _mm256_store_ps
#define gmx_simd_loadu_f           _mm256_loadu_ps
#define gmx_simd_storeu_f          _mm256_storeu_ps
#define gmx_simd_setzero_f         _mm256_setzero_ps
#define gmx_simd_add_f             _mm256_add_ps
#define gmx_simd_sub_f             _mm256_sub_ps
#define gmx_simd_mul_f             _mm256_mul_ps
/* No hardware FMA here: the fused forms are a multiply followed by an add/sub */
#define gmx_simd_fmadd_f(a, b, c)  _mm256_add_ps(_mm256_mul_ps(a, b), c)
#define gmx_simd_fmsub_f(a, b, c)  _mm256_sub_ps(_mm256_mul_ps(a, b), c)
#define gmx_simd_fnmadd_f(a, b, c) _mm256_sub_ps(c, _mm256_mul_ps(a, b))
#define gmx_simd_fnmsub_f(a, b, c) _mm256_sub_ps(_mm256_setzero_ps(), gmx_simd_fmadd_f(a, b, c))
#define gmx_simd_and_f             _mm256_and_ps
#define gmx_simd_andnot_f          _mm256_andnot_ps
#define gmx_simd_or_f              _mm256_or_ps
#define gmx_simd_xor_f             _mm256_xor_ps
#define gmx_simd_rsqrt_f           _mm256_rsqrt_ps
#define gmx_simd_rcp_f             _mm256_rcp_ps
/* fabs clears and fneg flips the sign bit selected by the negative-zero mask */
#define gmx_simd_fabs_f(x)         _mm256_andnot_ps(_mm256_set1_ps(GMX_FLOAT_NEGZERO), x)
#define gmx_simd_fneg_f(x)         _mm256_xor_ps(x, _mm256_set1_ps(GMX_FLOAT_NEGZERO))
#define gmx_simd_max_f             _mm256_max_ps
#define gmx_simd_min_f             _mm256_min_ps
#define gmx_simd_round_f(x)        _mm256_round_ps(x, _MM_FROUND_NINT)
#define gmx_simd_trunc_f(x)        _mm256_round_ps(x, _MM_FROUND_TRUNC)
#define gmx_simd_fraction_f(x)     _mm256_sub_ps(x, gmx_simd_trunc_f(x))
#define gmx_simd_get_exponent_f    gmx_simd_get_exponent_f_avx_256
#define gmx_simd_get_mantissa_f    gmx_simd_get_mantissa_f_avx_256
#define gmx_simd_set_exponent_f    gmx_simd_set_exponent_f_avx_256
/* integer datatype corresponding to float: gmx_simd_fint32_t
 *
 * Loads and stores go through the float load/store intrinsics plus a
 * bit-preserving cast. The pointer argument is parenthesized so that an
 * expression such as (p + 8) is cast as a whole.
 */
#define gmx_simd_fint32_t          __m256i
#define gmx_simd_load_fi(m)        _mm256_castps_si256(_mm256_load_ps((const float *)(m)))
#define gmx_simd_set1_fi           _mm256_set1_epi32
#define gmx_simd_store_fi(m, x)    _mm256_store_ps((float *)(m), _mm256_castsi256_ps(x))
#define gmx_simd_loadu_fi(m)       _mm256_castps_si256(_mm256_loadu_ps((const float *)(m)))
#define gmx_simd_storeu_fi(m, x)   _mm256_storeu_ps((float *)(m), _mm256_castsi256_ps(x))
#define gmx_simd_setzero_fi        _mm256_setzero_si256
#define gmx_simd_cvt_f2i           _mm256_cvtps_epi32
#define gmx_simd_cvtt_f2i          _mm256_cvttps_epi32
#define gmx_simd_cvt_i2f           _mm256_cvtepi32_ps
/* Emulated extract: pick the 128-bit half with i>>2, then the element with i&3 */
#define gmx_simd_extract_fi(x, i)  _mm_extract_epi32(_mm256_extractf128_si256(x, (i)>>2), (i)&0x3)
/* Integer logical ops on gmx_simd_fint32_t */
/* gmx_simd_slli_fi not supported */
/* gmx_simd_srli_fi not supported */
/* gmx_simd_and_fi not supported */
/* gmx_simd_andnot_fi not supported */
/* gmx_simd_or_fi not supported */
/* gmx_simd_xor_fi not supported */
/* Integer arithmetic ops on gmx_simd_fint32_t */
/* gmx_simd_add_fi not supported */
/* gmx_simd_sub_fi not supported */
/* gmx_simd_mul_fi not supported */
/* Boolean & comparison operations on gmx_simd_float_t */
#define gmx_simd_fbool_t           __m256
#define gmx_simd_cmpeq_f(a, b)     _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
#define gmx_simd_cmplt_f(a, b)     _mm256_cmp_ps(a, b, _CMP_LT_OQ)
#define gmx_simd_cmple_f(a, b)     _mm256_cmp_ps(a, b, _CMP_LE_OQ)
#define gmx_simd_and_fb            _mm256_and_ps
#define gmx_simd_or_fb             _mm256_or_ps
#define gmx_simd_anytrue_fb        _mm256_movemask_ps
#define gmx_simd_blendzero_f       _mm256_and_ps
#define gmx_simd_blendnotzero_f(a, sel)  _mm256_andnot_ps(sel, a)
#define gmx_simd_blendv_f          _mm256_blendv_ps
#define gmx_simd_reduce_f          gmx_simd_reduce_f_avx_256
/* Boolean & comparison operations on gmx_simd_fint32_t */
#define gmx_simd_fibool_t          __m256i
/* gmx_simd_cmpeq_fi not supported */
/* gmx_simd_cmplt_fi not supported */
/* gmx_simd_and_fib not supported */
/* gmx_simd_or_fib not supported */
/* gmx_simd_anytrue_fib not supported */
/* gmx_simd_blendzero_fi not supported */
/* gmx_simd_blendnotzero_fi not supported */
/* gmx_simd_blendv_fi not supported */
/* Conversions between different booleans */
#define gmx_simd_cvt_fb2fib        _mm256_castps_si256
#define gmx_simd_cvt_fib2fb        _mm256_castsi256_ps
/****************************************************
 *      DOUBLE PRECISION SIMD IMPLEMENTATION        *
 ****************************************************/
#define gmx_simd_double_t          __m256d
#define gmx_simd_load_d            _mm256_load_pd
#define gmx_simd_load1_d           _mm256_broadcast_sd
#define gmx_simd_set1_d            _mm256_set1_pd
#define gmx_simd_store_d           _mm256_store_pd
#define gmx_simd_loadu_d           _mm256_loadu_pd
#define gmx_simd_storeu_d          _mm256_storeu_pd
#define gmx_simd_setzero_d         _mm256_setzero_pd
#define gmx_simd_add_d             _mm256_add_pd
#define gmx_simd_sub_d             _mm256_sub_pd
#define gmx_simd_mul_d             _mm256_mul_pd
/* No hardware FMA here: the fused forms are a multiply followed by an add/sub */
#define gmx_simd_fmadd_d(a, b, c)  _mm256_add_pd(_mm256_mul_pd(a, b), c)
#define gmx_simd_fmsub_d(a, b, c)  _mm256_sub_pd(_mm256_mul_pd(a, b), c)
#define gmx_simd_fnmadd_d(a, b, c) _mm256_sub_pd(c, _mm256_mul_pd(a, b))
#define gmx_simd_fnmsub_d(a, b, c) _mm256_sub_pd(_mm256_setzero_pd(), gmx_simd_fmadd_d(a, b, c))
#define gmx_simd_and_d             _mm256_and_pd
#define gmx_simd_andnot_d          _mm256_andnot_pd
#define gmx_simd_or_d              _mm256_or_pd
#define gmx_simd_xor_d             _mm256_xor_pd
/* The estimates are computed in single precision (convert down, estimate,
 * convert back up), so they carry single-precision estimate accuracy only.
 */
#define gmx_simd_rsqrt_d(x)        _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(x)))
#define gmx_simd_rcp_d(x)          _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(x)))
/* fabs clears and fneg flips the sign bit selected by the negative-zero mask */
#define gmx_simd_fabs_d(x)         _mm256_andnot_pd(_mm256_set1_pd(-0.0), x)
#define gmx_simd_fneg_d(x)         _mm256_xor_pd(x, _mm256_set1_pd(-0.0))
#define gmx_simd_max_d             _mm256_max_pd
#define gmx_simd_min_d             _mm256_min_pd
#define gmx_simd_round_d(x)        _mm256_round_pd(x, _MM_FROUND_NINT)
#define gmx_simd_trunc_d(x)        _mm256_round_pd(x, _MM_FROUND_TRUNC)
#define gmx_simd_fraction_d(x)     _mm256_sub_pd(x, gmx_simd_trunc_d(x))
#define gmx_simd_get_exponent_d    gmx_simd_get_exponent_d_avx_256
#define gmx_simd_get_mantissa_d    gmx_simd_get_mantissa_d_avx_256
#define gmx_simd_set_exponent_d    gmx_simd_set_exponent_d_avx_256
/* integer datatype corresponding to double: gmx_simd_dint32_t
 *
 * Four 32-bit integers live in a 128-bit register. The pointer argument of
 * the load/store macros is parenthesized: without that, an argument such as
 * (p + 4) would be cast first and then advanced by four whole __m128i
 * elements instead of four integers.
 */
#define gmx_simd_dint32_t          __m128i
#define gmx_simd_load_di(m)        _mm_load_si128((const __m128i *)(m))
#define gmx_simd_set1_di           _mm_set1_epi32
#define gmx_simd_store_di(m, x)    _mm_store_si128((__m128i *)(m), x)
#define gmx_simd_loadu_di(m)       _mm_loadu_si128((const __m128i *)(m))
#define gmx_simd_storeu_di(m, x)   _mm_storeu_si128((__m128i *)(m), x)
#define gmx_simd_setzero_di        _mm_setzero_si128
#define gmx_simd_cvt_d2i           _mm256_cvtpd_epi32
#define gmx_simd_cvtt_d2i          _mm256_cvttpd_epi32
#define gmx_simd_cvt_i2d           _mm256_cvtepi32_pd
#define gmx_simd_extract_di        _mm_extract_epi32
/* Integer logical ops on gmx_simd_dint32_t */
#define gmx_simd_slli_di           _mm_slli_epi32
#define gmx_simd_srli_di           _mm_srli_epi32
#define gmx_simd_and_di            _mm_and_si128
#define gmx_simd_andnot_di         _mm_andnot_si128
#define gmx_simd_or_di             _mm_or_si128
#define gmx_simd_xor_di            _mm_xor_si128
/* Integer arithmetic ops on integer datatype corresponding to double */
#define gmx_simd_add_di            _mm_add_epi32
#define gmx_simd_sub_di            _mm_sub_epi32
#define gmx_simd_mul_di            _mm_mullo_epi32
/* Boolean & comparison operations on gmx_simd_double_t */
#define gmx_simd_dbool_t           __m256d
#define gmx_simd_cmpeq_d(a, b)     _mm256_cmp_pd(a, b, _CMP_EQ_OQ)
#define gmx_simd_cmplt_d(a, b)     _mm256_cmp_pd(a, b, _CMP_LT_OQ)
#define gmx_simd_cmple_d(a, b)     _mm256_cmp_pd(a, b, _CMP_LE_OQ)
#define gmx_simd_and_db            _mm256_and_pd
#define gmx_simd_or_db             _mm256_or_pd
#define gmx_simd_anytrue_db        _mm256_movemask_pd
#define gmx_simd_blendzero_d       _mm256_and_pd
#define gmx_simd_blendnotzero_d(a, sel)  _mm256_andnot_pd(sel, a)
#define gmx_simd_blendv_d          _mm256_blendv_pd
#define gmx_simd_reduce_d          gmx_simd_reduce_d_avx_256
/* Boolean & comparison operations on gmx_simd_dint32_t */
#define gmx_simd_dibool_t          __m128i
#define gmx_simd_cmpeq_di          _mm_cmpeq_epi32
#define gmx_simd_cmplt_di          _mm_cmplt_epi32
#define gmx_simd_and_dib           _mm_and_si128
#define gmx_simd_or_dib            _mm_or_si128
#define gmx_simd_anytrue_dib       _mm_movemask_epi8
#define gmx_simd_blendzero_di      _mm_and_si128
#define gmx_simd_blendnotzero_di(a, sel)  _mm_andnot_si128(sel, a)
#define gmx_simd_blendv_di         _mm_blendv_epi8
/* Conversions between different booleans */
#define gmx_simd_cvt_db2dib        gmx_simd_cvt_db2dib_avx_256
#define gmx_simd_cvt_dib2db        gmx_simd_cvt_dib2db_avx_256
/* Float/double conversion */
#define gmx_simd_cvt_f2dd          gmx_simd_cvt_f2dd_avx_256
#define gmx_simd_cvt_dd2f          gmx_simd_cvt_dd2f_avx_256
/****************************************************
 *      SINGLE PRECISION SIMD4 IMPLEMENTATION       *
 ****************************************************/
#define gmx_simd4_float_t          __m128
#define gmx_simd4_load_f           _mm_load_ps
#define gmx_simd4_load1_f          _mm_broadcast_ss
#define gmx_simd4_set1_f           _mm_set1_ps
#define gmx_simd4_store_f          _mm_store_ps
#define gmx_simd4_loadu_f          _mm_loadu_ps
#define gmx_simd4_storeu_f         _mm_storeu_ps
#define gmx_simd4_setzero_f        _mm_setzero_ps
#define gmx_simd4_add_f            _mm_add_ps
#define gmx_simd4_sub_f            _mm_sub_ps
#define gmx_simd4_mul_f            _mm_mul_ps
/* No hardware FMA here: the fused forms are a multiply followed by an add/sub */
#define gmx_simd4_fmadd_f(a, b, c)  _mm_add_ps(_mm_mul_ps(a, b), c)
#define gmx_simd4_fmsub_f(a, b, c)  _mm_sub_ps(_mm_mul_ps(a, b), c)
#define gmx_simd4_fnmadd_f(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b))
#define gmx_simd4_fnmsub_f(a, b, c) _mm_sub_ps(_mm_setzero_ps(), gmx_simd4_fmadd_f(a, b, c))
#define gmx_simd4_and_f            _mm_and_ps
#define gmx_simd4_andnot_f         _mm_andnot_ps
#define gmx_simd4_or_f             _mm_or_ps
#define gmx_simd4_xor_f            _mm_xor_ps
#define gmx_simd4_rsqrt_f          _mm_rsqrt_ps
/* Use the same negative-zero constant as gmx_simd_fabs_f/gmx_simd_fneg_f
 * instead of a double-typed -0.0 literal, so the sign mask is built the
 * same way in every single-precision wrapper.
 */
#define gmx_simd4_fabs_f(x)        _mm_andnot_ps(_mm_set1_ps(GMX_FLOAT_NEGZERO), x)
#define gmx_simd4_fneg_f(x)        _mm_xor_ps(x, _mm_set1_ps(GMX_FLOAT_NEGZERO))
#define gmx_simd4_max_f            _mm_max_ps
#define gmx_simd4_min_f            _mm_min_ps
#define gmx_simd4_round_f(x)       _mm_round_ps(x, _MM_FROUND_NINT)
#define gmx_simd4_trunc_f(x)       _mm_round_ps(x, _MM_FROUND_TRUNC)
#define gmx_simd4_dotproduct3_f    gmx_simd4_dotproduct3_f_avx_256
#define gmx_simd4_fbool_t          __m128
#define gmx_simd4_cmpeq_f          _mm_cmpeq_ps
#define gmx_simd4_cmplt_f          _mm_cmplt_ps
#define gmx_simd4_cmple_f          _mm_cmple_ps
#define gmx_simd4_and_fb           _mm_and_ps
#define gmx_simd4_or_fb            _mm_or_ps
#define gmx_simd4_anytrue_fb       _mm_movemask_ps
#define gmx_simd4_blendzero_f      _mm_and_ps
#define gmx_simd4_blendnotzero_f(a, sel)  _mm_andnot_ps(sel, a)
#define gmx_simd4_blendv_f         _mm_blendv_ps
#define gmx_simd4_reduce_f         gmx_simd4_reduce_f_avx_256
/****************************************************
 *      DOUBLE PRECISION SIMD4 IMPLEMENTATION       *
 ****************************************************/
/* A 256-bit register holds exactly four doubles, so SIMD4 double is simply
 * an alias for the full-width double-precision implementation.
 */
#define gmx_simd4_double_t          gmx_simd_double_t
#define gmx_simd4_load_d            gmx_simd_load_d
#define gmx_simd4_load1_d           gmx_simd_load1_d
#define gmx_simd4_set1_d            gmx_simd_set1_d
#define gmx_simd4_store_d           gmx_simd_store_d
#define gmx_simd4_loadu_d           gmx_simd_loadu_d
#define gmx_simd4_storeu_d          gmx_simd_storeu_d
#define gmx_simd4_setzero_d         gmx_simd_setzero_d
#define gmx_simd4_add_d             gmx_simd_add_d
#define gmx_simd4_sub_d             gmx_simd_sub_d
#define gmx_simd4_mul_d             gmx_simd_mul_d
#define gmx_simd4_fmadd_d           gmx_simd_fmadd_d
#define gmx_simd4_fmsub_d           gmx_simd_fmsub_d
#define gmx_simd4_fnmadd_d          gmx_simd_fnmadd_d
#define gmx_simd4_fnmsub_d          gmx_simd_fnmsub_d
#define gmx_simd4_and_d             gmx_simd_and_d
#define gmx_simd4_andnot_d          gmx_simd_andnot_d
#define gmx_simd4_or_d              gmx_simd_or_d
#define gmx_simd4_xor_d             gmx_simd_xor_d
#define gmx_simd4_rsqrt_d           gmx_simd_rsqrt_d
#define gmx_simd4_fabs_d            gmx_simd_fabs_d
#define gmx_simd4_fneg_d            gmx_simd_fneg_d
#define gmx_simd4_max_d             gmx_simd_max_d
#define gmx_simd4_min_d             gmx_simd_min_d
#define gmx_simd4_round_d           gmx_simd_round_d
#define gmx_simd4_trunc_d           gmx_simd_trunc_d
#define gmx_simd4_dotproduct3_d     gmx_simd4_dotproduct3_d_avx_256
#define gmx_simd4_dbool_t           gmx_simd_dbool_t
#define gmx_simd4_cmpeq_d           gmx_simd_cmpeq_d
#define gmx_simd4_cmplt_d           gmx_simd_cmplt_d
#define gmx_simd4_cmple_d           gmx_simd_cmple_d
#define gmx_simd4_and_db            gmx_simd_and_db
#define gmx_simd4_or_db             gmx_simd_or_db
#define gmx_simd4_anytrue_db        gmx_simd_anytrue_db
#define gmx_simd4_blendzero_d       gmx_simd_blendzero_d
#define gmx_simd4_blendnotzero_d    gmx_simd_blendnotzero_d
#define gmx_simd4_blendv_d          gmx_simd_blendv_d
#define gmx_simd4_reduce_d          gmx_simd_reduce_d
/* SIMD4 float/double conversion */
#define gmx_simd4_cvt_f2d           _mm256_cvtps_pd
#define gmx_simd4_cvt_d2f           _mm256_cvtpd_ps
343 /*********************************************************
344 * SIMD SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
345 *********************************************************/
346 static gmx_inline __m256 gmx_simdcall
347 gmx_simd_get_exponent_f_avx_256(__m256 x)
349 const __m256 expmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7F800000));
350 const __m128i expbias = _mm_set1_epi32(127);
352 __m128i iexp128a, iexp128b;
354 iexp256 = _mm256_castps_si256(_mm256_and_ps(x, expmask));
355 iexp128b = _mm256_extractf128_si256(iexp256, 0x1);
356 iexp128a = _mm256_castsi256_si128(iexp256);
357 iexp128a = _mm_srli_epi32(iexp128a, 23);
358 iexp128b = _mm_srli_epi32(iexp128b, 23);
359 iexp128a = _mm_sub_epi32(iexp128a, expbias);
360 iexp128b = _mm_sub_epi32(iexp128b, expbias);
361 iexp256 = _mm256_castsi128_si256(iexp128a);
362 iexp256 = _mm256_insertf128_si256(iexp256, iexp128b, 0x1);
363 return _mm256_cvtepi32_ps(iexp256);
366 static gmx_inline __m256 gmx_simdcall
367 gmx_simd_get_mantissa_f_avx_256(__m256 x)
369 const __m256 mantmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x007FFFFF));
370 const __m256 one = _mm256_set1_ps(1.0);
372 x = _mm256_and_ps(x, mantmask);
373 return _mm256_or_ps(x, one);
376 static gmx_inline __m256 gmx_simdcall
377 gmx_simd_set_exponent_f_avx_256(__m256 x)
379 const __m128i expbias = _mm_set1_epi32(127);
381 __m128i iexp128a, iexp128b;
383 iexp256 = _mm256_cvtps_epi32(x);
384 iexp128b = _mm256_extractf128_si256(iexp256, 0x1);
385 iexp128a = _mm256_castsi256_si128(iexp256);
386 iexp128a = _mm_slli_epi32(_mm_add_epi32(iexp128a, expbias), 23);
387 iexp128b = _mm_slli_epi32(_mm_add_epi32(iexp128b, expbias), 23);
388 iexp256 = _mm256_castsi128_si256(iexp128a);
389 iexp256 = _mm256_insertf128_si256(iexp256, iexp128b, 0x1);
390 return _mm256_castsi256_ps(iexp256);
393 static gmx_inline float gmx_simdcall
394 gmx_simd_reduce_f_avx_256(__m256 a)
399 a = _mm256_hadd_ps(a, a);
400 a = _mm256_hadd_ps(a, a);
401 a0 = _mm256_castps256_ps128(a);
402 a1 = _mm256_extractf128_ps(a, 0x1);
403 a0 = _mm_add_ss(a0, a1);
404 _mm_store_ss(&f, a0);
408 /*********************************************************
409 * SIMD DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
410 *********************************************************/
411 static gmx_inline __m256d gmx_simdcall
412 gmx_simd_get_exponent_d_avx_256(__m256d x)
414 const __m256d expmask = _mm256_castsi256_pd( _mm256_set1_epi64x(0x7FF0000000000000LL));
415 const __m128i expbias = _mm_set1_epi32(1023);
417 __m128i iexp128a, iexp128b;
419 iexp256 = _mm256_castpd_si256(_mm256_and_pd(x, expmask));
420 iexp128b = _mm256_extractf128_si256(iexp256, 0x1);
421 iexp128a = _mm256_castsi256_si128(iexp256);
422 iexp128a = _mm_srli_epi64(iexp128a, 52);
423 iexp128b = _mm_srli_epi64(iexp128b, 52);
424 iexp128a = _mm_shuffle_epi32(iexp128a, _MM_SHUFFLE(1, 1, 2, 0));
425 iexp128b = _mm_shuffle_epi32(iexp128b, _MM_SHUFFLE(2, 0, 1, 1));
426 iexp128a = _mm_or_si128(iexp128a, iexp128b);
427 iexp128a = _mm_sub_epi32(iexp128a, expbias);
428 return _mm256_cvtepi32_pd(iexp128a);
431 static gmx_inline __m256d gmx_simdcall
432 gmx_simd_get_mantissa_d_avx_256(__m256d x)
434 const __m256d mantmask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x000FFFFFFFFFFFFFLL));
435 const __m256d one = _mm256_set1_pd(1.0);
437 x = _mm256_and_pd(x, mantmask);
438 return _mm256_or_pd(x, one);
441 static gmx_inline __m256d gmx_simdcall
442 gmx_simd_set_exponent_d_avx_256(__m256d x)
444 const __m128i expbias = _mm_set1_epi32(1023);
445 __m128i iexp128a, iexp128b;
447 iexp128a = _mm256_cvtpd_epi32(x);
448 iexp128a = _mm_add_epi32(iexp128a, expbias);
449 iexp128b = _mm_shuffle_epi32(iexp128a, _MM_SHUFFLE(3, 3, 2, 2));
450 iexp128a = _mm_shuffle_epi32(iexp128a, _MM_SHUFFLE(1, 1, 0, 0));
451 iexp128b = _mm_slli_epi64(iexp128b, 52);
452 iexp128a = _mm_slli_epi64(iexp128a, 52);
453 return _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(iexp128a), iexp128b, 0x1));
456 static gmx_inline double gmx_simdcall
457 gmx_simd_reduce_d_avx_256(__m256d a)
461 a = _mm256_hadd_pd(a, a);
462 a0 = _mm256_castpd256_pd128(a);
463 a1 = _mm256_extractf128_pd(a, 0x1);
464 a0 = _mm_add_sd(a0, a1);
465 _mm_store_sd(&f, a0);
469 static gmx_inline gmx_simd_dibool_t gmx_simdcall
470 gmx_simd_cvt_db2dib_avx_256(gmx_simd_dbool_t a)
472 __m128i a1 = _mm256_extractf128_si256(_mm256_castpd_si256(a), 0x1);
473 __m128i a0 = _mm256_castsi256_si128(_mm256_castpd_si256(a));
474 a0 = _mm_shuffle_epi32(a0, _MM_SHUFFLE(2, 0, 2, 0));
475 a1 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(2, 0, 2, 0));
476 return _mm_blend_epi16(a0, a1, 0xF0);
479 static gmx_inline gmx_simd_dbool_t gmx_simdcall
480 gmx_simd_cvt_dib2db_avx_256(gmx_simd_dibool_t a)
482 __m128i a1 = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 2, 2));
483 __m128i a0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 0, 0));
484 return _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(a0), a1, 0x1));
487 static gmx_inline void gmx_simdcall
488 gmx_simd_cvt_f2dd_avx_256(__m256 f, __m256d *d0, __m256d *d1)
490 *d0 = _mm256_cvtps_pd(_mm256_castps256_ps128(f));
491 *d1 = _mm256_cvtps_pd(_mm256_extractf128_ps(f, 0x1));
494 static gmx_inline __m256 gmx_simdcall
495 gmx_simd_cvt_dd2f_avx_256(__m256d d0, __m256d d1)
497 __m128 f0 = _mm256_cvtpd_ps(d0);
498 __m128 f1 = _mm256_cvtpd_ps(d1);
499 return _mm256_insertf128_ps(_mm256_castps128_ps256(f0), f1, 0x1);
502 /* SIMD4 reduce helper */
503 static gmx_inline float gmx_simdcall
504 gmx_simd4_reduce_f_avx_256(__m128 a)
507 a = _mm_hadd_ps(a, a);
508 a = _mm_hadd_ps(a, a);
513 /* SIMD4 Dotproduct helper function */
514 static gmx_inline float gmx_simdcall
515 gmx_simd4_dotproduct3_f_avx_256(__m128 a, __m128 b)
519 a = _mm_mul_ps(a, b);
520 c = _mm_add_ps(a, _mm_permute_ps(a, _MM_SHUFFLE(0, 3, 2, 1)));
521 c = _mm_add_ps(c, _mm_permute_ps(a, _MM_SHUFFLE(1, 0, 3, 2)));
526 static gmx_inline double gmx_simdcall
527 gmx_simd4_dotproduct3_d_avx_256(__m256d a, __m256d b)
531 a = _mm256_mul_pd(a, b);
532 tmp1 = _mm256_castpd256_pd128(a);
533 tmp2 = _mm256_extractf128_pd(a, 0x1);
535 tmp1 = _mm_add_pd(tmp1, _mm_permute_pd(tmp1, _MM_SHUFFLE2(0, 1)));
536 tmp1 = _mm_add_pd(tmp1, tmp2);
537 _mm_store_sd(&d, tmp1);
/* Function to check whether SIMD operations have resulted in overflow.
 *
 * Reads the MXCSR control/status register and tests the overflow flag
 * (bit 3). If the flag is set it is cleared again, so each overflow is
 * reported once.
 *
 * Returns 1 if the overflow flag was set, 0 otherwise.
 *
 * NOTE(review): the return type, the flag test and the write-back were
 * missing from the damaged text and have been reconstructed from the
 * comments and the two surviving statements - confirm against the
 * upstream file.
 */
static int
gmx_simd_check_and_reset_overflow(void)
{
    const unsigned int overflow_flag = 0x0008u;
    unsigned int       mxcsr;
    int                sse_overflow;

    mxcsr = _mm_getcsr();
    /* The overflow flag is bit 3 in the register */
    if (mxcsr & overflow_flag)
    {
        sse_overflow = 1;
        /* Set the overflow flag to zero, leaving every other bit untouched */
        mxcsr = mxcsr & ~overflow_flag;
        _mm_setcsr(mxcsr);
    }
    else
    {
        sse_overflow = 0;
    }
    return sse_overflow;
}
565 #endif /* GMX_SIMD_IMPL_X86_AVX_256_H */