/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This file is part of GROMACS.
 *
 * Written by the Gromacs development team under coordination of
 * David van der Spoel, Berk Hess, and Erik Lindahl.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org
 *
 * And Hey:
 * Gnomes, ROck Monsters And Chili Sauce
 */
#ifndef _gmx_math_x86_avx_256_single_h_
#define _gmx_math_x86_avx_256_single_h_

#include <math.h>

#include "gmx_x86_avx_256.h"

#ifndef M_PI
#    define M_PI 3.14159265358979323846264338327950288
#endif
/************************
 *                      *
 * Simple math routines *
 *                      *
 ************************/
/* 1.0/sqrt(x), 256-bit wide version */
static gmx_inline __m256
gmx_mm256_invsqrt_ps(__m256 x)
{
    const __m256 half  = _mm256_set1_ps(0.5f);
    const __m256 three = _mm256_set1_ps(3.0f);

    __m256 lu = _mm256_rsqrt_ps(x);

    return _mm256_mul_ps(half, _mm256_mul_ps(_mm256_sub_ps(three, _mm256_mul_ps(_mm256_mul_ps(lu, lu), x)), lu));
}
/* 1.0/sqrt(x), 128-bit wide version */
static gmx_inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half  = _mm_set1_ps(0.5f);
    const __m128 three = _mm_set1_ps(3.0f);

    __m128 lu = _mm_rsqrt_ps(x);

    return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
}
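/* Comment added for clarity (not in the original header): the hardware
 * rsqrt estimate is only accurate to roughly 11-12 bits, so both invsqrt
 * versions above refine the estimate lu with one Newton-Raphson step for
 * f(y) = 1/y^2 - x,
 *
 *     lu' = 0.5*lu*(3 - x*lu*lu),
 *
 * which roughly doubles the number of correct bits to ~22-23. */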
/* sqrt(x) (256 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
static gmx_inline __m256
gmx_mm256_sqrt_ps(__m256 x)
{
    __m256 mask;
    __m256 res;

    mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
    res  = _mm256_andnot_ps(mask, gmx_mm256_invsqrt_ps(x));

    res  = _mm256_mul_ps(x, res);

    return res;
}
/* sqrt(x) (128 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
static gmx_inline __m128
gmx_mm_sqrt_ps(__m128 x)
{
    __m128 mask;
    __m128 res;

    mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
    res  = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));

    res  = _mm_mul_ps(x, res);

    return res;
}
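/* Comment added for clarity (not in the original header): sqrt(x) is formed
 * as x*(1/sqrt(x)). Since invsqrt(0) would be Inf and 0*Inf = NaN, the mask
 * zeroes the reciprocal square root in lanes where x==0, so that sqrt(0)
 * correctly returns 0. */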
/* 1.0/x, 256-bit wide */
static gmx_inline __m256
gmx_mm256_inv_ps(__m256 x)
{
    const __m256 two = _mm256_set1_ps(2.0f);

    __m256 lu = _mm256_rcp_ps(x);

    return _mm256_mul_ps(lu, _mm256_sub_ps(two, _mm256_mul_ps(lu, x)));
}
/* 1.0/x, 128-bit wide */
static gmx_inline __m128
gmx_mm_inv_ps(__m128 x)
{
    const __m128 two = _mm_set1_ps(2.0f);

    __m128 lu = _mm_rcp_ps(x);

    return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
}
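/* Comment added for clarity (not in the original header): analogous to
 * invsqrt, the ~12-bit rcp estimate is polished with one Newton-Raphson
 * step for f(y) = 1/y - x,
 *
 *     lu' = lu*(2 - x*lu),
 *
 * again roughly doubling the number of correct bits. */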
static gmx_inline __m256
gmx_mm256_abs_ps(__m256 x)
{
    const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );

    return _mm256_and_ps(x, signmask);
}
static gmx_inline __m128
gmx_mm_abs_ps(__m128 x)
{
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );

    return _mm_and_ps(x, signmask);
}
static gmx_inline __m256
gmx_mm256_log_ps(__m256 x)
{
    const __m256  expmask    = _mm256_castsi256_ps( _mm256_set1_epi32(0x7F800000) );
    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
    const __m256  half       = _mm256_set1_ps(0.5f);
    const __m256  one        = _mm256_set1_ps(1.0f);
    const __m256  invsq2     = _mm256_set1_ps(1.0f/sqrt(2.0f));
    const __m256  corr1      = _mm256_set1_ps(-2.12194440e-4f);
    const __m256  corr2      = _mm256_set1_ps(0.693359375f);

    const __m256  CA_1       = _mm256_set1_ps(0.070376836292f);
    const __m256  CB_0       = _mm256_set1_ps(1.6714950086782716f);
    const __m256  CB_1       = _mm256_set1_ps(-2.452088066061482f);
    const __m256  CC_0       = _mm256_set1_ps(1.5220770854701728f);
    const __m256  CC_1       = _mm256_set1_ps(-1.3422238433233642f);
    const __m256  CD_0       = _mm256_set1_ps(1.386218787509749f);
    const __m256  CD_1       = _mm256_set1_ps(0.35075468953796346f);
    const __m256  CE_0       = _mm256_set1_ps(1.3429983063133937f);
    const __m256  CE_1       = _mm256_set1_ps(1.807420826584643f);

    __m256        fexp, mask;
    __m256i       iexp, imask;
    __m128i       iexp128a, iexp128b;
    __m128i       imask128a, imask128b;
    __m256        x2, y;
    __m256        pA, pB, pC, pD, pE, tB, tC, tD, tE;

    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
    fexp      = _mm256_and_ps(x, expmask);
    iexp      = _mm256_castps_si256(fexp);

    iexp128b  = _mm256_extractf128_si256(iexp, 0x1);
    iexp128a  = _mm256_castsi256_si128(iexp);

    iexp128a  = _mm_srli_epi32(iexp128a, 23);
    iexp128b  = _mm_srli_epi32(iexp128b, 23);
    iexp128a  = _mm_sub_epi32(iexp128a, expbase_m1);
    iexp128b  = _mm_sub_epi32(iexp128b, expbase_m1);

    x         = _mm256_andnot_ps(expmask, x);
    x         = _mm256_or_ps(x, one);
    x         = _mm256_mul_ps(x, half);

    mask      = _mm256_cmp_ps(x, invsq2, _CMP_LT_OQ);

    x         = _mm256_add_ps(x, _mm256_and_ps(mask, x));
    x         = _mm256_sub_ps(x, one);

    imask     = _mm256_castps_si256(mask);

    imask128b = _mm256_extractf128_si256(imask, 0x1);
    imask128a = _mm256_castsi256_si128(imask);

    iexp128a  = _mm_add_epi32(iexp128a, imask128a); /* 0xFFFFFFFF = -1 as int */
    iexp128b  = _mm_add_epi32(iexp128b, imask128b);

    iexp      = _mm256_castsi128_si256(iexp128a);
    iexp      = _mm256_insertf128_si256(iexp, iexp128b, 0x1);

    x2        = _mm256_mul_ps(x, x);

    pA        = _mm256_mul_ps(CA_1, x);
    pB        = _mm256_mul_ps(CB_1, x);
    pC        = _mm256_mul_ps(CC_1, x);
    pD        = _mm256_mul_ps(CD_1, x);
    pE        = _mm256_mul_ps(CE_1, x);
    tB        = _mm256_add_ps(CB_0, x2);
    tC        = _mm256_add_ps(CC_0, x2);
    tD        = _mm256_add_ps(CD_0, x2);
    tE        = _mm256_add_ps(CE_0, x2);
    pB        = _mm256_add_ps(pB, tB);
    pC        = _mm256_add_ps(pC, tC);
    pD        = _mm256_add_ps(pD, tD);
    pE        = _mm256_add_ps(pE, tE);

    pA        = _mm256_mul_ps(pA, pB);
    pC        = _mm256_mul_ps(pC, pD);
    pE        = _mm256_mul_ps(pE, x2);
    pA        = _mm256_mul_ps(pA, pC);
    y         = _mm256_mul_ps(pA, pE);

    fexp      = _mm256_cvtepi32_ps(iexp);
    y         = _mm256_add_ps(y, _mm256_mul_ps(fexp, corr1));

    y         = _mm256_sub_ps(y, _mm256_mul_ps(half, x2));
    x2        = _mm256_add_ps(x, y);

    x2        = _mm256_add_ps(x2, _mm256_mul_ps(fexp, corr2));

    return x2;
}
static gmx_inline __m128
gmx_mm_log_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7F800000) );
    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
    const __m128  half       = _mm_set1_ps(0.5f);
    const __m128  one        = _mm_set1_ps(1.0f);
    const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
    const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
    const __m128  corr2      = _mm_set1_ps(0.693359375f);

    const __m128  CA_1       = _mm_set1_ps(0.070376836292f);
    const __m128  CB_0       = _mm_set1_ps(1.6714950086782716f);
    const __m128  CB_1       = _mm_set1_ps(-2.452088066061482f);
    const __m128  CC_0       = _mm_set1_ps(1.5220770854701728f);
    const __m128  CC_1       = _mm_set1_ps(-1.3422238433233642f);
    const __m128  CD_0       = _mm_set1_ps(1.386218787509749f);
    const __m128  CD_1       = _mm_set1_ps(0.35075468953796346f);
    const __m128  CE_0       = _mm_set1_ps(1.3429983063133937f);
    const __m128  CE_1       = _mm_set1_ps(1.807420826584643f);

    __m128        fexp, mask;
    __m128i       iexp;
    __m128        x2, y;
    __m128        pA, pB, pC, pD, pE, tB, tC, tD, tE;

    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
    fexp = _mm_and_ps(x, expmask);
    iexp = gmx_mm_castps_si128(fexp);
    iexp = _mm_srli_epi32(iexp, 23);
    iexp = _mm_sub_epi32(iexp, expbase_m1);

    x    = _mm_andnot_ps(expmask, x);
    x    = _mm_or_ps(x, one);
    x    = _mm_mul_ps(x, half);

    mask = _mm_cmplt_ps(x, invsq2);

    x    = _mm_add_ps(x, _mm_and_ps(mask, x));
    x    = _mm_sub_ps(x, one);
    iexp = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */

    x2   = _mm_mul_ps(x, x);

    pA   = _mm_mul_ps(CA_1, x);
    pB   = _mm_mul_ps(CB_1, x);
    pC   = _mm_mul_ps(CC_1, x);
    pD   = _mm_mul_ps(CD_1, x);
    pE   = _mm_mul_ps(CE_1, x);
    tB   = _mm_add_ps(CB_0, x2);
    tC   = _mm_add_ps(CC_0, x2);
    tD   = _mm_add_ps(CD_0, x2);
    tE   = _mm_add_ps(CE_0, x2);
    pB   = _mm_add_ps(pB, tB);
    pC   = _mm_add_ps(pC, tC);
    pD   = _mm_add_ps(pD, tD);
    pE   = _mm_add_ps(pE, tE);

    pA   = _mm_mul_ps(pA, pB);
    pC   = _mm_mul_ps(pC, pD);
    pE   = _mm_mul_ps(pE, x2);
    pA   = _mm_mul_ps(pA, pC);
    y    = _mm_mul_ps(pA, pE);

    fexp = _mm_cvtepi32_ps(iexp);
    y    = _mm_add_ps(y, _mm_mul_ps(fexp, corr1));

    y    = _mm_sub_ps(y, _mm_mul_ps(half, x2));
    x2   = _mm_add_ps(x, y);

    x2   = _mm_add_ps(x2, _mm_mul_ps(fexp, corr2));

    return x2;
}
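/* Comment added for clarity (not in the original header): both log routines
 * above use the cephes-style decomposition x = m * 2^e with m forced into
 * [0.5,1[ (doubling m and decrementing e when m < 1/sqrt(2)), so that
 * log(x) = log(m) + e*log(2). log(2) is split as corr2 + corr1
 * (0.693359375 - 2.12194440e-4): the large part has few mantissa bits and
 * multiplies the integer exponent exactly, preserving precision in the sum. */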
/*
 * 2^x function, 256-bit wide
 *
 * The 2^w term is calculated from a (6,0)-th order (no denominator) minimax polynomial on the interval
 * [-0.5,0.5]. The coefficients were derived in Mathematica using the command:
 *
 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 *
 * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 * number instead. That would take a few extra cycles and not really help, since something is
 * wrong if you are using single precision to work with numbers that cannot really be represented
 * in single precision.
 *
 * The accuracy is at least 23 bits.
 */
static gmx_inline __m256
gmx_mm256_exp2_ps(__m256 x)
{
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m256  arglimit = _mm256_set1_ps(126.0f);

    const __m128i expbase  = _mm_set1_epi32(127);
    const __m256  CC6      = _mm256_set1_ps(1.535336188319500E-004);
    const __m256  CC5      = _mm256_set1_ps(1.339887440266574E-003);
    const __m256  CC4      = _mm256_set1_ps(9.618437357674640E-003);
    const __m256  CC3      = _mm256_set1_ps(5.550332471162809E-002);
    const __m256  CC2      = _mm256_set1_ps(2.402264791363012E-001);
    const __m256  CC1      = _mm256_set1_ps(6.931472028550421E-001);
    const __m256  CC0      = _mm256_set1_ps(1.0f);

    __m256i       iexppart;
    __m128i       iexppart128a, iexppart128b;
    __m256        fexppart;
    __m256        intpart;
    __m256        x2;
    __m256        valuemask;
    __m256        p0, p1;

    iexppart     = _mm256_cvtps_epi32(x);
    intpart      = _mm256_round_ps(x, _MM_FROUND_TO_NEAREST_INT);

    iexppart128b = _mm256_extractf128_si256(iexppart, 0x1);
    iexppart128a = _mm256_castsi256_si128(iexppart);

    iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a, expbase), 23);
    iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b, expbase), 23);

    iexppart  = _mm256_castsi128_si256(iexppart128a);
    iexppart  = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);
    valuemask = _mm256_cmp_ps(arglimit, gmx_mm256_abs_ps(x), _CMP_GE_OQ);
    fexppart  = _mm256_and_ps(valuemask, _mm256_castsi256_ps(iexppart));

    x  = _mm256_sub_ps(x, intpart);
    x2 = _mm256_mul_ps(x, x);

    p0 = _mm256_mul_ps(CC6, x2);
    p1 = _mm256_mul_ps(CC5, x2);
    p0 = _mm256_add_ps(p0, CC4);
    p1 = _mm256_add_ps(p1, CC3);
    p0 = _mm256_mul_ps(p0, x2);
    p1 = _mm256_mul_ps(p1, x2);
    p0 = _mm256_add_ps(p0, CC2);
    p1 = _mm256_add_ps(p1, CC1);
    p0 = _mm256_mul_ps(p0, x2);
    p1 = _mm256_mul_ps(p1, x);
    p0 = _mm256_add_ps(p0, CC0);
    p0 = _mm256_add_ps(p0, p1);
    x  = _mm256_mul_ps(p0, fexppart);

    return x;
}
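/* Comment added for clarity (not in the original header): the argument is
 * split as x = n + w with integer n = round(x) and w in [-0.5,0.5]. The
 * term 2^n is built directly in the IEEE representation by adding the bias
 * (127) to n and shifting it into the exponent field (<< 23); the minimax
 * polynomial above supplies 2^w, and the final product 2^n * 2^w is exact
 * apart from the polynomial error. */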
/* 2^x, 128 bit wide */
static gmx_inline __m128
gmx_mm_exp2_ps(__m128 x)
{
    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
    const __m128  arglimit = _mm_set1_ps(126.0f);

    const __m128i expbase  = _mm_set1_epi32(127);
    const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
    const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
    const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
    const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
    const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
    const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
    const __m128  CA0      = _mm_set1_ps(1.0f);

    __m128i       iexppart;
    __m128        fexppart;
    __m128        intpart;
    __m128        x2;
    __m128        valuemask;
    __m128        p0, p1;

    iexppart  = _mm_cvtps_epi32(x);
    intpart   = _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
    valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(x));
    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));

    x  = _mm_sub_ps(x, intpart);
    x2 = _mm_mul_ps(x, x);

    p0 = _mm_mul_ps(CA6, x2);
    p1 = _mm_mul_ps(CA5, x2);
    p0 = _mm_add_ps(p0, CA4);
    p1 = _mm_add_ps(p1, CA3);
    p0 = _mm_mul_ps(p0, x2);
    p1 = _mm_mul_ps(p1, x2);
    p0 = _mm_add_ps(p0, CA2);
    p1 = _mm_add_ps(p1, CA1);
    p0 = _mm_mul_ps(p0, x2);
    p1 = _mm_mul_ps(p1, x);
    p0 = _mm_add_ps(p0, CA0);
    p0 = _mm_add_ps(p0, p1);
    x  = _mm_mul_ps(p0, fexppart);

    return x;
}
/* Exponential function, 256 bit wide. This could be calculated from 2^x as Exp(x)=2^(y),
 * where y=log2(e)*x, but there will then be a small rounding error since we lose some
 * precision due to the multiplication. This will then be magnified a lot by the exponential.
 *
 * Instead, we calculate the fractional part directly as a minimax approximation of
 * Exp(z) on [-0.5,0.5]. We use extended-precision arithmetic to calculate the fraction
 * remaining after 2^y, which avoids the precision loss.
 * The final result is correct to within 1 LSB over the entire argument range.
 */
static gmx_inline __m256
gmx_mm256_exp_ps(__m256 exparg)
{
    const __m256  argscale     = _mm256_set1_ps(1.44269504088896341f);
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m256  arglimit     = _mm256_set1_ps(126.0f);
    const __m128i expbase      = _mm_set1_epi32(127);

    const __m256  invargscale0 = _mm256_set1_ps(0.693359375f);
    const __m256  invargscale1 = _mm256_set1_ps(-2.12194440e-4f);

    const __m256  CE5          = _mm256_set1_ps(1.9875691500e-4f);
    const __m256  CE4          = _mm256_set1_ps(1.3981999507e-3f);
    const __m256  CE3          = _mm256_set1_ps(8.3334519073e-3f);
    const __m256  CE2          = _mm256_set1_ps(4.1665795894e-2f);
    const __m256  CE1          = _mm256_set1_ps(1.6666665459e-1f);
    const __m256  CE0          = _mm256_set1_ps(5.0000001201e-1f);
    const __m256  one          = _mm256_set1_ps(1.0f);

    __m256        exparg2, exp2arg;
    __m256i       iexppart;
    __m128i       iexppart128a, iexppart128b;
    __m256        fexppart;
    __m256        intpart;
    __m256        valuemask;
    __m256        pE0, pE1;

    exp2arg = _mm256_mul_ps(exparg, argscale);

    iexppart = _mm256_cvtps_epi32(exp2arg);
    intpart  = _mm256_round_ps(exp2arg, _MM_FROUND_TO_NEAREST_INT);

    iexppart128b = _mm256_extractf128_si256(iexppart, 0x1);
    iexppart128a = _mm256_castsi256_si128(iexppart);

    iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a, expbase), 23);
    iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b, expbase), 23);

    iexppart  = _mm256_castsi128_si256(iexppart128a);
    iexppart  = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);
    valuemask = _mm256_cmp_ps(arglimit, gmx_mm256_abs_ps(exp2arg), _CMP_GE_OQ);
    fexppart  = _mm256_and_ps(valuemask, _mm256_castsi256_ps(iexppart));

    /* Extended precision arithmetic */
    exparg = _mm256_sub_ps(exparg, _mm256_mul_ps(invargscale0, intpart));
    exparg = _mm256_sub_ps(exparg, _mm256_mul_ps(invargscale1, intpart));

    exparg2 = _mm256_mul_ps(exparg, exparg);

    pE1 = _mm256_mul_ps(CE5, exparg2);
    pE0 = _mm256_mul_ps(CE4, exparg2);
    pE1 = _mm256_add_ps(pE1, CE3);
    pE0 = _mm256_add_ps(pE0, CE2);
    pE1 = _mm256_mul_ps(pE1, exparg2);
    pE0 = _mm256_mul_ps(pE0, exparg2);
    pE1 = _mm256_add_ps(pE1, CE1);
    pE0 = _mm256_add_ps(pE0, CE0);
    pE1 = _mm256_mul_ps(pE1, exparg);
    pE0 = _mm256_add_ps(pE0, pE1);
    pE0 = _mm256_mul_ps(pE0, exparg2);
    exparg = _mm256_add_ps(exparg, one);
    exparg = _mm256_add_ps(exparg, pE0);

    exparg = _mm256_mul_ps(exparg, fexppart);

    return exparg;
}
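/* Comment added for clarity (not in the original header): after computing
 * n = round(x*log2(e)), the reduced argument is formed as
 * x - n*invargscale0 - n*invargscale1, where invargscale0 + invargscale1 is
 * ln(2) split into a high part and a small correction. Subtracting the two
 * parts separately keeps the reduced argument accurate to full single
 * precision, which is what makes the final result good to ~1 LSB. */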
/* exp(), 128 bit wide. */
static gmx_inline __m128
gmx_mm_exp_ps(__m128 x)
{
    const __m128  argscale     = _mm_set1_ps(1.44269504088896341f);
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m128  arglimit     = _mm_set1_ps(126.0f);
    const __m128i expbase      = _mm_set1_epi32(127);

    const __m128  invargscale0 = _mm_set1_ps(0.693359375f);
    const __m128  invargscale1 = _mm_set1_ps(-2.12194440e-4f);

    const __m128  CC5          = _mm_set1_ps(1.9875691500e-4f);
    const __m128  CC4          = _mm_set1_ps(1.3981999507e-3f);
    const __m128  CC3          = _mm_set1_ps(8.3334519073e-3f);
    const __m128  CC2          = _mm_set1_ps(4.1665795894e-2f);
    const __m128  CC1          = _mm_set1_ps(1.6666665459e-1f);
    const __m128  CC0          = _mm_set1_ps(5.0000001201e-1f);
    const __m128  one          = _mm_set1_ps(1.0f);

    __m128        y, x2;
    __m128i       iexppart;
    __m128        fexppart;
    __m128        intpart;
    __m128        valuemask;
    __m128        p0, p1;

    y = _mm_mul_ps(x, argscale);

    iexppart  = _mm_cvtps_epi32(y);
    intpart   = _mm_round_ps(y, _MM_FROUND_TO_NEAREST_INT);

    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
    valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(y));
    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));

    /* Extended precision arithmetic */
    x = _mm_sub_ps(x, _mm_mul_ps(invargscale0, intpart));
    x = _mm_sub_ps(x, _mm_mul_ps(invargscale1, intpart));

    x2 = _mm_mul_ps(x, x);

    p1 = _mm_mul_ps(CC5, x2);
    p0 = _mm_mul_ps(CC4, x2);
    p1 = _mm_add_ps(p1, CC3);
    p0 = _mm_add_ps(p0, CC2);
    p1 = _mm_mul_ps(p1, x2);
    p0 = _mm_mul_ps(p0, x2);
    p1 = _mm_add_ps(p1, CC1);
    p0 = _mm_add_ps(p0, CC0);
    p1 = _mm_mul_ps(p1, x);
    p0 = _mm_add_ps(p0, p1);
    p0 = _mm_mul_ps(p0, x2);
    x  = _mm_add_ps(x, one);
    x  = _mm_add_ps(x, p0);

    x = _mm_mul_ps(x, fexppart);

    return x;
}
/* FULL precision erf(), 256-bit wide. Only errors in LSB */
static gmx_inline __m256
gmx_mm256_erf_ps(__m256 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m256 CA6 = _mm256_set1_ps(7.853861353153693e-5f);
    const __m256 CA5 = _mm256_set1_ps(-8.010193625184903e-4f);
    const __m256 CA4 = _mm256_set1_ps(5.188327685732524e-3f);
    const __m256 CA3 = _mm256_set1_ps(-2.685381193529856e-2f);
    const __m256 CA2 = _mm256_set1_ps(1.128358514861418e-1f);
    const __m256 CA1 = _mm256_set1_ps(-3.761262582423300e-1f);
    const __m256 CA0 = _mm256_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m256 CB9 = _mm256_set1_ps(-0.0018629930017603923f);
    const __m256 CB8 = _mm256_set1_ps(0.003909821287598495f);
    const __m256 CB7 = _mm256_set1_ps(-0.0052094582210355615f);
    const __m256 CB6 = _mm256_set1_ps(0.005685614362160572f);
    const __m256 CB5 = _mm256_set1_ps(-0.0025367682853477272f);
    const __m256 CB4 = _mm256_set1_ps(-0.010199799682318782f);
    const __m256 CB3 = _mm256_set1_ps(0.04369575504816542f);
    const __m256 CB2 = _mm256_set1_ps(-0.11884063474674492f);
    const __m256 CB1 = _mm256_set1_ps(0.2732120154030589f);
    const __m256 CB0 = _mm256_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m256 CC10 = _mm256_set1_ps(-0.0445555913112064f);
    const __m256 CC9  = _mm256_set1_ps(0.21376355144663348f);
    const __m256 CC8  = _mm256_set1_ps(-0.3473187200259257f);
    const __m256 CC7  = _mm256_set1_ps(0.016690861551248114f);
    const __m256 CC6  = _mm256_set1_ps(0.7560973182491192f);
    const __m256 CC5  = _mm256_set1_ps(-1.2137903600145787f);
    const __m256 CC4  = _mm256_set1_ps(0.8411872321232948f);
    const __m256 CC3  = _mm256_set1_ps(-0.08670413896296343f);
    const __m256 CC2  = _mm256_set1_ps(-0.27124782687240334f);
    const __m256 CC1  = _mm256_set1_ps(-0.0007502488047806069f);
    const __m256 CC0  = _mm256_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m256 CD2 = _mm256_set1_ps(0.5000066608081202f);
    const __m256 CD3 = _mm256_set1_ps(0.1664795422874624f);
    const __m256 CD4 = _mm256_set1_ps(0.04379839977652482f);

    const __m256 sieve   = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
    const __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
    const __m256 one     = _mm256_set1_ps(1.0f);
    const __m256 two     = _mm256_set1_ps(2.0f);

    __m256 x2, x4, y;
    __m256 z, q, t, t2, w, w2;
    __m256 pA0, pA1, pB0, pB1, pC0, pC1;
    __m256 expmx2, corr;
    __m256 res_erf, res_erfc, res;
    __m256 mask;

    /* Calculate erf() */
    x2 = _mm256_mul_ps(x, x);
    x4 = _mm256_mul_ps(x2, x2);

    pA0 = _mm256_mul_ps(CA6, x4);
    pA1 = _mm256_mul_ps(CA5, x4);
    pA0 = _mm256_add_ps(pA0, CA4);
    pA1 = _mm256_add_ps(pA1, CA3);
    pA0 = _mm256_mul_ps(pA0, x4);
    pA1 = _mm256_mul_ps(pA1, x4);
    pA0 = _mm256_add_ps(pA0, CA2);
    pA1 = _mm256_add_ps(pA1, CA1);
    pA0 = _mm256_mul_ps(pA0, x4);
    pA1 = _mm256_mul_ps(pA1, x2);
    pA0 = _mm256_add_ps(pA0, pA1);
    pA0 = _mm256_add_ps(pA0, CA0);

    res_erf = _mm256_mul_ps(x, pA0);

    /* Calculate erfc */
    y  = gmx_mm256_abs_ps(x);
    t  = gmx_mm256_inv_ps(y);
    w  = _mm256_sub_ps(t, one);
    t2 = _mm256_mul_ps(t, t);
    w2 = _mm256_mul_ps(w, w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z = _mm256_and_ps(y, sieve);
    q = _mm256_mul_ps( _mm256_sub_ps(z, y), _mm256_add_ps(z, y) );

    corr = _mm256_mul_ps(CD4, q);
    corr = _mm256_add_ps(corr, CD3);
    corr = _mm256_mul_ps(corr, q);
    corr = _mm256_add_ps(corr, CD2);
    corr = _mm256_mul_ps(corr, q);
    corr = _mm256_add_ps(corr, one);
    corr = _mm256_mul_ps(corr, q);
    corr = _mm256_add_ps(corr, one);

    expmx2 = gmx_mm256_exp_ps( _mm256_or_ps( signbit, _mm256_mul_ps(z, z) ) );
    expmx2 = _mm256_mul_ps(expmx2, corr);

    pB1 = _mm256_mul_ps(CB9, w2);
    pB0 = _mm256_mul_ps(CB8, w2);
    pB1 = _mm256_add_ps(pB1, CB7);
    pB0 = _mm256_add_ps(pB0, CB6);
    pB1 = _mm256_mul_ps(pB1, w2);
    pB0 = _mm256_mul_ps(pB0, w2);
    pB1 = _mm256_add_ps(pB1, CB5);
    pB0 = _mm256_add_ps(pB0, CB4);
    pB1 = _mm256_mul_ps(pB1, w2);
    pB0 = _mm256_mul_ps(pB0, w2);
    pB1 = _mm256_add_ps(pB1, CB3);
    pB0 = _mm256_add_ps(pB0, CB2);
    pB1 = _mm256_mul_ps(pB1, w2);
    pB0 = _mm256_mul_ps(pB0, w2);
    pB1 = _mm256_add_ps(pB1, CB1);
    pB1 = _mm256_mul_ps(pB1, w);
    pB0 = _mm256_add_ps(pB0, pB1);
    pB0 = _mm256_add_ps(pB0, CB0);

    pC0 = _mm256_mul_ps(CC10, t2);
    pC1 = _mm256_mul_ps(CC9, t2);
    pC0 = _mm256_add_ps(pC0, CC8);
    pC1 = _mm256_add_ps(pC1, CC7);
    pC0 = _mm256_mul_ps(pC0, t2);
    pC1 = _mm256_mul_ps(pC1, t2);
    pC0 = _mm256_add_ps(pC0, CC6);
    pC1 = _mm256_add_ps(pC1, CC5);
    pC0 = _mm256_mul_ps(pC0, t2);
    pC1 = _mm256_mul_ps(pC1, t2);
    pC0 = _mm256_add_ps(pC0, CC4);
    pC1 = _mm256_add_ps(pC1, CC3);
    pC0 = _mm256_mul_ps(pC0, t2);
    pC1 = _mm256_mul_ps(pC1, t2);
    pC0 = _mm256_add_ps(pC0, CC2);
    pC1 = _mm256_add_ps(pC1, CC1);
    pC0 = _mm256_mul_ps(pC0, t2);
    pC1 = _mm256_mul_ps(pC1, t);
    pC0 = _mm256_add_ps(pC0, pC1);
    pC0 = _mm256_add_ps(pC0, CC0);
    pC0 = _mm256_mul_ps(pC0, t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm256_cmp_ps(two, y, _CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(pB0, pC0, mask);
    res_erfc = _mm256_mul_ps(res_erfc, expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(res_erfc, _mm256_sub_ps(two, res_erfc), mask);

    /* Select erf() or erfc() */
    mask = _mm256_cmp_ps(y, _mm256_set1_ps(0.75f), _CMP_LT_OQ);
    res  = _mm256_blendv_ps(_mm256_sub_ps(one, res_erfc), res_erf, mask);

    return res;
}
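/* Comment added for clarity (not in the original header): in the erfc branch
 * above, z keeps only the top mantissa bits of y (the 0xfffff000 sieve), and
 * q = (z-y)*(z+y) = z^2 - y^2, so exp(-y^2) = exp(-z^2)*exp(q). exp(-z^2) is
 * evaluated exactly (z*z is representable) with the full exp() routine, while
 * exp(q) uses the short CD polynomial, which is sufficient since |q| stays
 * below ~0.1 over the whole argument range. */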
/* erf(), 128 bit wide */
static gmx_inline __m128
gmx_mm_erf_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
    const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
    const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
    const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
    const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
    const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
    const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
    const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
    const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
    const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
    const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
    const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
    const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
    const __m128 CC9  = _mm_set1_ps(0.21376355144663348f);
    const __m128 CC8  = _mm_set1_ps(-0.3473187200259257f);
    const __m128 CC7  = _mm_set1_ps(0.016690861551248114f);
    const __m128 CC6  = _mm_set1_ps(0.7560973182491192f);
    const __m128 CC5  = _mm_set1_ps(-1.2137903600145787f);
    const __m128 CC4  = _mm_set1_ps(0.8411872321232948f);
    const __m128 CC3  = _mm_set1_ps(-0.08670413896296343f);
    const __m128 CC2  = _mm_set1_ps(-0.27124782687240334f);
    const __m128 CC1  = _mm_set1_ps(-0.0007502488047806069f);
    const __m128 CC0  = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
    const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
    const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);

    const __m128 sieve   = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128 one     = _mm_set1_ps(1.0f);
    const __m128 two     = _mm_set1_ps(2.0f);

    __m128 x2, x4, y;
    __m128 z, q, t, t2, w, w2;
    __m128 pA0, pA1, pB0, pB1, pC0, pC1;
    __m128 expmx2, corr;
    __m128 res_erf, res_erfc, res;
    __m128 mask;

    /* Calculate erf() */
    x2 = _mm_mul_ps(x, x);
    x4 = _mm_mul_ps(x2, x2);

    pA0 = _mm_mul_ps(CA6, x4);
    pA1 = _mm_mul_ps(CA5, x4);
    pA0 = _mm_add_ps(pA0, CA4);
    pA1 = _mm_add_ps(pA1, CA3);
    pA0 = _mm_mul_ps(pA0, x4);
    pA1 = _mm_mul_ps(pA1, x4);
    pA0 = _mm_add_ps(pA0, CA2);
    pA1 = _mm_add_ps(pA1, CA1);
    pA0 = _mm_mul_ps(pA0, x4);
    pA1 = _mm_mul_ps(pA1, x2);
    pA0 = _mm_add_ps(pA0, pA1);
    pA0 = _mm_add_ps(pA0, CA0);

    res_erf = _mm_mul_ps(x, pA0);

    /* Calculate erfc */
    y  = gmx_mm_abs_ps(x);
    t  = gmx_mm_inv_ps(y);
    w  = _mm_sub_ps(t, one);
    t2 = _mm_mul_ps(t, t);
    w2 = _mm_mul_ps(w, w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z = _mm_and_ps(y, sieve);
    q = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );

    corr = _mm_mul_ps(CD4, q);
    corr = _mm_add_ps(corr, CD3);
    corr = _mm_mul_ps(corr, q);
    corr = _mm_add_ps(corr, CD2);
    corr = _mm_mul_ps(corr, q);
    corr = _mm_add_ps(corr, one);
    corr = _mm_mul_ps(corr, q);
    corr = _mm_add_ps(corr, one);

    expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
    expmx2 = _mm_mul_ps(expmx2, corr);

    pB1 = _mm_mul_ps(CB9, w2);
    pB0 = _mm_mul_ps(CB8, w2);
    pB1 = _mm_add_ps(pB1, CB7);
    pB0 = _mm_add_ps(pB0, CB6);
    pB1 = _mm_mul_ps(pB1, w2);
    pB0 = _mm_mul_ps(pB0, w2);
    pB1 = _mm_add_ps(pB1, CB5);
    pB0 = _mm_add_ps(pB0, CB4);
    pB1 = _mm_mul_ps(pB1, w2);
    pB0 = _mm_mul_ps(pB0, w2);
    pB1 = _mm_add_ps(pB1, CB3);
    pB0 = _mm_add_ps(pB0, CB2);
    pB1 = _mm_mul_ps(pB1, w2);
    pB0 = _mm_mul_ps(pB0, w2);
    pB1 = _mm_add_ps(pB1, CB1);
    pB1 = _mm_mul_ps(pB1, w);
    pB0 = _mm_add_ps(pB0, pB1);
    pB0 = _mm_add_ps(pB0, CB0);

    pC0 = _mm_mul_ps(CC10, t2);
    pC1 = _mm_mul_ps(CC9, t2);
    pC0 = _mm_add_ps(pC0, CC8);
    pC1 = _mm_add_ps(pC1, CC7);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t2);
    pC0 = _mm_add_ps(pC0, CC6);
    pC1 = _mm_add_ps(pC1, CC5);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t2);
    pC0 = _mm_add_ps(pC0, CC4);
    pC1 = _mm_add_ps(pC1, CC3);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t2);
    pC0 = _mm_add_ps(pC0, CC2);
    pC1 = _mm_add_ps(pC1, CC1);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t);
    pC0 = _mm_add_ps(pC0, pC1);
    pC0 = _mm_add_ps(pC0, CC0);
    pC0 = _mm_mul_ps(pC0, t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmplt_ps(two, y);
    res_erfc = _mm_blendv_ps(pB0, pC0, mask);
    res_erfc = _mm_mul_ps(res_erfc, expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
    res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
    res  = _mm_blendv_ps(_mm_sub_ps(one, res_erfc), res_erf, mask);

    return res;
}
/* FULL precision erfc(), 256 bit wide. Only errors in LSB */
static gmx_inline __m256
gmx_mm256_erfc_ps(__m256 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m256 CA6 = _mm256_set1_ps(7.853861353153693e-5f);
    const __m256 CA5 = _mm256_set1_ps(-8.010193625184903e-4f);
    const __m256 CA4 = _mm256_set1_ps(5.188327685732524e-3f);
    const __m256 CA3 = _mm256_set1_ps(-2.685381193529856e-2f);
    const __m256 CA2 = _mm256_set1_ps(1.128358514861418e-1f);
    const __m256 CA1 = _mm256_set1_ps(-3.761262582423300e-1f);
    const __m256 CA0 = _mm256_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m256 CB9 = _mm256_set1_ps(-0.0018629930017603923f);
    const __m256 CB8 = _mm256_set1_ps(0.003909821287598495f);
    const __m256 CB7 = _mm256_set1_ps(-0.0052094582210355615f);
    const __m256 CB6 = _mm256_set1_ps(0.005685614362160572f);
    const __m256 CB5 = _mm256_set1_ps(-0.0025367682853477272f);
    const __m256 CB4 = _mm256_set1_ps(-0.010199799682318782f);
    const __m256 CB3 = _mm256_set1_ps(0.04369575504816542f);
    const __m256 CB2 = _mm256_set1_ps(-0.11884063474674492f);
    const __m256 CB1 = _mm256_set1_ps(0.2732120154030589f);
    const __m256 CB0 = _mm256_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m256 CC10 = _mm256_set1_ps(-0.0445555913112064f);
    const __m256 CC9  = _mm256_set1_ps(0.21376355144663348f);
    const __m256 CC8  = _mm256_set1_ps(-0.3473187200259257f);
    const __m256 CC7  = _mm256_set1_ps(0.016690861551248114f);
    const __m256 CC6  = _mm256_set1_ps(0.7560973182491192f);
    const __m256 CC5  = _mm256_set1_ps(-1.2137903600145787f);
    const __m256 CC4  = _mm256_set1_ps(0.8411872321232948f);
    const __m256 CC3  = _mm256_set1_ps(-0.08670413896296343f);
    const __m256 CC2  = _mm256_set1_ps(-0.27124782687240334f);
    const __m256 CC1  = _mm256_set1_ps(-0.0007502488047806069f);
    const __m256 CC0  = _mm256_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m256 CD2 = _mm256_set1_ps(0.5000066608081202f);
    const __m256 CD3 = _mm256_set1_ps(0.1664795422874624f);
    const __m256 CD4 = _mm256_set1_ps(0.04379839977652482f);

    const __m256 sieve   = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
    const __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
    const __m256 one     = _mm256_set1_ps(1.0f);
    const __m256 two     = _mm256_set1_ps(2.0f);

    __m256 x2, x4, y;
    __m256 z, q, t, t2, w, w2;
    __m256 pA0, pA1, pB0, pB1, pC0, pC1;
    __m256 expmx2, corr;
    __m256 res_erf, res_erfc, res;
    __m256 mask;

    /* Calculate erf() */
    x2 = _mm256_mul_ps(x, x);
    x4 = _mm256_mul_ps(x2, x2);

    pA0 = _mm256_mul_ps(CA6, x4);
    pA1 = _mm256_mul_ps(CA5, x4);
    pA0 = _mm256_add_ps(pA0, CA4);
    pA1 = _mm256_add_ps(pA1, CA3);
    pA0 = _mm256_mul_ps(pA0, x4);
    pA1 = _mm256_mul_ps(pA1, x4);
    pA0 = _mm256_add_ps(pA0, CA2);
    pA1 = _mm256_add_ps(pA1, CA1);
    pA0 = _mm256_mul_ps(pA0, x4);
    pA1 = _mm256_mul_ps(pA1, x2);
    pA0 = _mm256_add_ps(pA0, pA1);
    pA0 = _mm256_add_ps(pA0, CA0);

    res_erf = _mm256_mul_ps(x, pA0);

    /* Calculate erfc */
    y  = gmx_mm256_abs_ps(x);
    t  = gmx_mm256_inv_ps(y);
    w  = _mm256_sub_ps(t, one);
    t2 = _mm256_mul_ps(t, t);
    w2 = _mm256_mul_ps(w, w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z = _mm256_and_ps(y, sieve);
    q = _mm256_mul_ps( _mm256_sub_ps(z, y), _mm256_add_ps(z, y) );

    corr = _mm256_mul_ps(CD4, q);
    corr = _mm256_add_ps(corr, CD3);
    corr = _mm256_mul_ps(corr, q);
    corr = _mm256_add_ps(corr, CD2);
    corr = _mm256_mul_ps(corr, q);
    corr = _mm256_add_ps(corr, one);
    corr = _mm256_mul_ps(corr, q);
    corr = _mm256_add_ps(corr, one);

    expmx2 = gmx_mm256_exp_ps( _mm256_or_ps( signbit, _mm256_mul_ps(z, z) ) );
    expmx2 = _mm256_mul_ps(expmx2, corr);

    pB1 = _mm256_mul_ps(CB9, w2);
    pB0 = _mm256_mul_ps(CB8, w2);
    pB1 = _mm256_add_ps(pB1, CB7);
    pB0 = _mm256_add_ps(pB0, CB6);
    pB1 = _mm256_mul_ps(pB1, w2);
    pB0 = _mm256_mul_ps(pB0, w2);
    pB1 = _mm256_add_ps(pB1, CB5);
    pB0 = _mm256_add_ps(pB0, CB4);
    pB1 = _mm256_mul_ps(pB1, w2);
    pB0 = _mm256_mul_ps(pB0, w2);
    pB1 = _mm256_add_ps(pB1, CB3);
    pB0 = _mm256_add_ps(pB0, CB2);
    pB1 = _mm256_mul_ps(pB1, w2);
    pB0 = _mm256_mul_ps(pB0, w2);
    pB1 = _mm256_add_ps(pB1, CB1);
    pB1 = _mm256_mul_ps(pB1, w);
    pB0 = _mm256_add_ps(pB0, pB1);
    pB0 = _mm256_add_ps(pB0, CB0);

    pC0 = _mm256_mul_ps(CC10, t2);
    pC1 = _mm256_mul_ps(CC9, t2);
    pC0 = _mm256_add_ps(pC0, CC8);
    pC1 = _mm256_add_ps(pC1, CC7);
    pC0 = _mm256_mul_ps(pC0, t2);
    pC1 = _mm256_mul_ps(pC1, t2);
    pC0 = _mm256_add_ps(pC0, CC6);
    pC1 = _mm256_add_ps(pC1, CC5);
    pC0 = _mm256_mul_ps(pC0, t2);
    pC1 = _mm256_mul_ps(pC1, t2);
    pC0 = _mm256_add_ps(pC0, CC4);
    pC1 = _mm256_add_ps(pC1, CC3);
    pC0 = _mm256_mul_ps(pC0, t2);
    pC1 = _mm256_mul_ps(pC1, t2);
    pC0 = _mm256_add_ps(pC0, CC2);
    pC1 = _mm256_add_ps(pC1, CC1);
    pC0 = _mm256_mul_ps(pC0, t2);
    pC1 = _mm256_mul_ps(pC1, t);
    pC0 = _mm256_add_ps(pC0, pC1);
    pC0 = _mm256_add_ps(pC0, CC0);
    pC0 = _mm256_mul_ps(pC0, t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm256_cmp_ps(two, y, _CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(pB0, pC0, mask);
    res_erfc = _mm256_mul_ps(res_erfc, expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(res_erfc, _mm256_sub_ps(two, res_erfc), mask);

    /* Select erf() or erfc() */
    mask = _mm256_cmp_ps(y, _mm256_set1_ps(0.75f), _CMP_LT_OQ);
    res  = _mm256_blendv_ps(res_erfc, _mm256_sub_ps(one, res_erf), mask);

    return res;
}
/* erfc(), 128 bit wide */
static gmx_inline __m128
gmx_mm_erfc_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
    const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
    const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
    const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
    const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
    const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
    const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
    const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
    const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
    const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
    const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
    const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
    const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
    const __m128 CC9  = _mm_set1_ps(0.21376355144663348f);
    const __m128 CC8  = _mm_set1_ps(-0.3473187200259257f);
    const __m128 CC7  = _mm_set1_ps(0.016690861551248114f);
    const __m128 CC6  = _mm_set1_ps(0.7560973182491192f);
    const __m128 CC5  = _mm_set1_ps(-1.2137903600145787f);
    const __m128 CC4  = _mm_set1_ps(0.8411872321232948f);
    const __m128 CC3  = _mm_set1_ps(-0.08670413896296343f);
    const __m128 CC2  = _mm_set1_ps(-0.27124782687240334f);
    const __m128 CC1  = _mm_set1_ps(-0.0007502488047806069f);
    const __m128 CC0  = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
    const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
    const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);

    const __m128 sieve   = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128 one     = _mm_set1_ps(1.0f);
    const __m128 two     = _mm_set1_ps(2.0f);

    __m128 x2, x4, y;
    __m128 z, q, t, t2, w, w2;
    __m128 pA0, pA1, pB0, pB1, pC0, pC1;
    __m128 expmx2, corr;
    __m128 res_erf, res_erfc, res;
    __m128 mask;

    /* Calculate erf() */
    x2 = _mm_mul_ps(x, x);
    x4 = _mm_mul_ps(x2, x2);

    pA0 = _mm_mul_ps(CA6, x4);
    pA1 = _mm_mul_ps(CA5, x4);
    pA0 = _mm_add_ps(pA0, CA4);
    pA1 = _mm_add_ps(pA1, CA3);
    pA0 = _mm_mul_ps(pA0, x4);
    pA1 = _mm_mul_ps(pA1, x4);
    pA0 = _mm_add_ps(pA0, CA2);
    pA1 = _mm_add_ps(pA1, CA1);
    pA0 = _mm_mul_ps(pA0, x4);
    pA1 = _mm_mul_ps(pA1, x2);
    pA0 = _mm_add_ps(pA0, pA1);
    pA0 = _mm_add_ps(pA0, CA0);

    res_erf = _mm_mul_ps(x, pA0);

    /* Calculate erfc */
    y  = gmx_mm_abs_ps(x);
    t  = gmx_mm_inv_ps(y);
    w  = _mm_sub_ps(t, one);
    t2 = _mm_mul_ps(t, t);
    w2 = _mm_mul_ps(w, w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z = _mm_and_ps(y, sieve);
    q = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );

    corr = _mm_mul_ps(CD4, q);
    corr = _mm_add_ps(corr, CD3);
    corr = _mm_mul_ps(corr, q);
    corr = _mm_add_ps(corr, CD2);
    corr = _mm_mul_ps(corr, q);
    corr = _mm_add_ps(corr, one);
    corr = _mm_mul_ps(corr, q);
    corr = _mm_add_ps(corr, one);

    expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
    expmx2 = _mm_mul_ps(expmx2, corr);

    pB1 = _mm_mul_ps(CB9, w2);
    pB0 = _mm_mul_ps(CB8, w2);
    pB1 = _mm_add_ps(pB1, CB7);
    pB0 = _mm_add_ps(pB0, CB6);
    pB1 = _mm_mul_ps(pB1, w2);
    pB0 = _mm_mul_ps(pB0, w2);
    pB1 = _mm_add_ps(pB1, CB5);
    pB0 = _mm_add_ps(pB0, CB4);
    pB1 = _mm_mul_ps(pB1, w2);
    pB0 = _mm_mul_ps(pB0, w2);
    pB1 = _mm_add_ps(pB1, CB3);
    pB0 = _mm_add_ps(pB0, CB2);
    pB1 = _mm_mul_ps(pB1, w2);
    pB0 = _mm_mul_ps(pB0, w2);
    pB1 = _mm_add_ps(pB1, CB1);
    pB1 = _mm_mul_ps(pB1, w);
    pB0 = _mm_add_ps(pB0, pB1);
    pB0 = _mm_add_ps(pB0, CB0);

    pC0 = _mm_mul_ps(CC10, t2);
    pC1 = _mm_mul_ps(CC9, t2);
    pC0 = _mm_add_ps(pC0, CC8);
    pC1 = _mm_add_ps(pC1, CC7);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t2);
    pC0 = _mm_add_ps(pC0, CC6);
    pC1 = _mm_add_ps(pC1, CC5);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t2);
    pC0 = _mm_add_ps(pC0, CC4);
    pC1 = _mm_add_ps(pC1, CC3);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t2);
    pC0 = _mm_add_ps(pC0, CC2);
    pC1 = _mm_add_ps(pC1, CC1);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t);
    pC0 = _mm_add_ps(pC0, pC1);
    pC0 = _mm_add_ps(pC0, CC0);
    pC0 = _mm_mul_ps(pC0, t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmplt_ps(two, y);
    res_erfc = _mm_blendv_ps(pB0, pC0, mask);
    res_erfc = _mm_mul_ps(res_erfc, expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
    res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
    res  = _mm_blendv_ps(res_erfc, _mm_sub_ps(one, res_erf), mask);

    return res;
}
/* Calculate the force correction due to PME analytically.
 *
 * This routine is meant to enable analytical evaluation of the
 * direct-space PME electrostatic force to avoid tables.
 *
 * The direct-space potential should be Erfc(beta*r)/r, but there
 * are some problems evaluating that:
 *
 * First, the error function is difficult (read: expensive) to
 * approximate accurately for intermediate to large arguments, and
 * this happens already in ranges of beta*r that occur in simulations.
 * Second, we now try to avoid calculating potentials in Gromacs but
 * use forces directly.
 *
 * We can simplify things slightly by noting that the PME part is really
 * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
 *
 * V = 1/r - Erf(beta*r)/r
 *
 * The first term we already have from the inverse square root, so
 * that we can leave out of this routine.
 *
 * For PME tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
 * the argument beta*r will be in the range 0.15 to ~4. Use your
 * favorite plotting program to realize how well-behaved Erf(z)/z is
 * in this range!
 *
 * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
 * However, it turns out it is more efficient to approximate f(z)/z and
 * then only use even powers. This is another minor optimization, since
 * we actually WANT f(z)/z, because it is going to be multiplied by
 * the vector between the two atoms to get the vectorial force. The
 * fastest flops are the ones we can avoid calculating!
 *
 * So, here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *       2*exp(-z^2)     erf(z)
 *       ------------ - --------
 *       sqrt(Pi)*z^2      z^3
 *
 * 5. Multiply the entire expression by beta^3. This will get you
 *
 *       beta^3*2*exp(-z^2)     beta^3*erf(z)
 *       ------------------  -  -------------
 *          sqrt(Pi)*z^2             z^3
 *
 *    or, switching back to r (z=r*beta):
 *
 *       2*beta*exp(-r^2*beta^2)     erf(r*beta)
 *       ----------------------- - -------------
 *            sqrt(Pi)*r^2               r^3
 *
 *    With a bit of math exercise you should be able to confirm that
 *    this is exactly D[Erf[beta*r]/r,r] divided by r once more.
 *
 * 6. Add the result to 1/r^3, multiply by the product of the charges,
 *    and you have your force (divided by r). A final multiplication
 *    with the vector connecting the two particles and you have your
 *    vectorial force to add to the particles.
 */
static gmx_inline __m256
gmx_mm256_pmecorrF_ps(__m256 z2)
{
    const __m256 FN6 = _mm256_set1_ps(-1.7357322914161492954e-8f);
    const __m256 FN5 = _mm256_set1_ps(1.4703624142580877519e-6f);
    const __m256 FN4 = _mm256_set1_ps(-0.000053401640219807709149f);
    const __m256 FN3 = _mm256_set1_ps(0.0010054721316683106153f);
    const __m256 FN2 = _mm256_set1_ps(-0.019278317264888380590f);
    const __m256 FN1 = _mm256_set1_ps(0.069670166153766424023f);
    const __m256 FN0 = _mm256_set1_ps(-0.75225204789749321333f);

    const __m256 FD4 = _mm256_set1_ps(0.0011193462567257629232f);
    const __m256 FD3 = _mm256_set1_ps(0.014866955030185295499f);
    const __m256 FD2 = _mm256_set1_ps(0.11583842382862377919f);
    const __m256 FD1 = _mm256_set1_ps(0.50736591960530292870f);
    const __m256 FD0 = _mm256_set1_ps(1.0f);

    __m256       z4;
    __m256       polyFN0, polyFN1, polyFD0, polyFD1;

    z4 = _mm256_mul_ps(z2, z2);

    polyFD0 = _mm256_mul_ps(FD4, z4);
    polyFD1 = _mm256_mul_ps(FD3, z4);
    polyFD0 = _mm256_add_ps(polyFD0, FD2);
    polyFD1 = _mm256_add_ps(polyFD1, FD1);
    polyFD0 = _mm256_mul_ps(polyFD0, z4);
    polyFD1 = _mm256_mul_ps(polyFD1, z2);
    polyFD0 = _mm256_add_ps(polyFD0, FD0);
    polyFD0 = _mm256_add_ps(polyFD0, polyFD1);

    polyFD0 = gmx_mm256_inv_ps(polyFD0);

    polyFN0 = _mm256_mul_ps(FN6, z4);
    polyFN1 = _mm256_mul_ps(FN5, z4);
    polyFN0 = _mm256_add_ps(polyFN0, FN4);
    polyFN1 = _mm256_add_ps(polyFN1, FN3);
    polyFN0 = _mm256_mul_ps(polyFN0, z4);
    polyFN1 = _mm256_mul_ps(polyFN1, z4);
    polyFN0 = _mm256_add_ps(polyFN0, FN2);
    polyFN1 = _mm256_add_ps(polyFN1, FN1);
    polyFN0 = _mm256_mul_ps(polyFN0, z4);
    polyFN1 = _mm256_mul_ps(polyFN1, z2);
    polyFN0 = _mm256_add_ps(polyFN0, FN0);
    polyFN0 = _mm256_add_ps(polyFN0, polyFN1);

    return _mm256_mul_ps(polyFN0, polyFD0);
}
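/* Usage sketch (added; not part of the original header). It follows the
 * six-step recipe in the comment above gmx_mm256_pmecorrF_ps(); the function
 * name and the idea of passing a precomputed beta3 = beta^3 are illustrative
 * assumptions only. Given squared distances r2, it returns the scalar force
 * divided by r, still to be multiplied by the charge product q_i*q_j and by
 * the vector connecting the two particles. */
static gmx_inline __m256
gmx_mm256_example_pme_fscal_ps(__m256 r2, __m256 beta, __m256 beta3)
{
    __m256 z2    = _mm256_mul_ps(_mm256_mul_ps(beta, beta), r2);  /* z^2 = beta^2*r^2 */
    __m256 corr  = _mm256_mul_ps(beta3, gmx_mm256_pmecorrF_ps(z2));
    __m256 rinv  = gmx_mm256_invsqrt_ps(r2);
    __m256 rinv3 = _mm256_mul_ps(rinv, _mm256_mul_ps(rinv, rinv));

    return _mm256_add_ps(rinv3, corr); /* 1/r^3 + beta^3*pmecorrF(z^2) */
}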
static gmx_inline __m128
gmx_mm_pmecorrF_ps(__m128 z2)
{
    const __m128 FN6 = _mm_set1_ps(-1.7357322914161492954e-8f);
    const __m128 FN5 = _mm_set1_ps(1.4703624142580877519e-6f);
    const __m128 FN4 = _mm_set1_ps(-0.000053401640219807709149f);
    const __m128 FN3 = _mm_set1_ps(0.0010054721316683106153f);
    const __m128 FN2 = _mm_set1_ps(-0.019278317264888380590f);
    const __m128 FN1 = _mm_set1_ps(0.069670166153766424023f);
    const __m128 FN0 = _mm_set1_ps(-0.75225204789749321333f);

    const __m128 FD4 = _mm_set1_ps(0.0011193462567257629232f);
    const __m128 FD3 = _mm_set1_ps(0.014866955030185295499f);
    const __m128 FD2 = _mm_set1_ps(0.11583842382862377919f);
    const __m128 FD1 = _mm_set1_ps(0.50736591960530292870f);
    const __m128 FD0 = _mm_set1_ps(1.0f);

    __m128       z4;
    __m128       polyFN0, polyFN1, polyFD0, polyFD1;

    z4 = _mm_mul_ps(z2, z2);

    polyFD0 = _mm_mul_ps(FD4, z4);
    polyFD1 = _mm_mul_ps(FD3, z4);
    polyFD0 = _mm_add_ps(polyFD0, FD2);
    polyFD1 = _mm_add_ps(polyFD1, FD1);
    polyFD0 = _mm_mul_ps(polyFD0, z4);
    polyFD1 = _mm_mul_ps(polyFD1, z2);
    polyFD0 = _mm_add_ps(polyFD0, FD0);
    polyFD0 = _mm_add_ps(polyFD0, polyFD1);

    polyFD0 = gmx_mm_inv_ps(polyFD0);

    polyFN0 = _mm_mul_ps(FN6, z4);
    polyFN1 = _mm_mul_ps(FN5, z4);
    polyFN0 = _mm_add_ps(polyFN0, FN4);
    polyFN1 = _mm_add_ps(polyFN1, FN3);
    polyFN0 = _mm_mul_ps(polyFN0, z4);
    polyFN1 = _mm_mul_ps(polyFN1, z4);
    polyFN0 = _mm_add_ps(polyFN0, FN2);
    polyFN1 = _mm_add_ps(polyFN1, FN1);
    polyFN0 = _mm_mul_ps(polyFN0, z4);
    polyFN1 = _mm_mul_ps(polyFN1, z2);
    polyFN0 = _mm_add_ps(polyFN0, FN0);
    polyFN0 = _mm_add_ps(polyFN0, polyFN1);

    return _mm_mul_ps(polyFN0, polyFD0);
}
/* Calculate the potential correction due to PME analytically.
 *
 * See gmx_mm256_pmecorrF_ps() for details about the approximation.
 *
 * This routine calculates Erf(z)/z, although you should provide z^2
 * as the input argument.
 *
 * Here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *        erf(z)
 *       --------
 *          z
 *
 * 5. Multiply the entire expression by beta and switch back to r (z=r*beta):
 *
 *       erf(r*beta)
 *       -----------
 *            r
 *
 * 6. Subtract the result from 1/r, multiply by the product of the charges,
 *    and you have your potential.
 */
static gmx_inline __m256
gmx_mm256_pmecorrV_ps(__m256 z2)
{
    const __m256 VN6 = _mm256_set1_ps(1.9296833005951166339e-8f);
    const __m256 VN5 = _mm256_set1_ps(-1.4213390571557850962e-6f);
    const __m256 VN4 = _mm256_set1_ps(0.000041603292906656984871f);
    const __m256 VN3 = _mm256_set1_ps(-0.00013134036773265025626f);
    const __m256 VN2 = _mm256_set1_ps(0.038657983986041781264f);
    const __m256 VN1 = _mm256_set1_ps(0.11285044772717598220f);
    const __m256 VN0 = _mm256_set1_ps(1.1283802385263030286f);

    const __m256 VD3 = _mm256_set1_ps(0.0066752224023576045451f);
    const __m256 VD2 = _mm256_set1_ps(0.078647795836373922256f);
    const __m256 VD1 = _mm256_set1_ps(0.43336185284710920150f);
    const __m256 VD0 = _mm256_set1_ps(1.0f);

    __m256       z4;
    __m256       polyVN0, polyVN1, polyVD0, polyVD1;

    z4 = _mm256_mul_ps(z2, z2);

    polyVD1 = _mm256_mul_ps(VD3, z4);
    polyVD0 = _mm256_mul_ps(VD2, z4);
    polyVD1 = _mm256_add_ps(polyVD1, VD1);
    polyVD0 = _mm256_add_ps(polyVD0, VD0);
    polyVD1 = _mm256_mul_ps(polyVD1, z2);
    polyVD0 = _mm256_add_ps(polyVD0, polyVD1);

    polyVD0 = gmx_mm256_inv_ps(polyVD0);

    polyVN0 = _mm256_mul_ps(VN6, z4);
    polyVN1 = _mm256_mul_ps(VN5, z4);
    polyVN0 = _mm256_add_ps(polyVN0, VN4);
    polyVN1 = _mm256_add_ps(polyVN1, VN3);
    polyVN0 = _mm256_mul_ps(polyVN0, z4);
    polyVN1 = _mm256_mul_ps(polyVN1, z4);
    polyVN0 = _mm256_add_ps(polyVN0, VN2);
    polyVN1 = _mm256_add_ps(polyVN1, VN1);
    polyVN0 = _mm256_mul_ps(polyVN0, z4);
    polyVN1 = _mm256_mul_ps(polyVN1, z2);
    polyVN0 = _mm256_add_ps(polyVN0, VN0);
    polyVN0 = _mm256_add_ps(polyVN0, polyVN1);

    return _mm256_mul_ps(polyVN0, polyVD0);
}
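/* Usage sketch (added; not part of the original header, names are
 * illustrative). Per the recipe above: V = qq*(1/r - beta*pmecorrV(z^2)). */
static gmx_inline __m256
gmx_mm256_example_pme_vcoul_ps(__m256 r2, __m256 beta, __m256 qq)
{
    __m256 z2   = _mm256_mul_ps(_mm256_mul_ps(beta, beta), r2); /* z^2 = beta^2*r^2 */
    __m256 rinv = gmx_mm256_invsqrt_ps(r2);
    __m256 corr = _mm256_mul_ps(beta, gmx_mm256_pmecorrV_ps(z2));

    return _mm256_mul_ps(qq, _mm256_sub_ps(rinv, corr));
}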
static gmx_inline __m128
gmx_mm_pmecorrV_ps(__m128 z2)
{
    const __m128 VN6 = _mm_set1_ps(1.9296833005951166339e-8f);
    const __m128 VN5 = _mm_set1_ps(-1.4213390571557850962e-6f);
    const __m128 VN4 = _mm_set1_ps(0.000041603292906656984871f);
    const __m128 VN3 = _mm_set1_ps(-0.00013134036773265025626f);
    const __m128 VN2 = _mm_set1_ps(0.038657983986041781264f);
    const __m128 VN1 = _mm_set1_ps(0.11285044772717598220f);
    const __m128 VN0 = _mm_set1_ps(1.1283802385263030286f);

    const __m128 VD3 = _mm_set1_ps(0.0066752224023576045451f);
    const __m128 VD2 = _mm_set1_ps(0.078647795836373922256f);
    const __m128 VD1 = _mm_set1_ps(0.43336185284710920150f);
    const __m128 VD0 = _mm_set1_ps(1.0f);

    __m128       z4;
    __m128       polyVN0, polyVN1, polyVD0, polyVD1;

    z4 = _mm_mul_ps(z2, z2);

    polyVD1 = _mm_mul_ps(VD3, z4);
    polyVD0 = _mm_mul_ps(VD2, z4);
    polyVD1 = _mm_add_ps(polyVD1, VD1);
    polyVD0 = _mm_add_ps(polyVD0, VD0);
    polyVD1 = _mm_mul_ps(polyVD1, z2);
    polyVD0 = _mm_add_ps(polyVD0, polyVD1);

    polyVD0 = gmx_mm_inv_ps(polyVD0);

    polyVN0 = _mm_mul_ps(VN6, z4);
    polyVN1 = _mm_mul_ps(VN5, z4);
    polyVN0 = _mm_add_ps(polyVN0, VN4);
    polyVN1 = _mm_add_ps(polyVN1, VN3);
    polyVN0 = _mm_mul_ps(polyVN0, z4);
    polyVN1 = _mm_mul_ps(polyVN1, z4);
    polyVN0 = _mm_add_ps(polyVN0, VN2);
    polyVN1 = _mm_add_ps(polyVN1, VN1);
    polyVN0 = _mm_mul_ps(polyVN0, z4);
    polyVN1 = _mm_mul_ps(polyVN1, z2);
    polyVN0 = _mm_add_ps(polyVN0, VN0);
    polyVN0 = _mm_add_ps(polyVN0, polyVN1);

    return _mm_mul_ps(polyVN0, polyVD0);
}
static gmx_inline void
gmx_mm256_sincos_ps(__m256  x,
                    __m256 *sinval,
                    __m256 *cosval)
{
    const __m256  two_over_pi = _mm256_set1_ps(2.0f/(float)M_PI);
    const __m256  half        = _mm256_set1_ps(0.5f);
    const __m256  one         = _mm256_set1_ps(1.0f);
    const __m256  zero        = _mm256_setzero_ps();

    const __m128i ione        = _mm_set1_epi32(1);

    const __m256  mask_one    = _mm256_castsi256_ps(_mm256_set1_epi32(1));
    const __m256  mask_two    = _mm256_castsi256_ps(_mm256_set1_epi32(2));
    const __m256  mask_three  = _mm256_castsi256_ps(_mm256_set1_epi32(3));

    const __m256  CA1         = _mm256_set1_ps(1.5703125f);
    const __m256  CA2         = _mm256_set1_ps(4.837512969970703125e-4f);
    const __m256  CA3         = _mm256_set1_ps(7.54978995489188216e-8f);

    const __m256  CC0         = _mm256_set1_ps(-0.0013602249f);
    const __m256  CC1         = _mm256_set1_ps(0.0416566950f);
    const __m256  CC2         = _mm256_set1_ps(-0.4999990225f);
    const __m256  CS0         = _mm256_set1_ps(-0.0001950727f);
    const __m256  CS1         = _mm256_set1_ps(0.0083320758f);
    const __m256  CS2         = _mm256_set1_ps(-0.1666665247f);

    const __m256  signbit     = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );

    __m256        y, y2;
    __m256        z;
    __m256i       iz;
    __m128i       iz_high, iz_low;
    __m256        offset_sin, offset_cos;
    __m256        tmp1, tmp2;
    __m256        mask_sin, mask_cos;
    __m256        tmp_sin, tmp_cos;

    y = _mm256_mul_ps(x, two_over_pi);
    y = _mm256_add_ps(y, _mm256_or_ps(_mm256_and_ps(y, signbit), half));

    iz = _mm256_cvttps_epi32(y);
    z  = _mm256_round_ps(y, _MM_FROUND_TO_ZERO);

    offset_sin = _mm256_and_ps(_mm256_castsi256_ps(iz), mask_three);

    iz_high    = _mm256_extractf128_si256(iz, 0x1);
    iz_low     = _mm256_castsi256_si128(iz);
    iz_low     = _mm_add_epi32(iz_low, ione);
    iz_high    = _mm_add_epi32(iz_high, ione);
    iz         = _mm256_castsi128_si256(iz_low);
    iz         = _mm256_insertf128_si256(iz, iz_high, 0x1);
    offset_cos = _mm256_castsi256_ps(iz);

    /* Extended precision arithmetic to achieve full precision */
    y    = _mm256_mul_ps(z, CA1);
    tmp1 = _mm256_mul_ps(z, CA2);
    tmp2 = _mm256_mul_ps(z, CA3);
    y    = _mm256_sub_ps(x, y);
    y    = _mm256_sub_ps(y, tmp1);
    y    = _mm256_sub_ps(y, tmp2);

    y2   = _mm256_mul_ps(y, y);

    tmp1 = _mm256_mul_ps(CC0, y2);
    tmp1 = _mm256_add_ps(tmp1, CC1);
    tmp2 = _mm256_mul_ps(CS0, y2);
    tmp2 = _mm256_add_ps(tmp2, CS1);
    tmp1 = _mm256_mul_ps(tmp1, y2);
    tmp1 = _mm256_add_ps(tmp1, CC2);
    tmp2 = _mm256_mul_ps(tmp2, y2);
    tmp2 = _mm256_add_ps(tmp2, CS2);

    tmp1 = _mm256_mul_ps(tmp1, y2);
    tmp1 = _mm256_add_ps(tmp1, one);

    tmp2 = _mm256_mul_ps(tmp2, _mm256_mul_ps(y, y2));
    tmp2 = _mm256_add_ps(tmp2, y);

#ifdef __INTEL_COMPILER
    /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
    mask_sin = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin, mask_one))), zero, _CMP_EQ_OQ);
    mask_cos = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos, mask_one))), zero, _CMP_EQ_OQ);
#else
    mask_sin = _mm256_cmp_ps( _mm256_and_ps(offset_sin, mask_one), zero, _CMP_EQ_OQ);
    mask_cos = _mm256_cmp_ps( _mm256_and_ps(offset_cos, mask_one), zero, _CMP_EQ_OQ);
#endif
    tmp_sin = _mm256_blendv_ps(tmp1, tmp2, mask_sin);
    tmp_cos = _mm256_blendv_ps(tmp1, tmp2, mask_cos);

    tmp1 = _mm256_xor_ps(signbit, tmp_sin);
    tmp2 = _mm256_xor_ps(signbit, tmp_cos);

#ifdef __INTEL_COMPILER
    /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
    mask_sin = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin, mask_two))), zero, _CMP_EQ_OQ);
    mask_cos = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos, mask_two))), zero, _CMP_EQ_OQ);
#else
    mask_sin = _mm256_cmp_ps( _mm256_and_ps(offset_sin, mask_two), zero, _CMP_EQ_OQ);
    mask_cos = _mm256_cmp_ps( _mm256_and_ps(offset_cos, mask_two), zero, _CMP_EQ_OQ);
#endif

    *sinval = _mm256_blendv_ps(tmp1, tmp_sin, mask_sin);
    *cosval = _mm256_blendv_ps(tmp2, tmp_cos, mask_cos);
}
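/* Comment added for clarity (not in the original header): the argument is
 * reduced by z = nearest integer multiple of pi/2 (via x*2/pi +- 0.5,
 * truncated), with pi/2 split into three constants (CA1+CA2+CA3) so the
 * subtraction is exact in single precision. Bit 0 of the quadrant offset
 * selects between the sine and cosine polynomials, bit 1 selects the sign,
 * and the cosine offset is simply the sine offset plus one. */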
1641 gmx_mm_sincos_ps(__m128 x,
1645 const __m128 two_over_pi = _mm_set1_ps(2.0/M_PI);
1646 const __m128 half = _mm_set1_ps(0.5);
1647 const __m128 one = _mm_set1_ps(1.0);
1649 const __m128i izero = _mm_set1_epi32(0);
1650 const __m128i ione = _mm_set1_epi32(1);
1651 const __m128i itwo = _mm_set1_epi32(2);
1652 const __m128i ithree = _mm_set1_epi32(3);
1653 const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1655 const __m128 CA1 = _mm_set1_ps(1.5703125f);
1656 const __m128 CA2 = _mm_set1_ps(4.837512969970703125e-4f);
1657 const __m128 CA3 = _mm_set1_ps(7.54978995489188216e-8f);
1659 const __m128 CC0 = _mm_set1_ps(-0.0013602249f);
1660 const __m128 CC1 = _mm_set1_ps(0.0416566950f);
1661 const __m128 CC2 = _mm_set1_ps(-0.4999990225f);
1662 const __m128 CS0 = _mm_set1_ps(-0.0001950727f);
1663 const __m128 CS1 = _mm_set1_ps(0.0083320758f);
1664 const __m128 CS2 = _mm_set1_ps(-0.1666665247f);
1666 __m128 y, y2;
1667 __m128 z;
1668 __m128i iz;
1669 __m128i offset_sin, offset_cos;
1670 __m128 tmp1, tmp2;
1671 __m128 mask_sin, mask_cos;
1672 __m128 tmp_sin, tmp_cos;
1674 y = _mm_mul_ps(x, two_over_pi);
1675 y = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));
1677 iz = _mm_cvttps_epi32(y);
1678 z = _mm_round_ps(y, _MM_FROUND_TO_ZERO);
1680 offset_sin = _mm_and_si128(iz, ithree);
1681 offset_cos = _mm_add_epi32(iz, ione);
1683 /* Extended precision arithmetic to achieve full precision */
1684 y = _mm_mul_ps(z, CA1);
1685 tmp1 = _mm_mul_ps(z, CA2);
1686 tmp2 = _mm_mul_ps(z, CA3);
1687 y = _mm_sub_ps(x, y);
1688 y = _mm_sub_ps(y, tmp1);
1689 y = _mm_sub_ps(y, tmp2);
1691 y2 = _mm_mul_ps(y, y);
1693 tmp1 = _mm_mul_ps(CC0, y2);
1694 tmp1 = _mm_add_ps(tmp1, CC1);
1695 tmp2 = _mm_mul_ps(CS0, y2);
1696 tmp2 = _mm_add_ps(tmp2, CS1);
1697 tmp1 = _mm_mul_ps(tmp1, y2);
1698 tmp1 = _mm_add_ps(tmp1, CC2);
1699 tmp2 = _mm_mul_ps(tmp2, y2);
1700 tmp2 = _mm_add_ps(tmp2, CS2);
1702 tmp1 = _mm_mul_ps(tmp1, y2);
1703 tmp1 = _mm_add_ps(tmp1, one);
1705 tmp2 = _mm_mul_ps(tmp2, _mm_mul_ps(y, y2));
1706 tmp2 = _mm_add_ps(tmp2, y);
1708 mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
1709 mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));
1711 tmp_sin = _mm_blendv_ps(tmp1, tmp2, mask_sin);
1712 tmp_cos = _mm_blendv_ps(tmp1, tmp2, mask_cos);
1714 mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
1715 mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));
1717 tmp1 = _mm_xor_ps(signbit, tmp_sin);
1718 tmp2 = _mm_xor_ps(signbit, tmp_cos);
1720 *sinval = _mm_blendv_ps(tmp1, tmp_sin, mask_sin);
1721 *cosval = _mm_blendv_ps(tmp2, tmp_cos, mask_cos);
1723 return;
1724 }
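/*
 * For reference, a scalar sketch of the reduction both widths implement
 * (an illustration under the same constants, not code taken from GROMACS):
 *
 *     float y = x*(2.0f/M_PI);
 *     y += (y < 0.0f) ? -0.5f : 0.5f;           // bias so truncation rounds to nearest
 *     int   n = (int)y;                         // quadrant index
 *     float z = truncf(y);
 *     float r = ((x - z*CA1) - z*CA2) - z*CA3;  // extended-precision x - n*pi/2
 *
 * Bit 0 of the offset (n&3 for sin, (n+1)&3 for cos) selects which of the
 * two polynomials in r is used; bit 1 decides whether the result is negated.
 */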
1729 /*
1730 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
1731 * will then call the sincos() routine and waste a factor 2 in performance!
1732 */
1733 static __m256
1734 gmx_mm256_sin_ps(__m256 x)
1735 {
1736 __m256 s, c;
1737 gmx_mm256_sincos_ps(x, &s, &c);
1738 return s;
1739 }
1741 static __m128
1742 gmx_mm_sin_ps(__m128 x)
1743 {
1744 __m128 s, c;
1745 gmx_mm_sincos_ps(x, &s, &c);
1746 return s;
1747 }
1750 /*
1751 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
1752 * will then call the sincos() routine and waste a factor 2 in performance!
1753 */
1754 static __m256
1755 gmx_mm256_cos_ps(__m256 x)
1756 {
1757 __m256 s, c;
1758 gmx_mm256_sincos_ps(x, &s, &c);
1759 return c;
1760 }
1762 static __m128
1763 gmx_mm_cos_ps(__m128 x)
1764 {
1765 __m128 s, c;
1766 gmx_mm_sincos_ps(x, &s, &c);
1767 return c;
1768 }
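/*
 * Illustrative only: when both values are needed, one sincos call performs
 * the argument reduction once (variable names hypothetical):
 *
 *     __m256 s, c;
 *     gmx_mm256_sincos_ps(x, &s, &c);        // preferred
 *     // s = gmx_mm256_sin_ps(x);            // each wrapper re-runs sincos
 *     // c = gmx_mm256_cos_ps(x);
 */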
1771 static __m256
1772 gmx_mm256_tan_ps(__m256 x)
1773 {
1774 __m256 sinval, cosval;
1775 __m256 tanval;
1777 gmx_mm256_sincos_ps(x, &sinval, &cosval);
1779 tanval = _mm256_mul_ps(sinval, gmx_mm256_inv_ps(cosval));
1781 return tanval;
1782 }
1784 static __m128
1785 gmx_mm_tan_ps(__m128 x)
1786 {
1787 __m128 sinval, cosval;
1788 __m128 tanval;
1790 gmx_mm_sincos_ps(x, &sinval, &cosval);
1792 tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval));
1794 return tanval;
1795 }
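/*
 * Note on the design choice above: tan(x) is formed as sin(x)*(1/cos(x)),
 * where the reciprocal comes from the rcp-plus-Newton step in gmx_mm_inv_ps.
 * Expect reduced accuracy (and very large magnitudes) as x approaches odd
 * multiples of pi/2, where cos(x) tends toward zero.
 */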
1797 /* Same algorithm as cephes library */
1798 static __m256
1799 gmx_mm256_asin_ps(__m256 x)
1800 {
1801 const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
1802 const __m256 limitlow = _mm256_set1_ps(1e-4f);
1803 const __m256 half = _mm256_set1_ps(0.5f);
1804 const __m256 one = _mm256_set1_ps(1.0f);
1805 const __m256 halfpi = _mm256_set1_ps((float)M_PI/2.0f);
1807 const __m256 CC5 = _mm256_set1_ps(4.2163199048E-2f);
1808 const __m256 CC4 = _mm256_set1_ps(2.4181311049E-2f);
1809 const __m256 CC3 = _mm256_set1_ps(4.5470025998E-2f);
1810 const __m256 CC2 = _mm256_set1_ps(7.4953002686E-2f);
1811 const __m256 CC1 = _mm256_set1_ps(1.6666752422E-1f);
1813 __m256 sign;
1814 __m256 mask;
1815 __m256 xabs;
1816 __m256 z, z1, z2, q, q1, q2;
1817 __m256 pA, pB;
1819 sign = _mm256_andnot_ps(signmask, x);
1820 xabs = _mm256_and_ps(x, signmask);
1822 mask = _mm256_cmp_ps(xabs, half, _CMP_GT_OQ);
1824 z1 = _mm256_mul_ps(half, _mm256_sub_ps(one, xabs));
1825 q1 = _mm256_mul_ps(z1, gmx_mm256_invsqrt_ps(z1));
1826 q1 = _mm256_andnot_ps(_mm256_cmp_ps(xabs, one, _CMP_EQ_OQ), q1);
1828 q2 = xabs;
1829 z2 = _mm256_mul_ps(q2, q2);
1831 z = _mm256_blendv_ps(z2, z1, mask);
1832 q = _mm256_blendv_ps(q2, q1, mask);
1834 z2 = _mm256_mul_ps(z, z);
1836 pA = _mm256_mul_ps(CC5, z2);
1837 pB = _mm256_mul_ps(CC4, z2);
1839 pA = _mm256_add_ps(pA, CC3);
1840 pB = _mm256_add_ps(pB, CC2);
1842 pA = _mm256_mul_ps(pA, z2);
1843 pB = _mm256_mul_ps(pB, z2);
1845 pA = _mm256_add_ps(pA, CC1);
1846 pA = _mm256_mul_ps(pA, z);
1848 z = _mm256_add_ps(pA, pB);
1849 z = _mm256_mul_ps(z, q);
1850 z = _mm256_add_ps(z, q);
1852 q2 = _mm256_sub_ps(halfpi, z);
1853 q2 = _mm256_sub_ps(q2, z);
1855 z = _mm256_blendv_ps(z, q2, mask);
1857 mask = _mm256_cmp_ps(xabs, limitlow, _CMP_GT_OQ);
1858 z = _mm256_blendv_ps(xabs, z, mask);
1860 z = _mm256_xor_ps(z, sign);
1862 return z;
1863 }
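/*
 * The identities behind the branch above (standard cephes-style reduction,
 * written out here for clarity):
 *
 *     |x| <= 0.5 :  asin(x)   = x + x*z*P(z),        z = x*x
 *     |x| >  0.5 :  asin(|x|) = pi/2 - 2*asin(q),    q = sqrt((1-|x|)/2)
 *
 * with the sign of x restored at the end. q is computed as z*invsqrt(z),
 * so the |x| > 0.5 path reuses the fast reciprocal square root.
 */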
1865 static __m128
1866 gmx_mm_asin_ps(__m128 x)
1867 {
1868 /* Same algorithm as cephes library */
1869 const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
1870 const __m128 limitlow = _mm_set1_ps(1e-4f);
1871 const __m128 half = _mm_set1_ps(0.5f);
1872 const __m128 one = _mm_set1_ps(1.0f);
1873 const __m128 halfpi = _mm_set1_ps(M_PI/2.0f);
1875 const __m128 CC5 = _mm_set1_ps(4.2163199048E-2f);
1876 const __m128 CC4 = _mm_set1_ps(2.4181311049E-2f);
1877 const __m128 CC3 = _mm_set1_ps(4.5470025998E-2f);
1878 const __m128 CC2 = _mm_set1_ps(7.4953002686E-2f);
1879 const __m128 CC1 = _mm_set1_ps(1.6666752422E-1f);
1881 __m128 sign;
1882 __m128 mask;
1883 __m128 xabs;
1884 __m128 z, z1, z2, q, q1, q2;
1885 __m128 pA, pB;
1887 sign = _mm_andnot_ps(signmask, x);
1888 xabs = _mm_and_ps(x, signmask);
1890 mask = _mm_cmpgt_ps(xabs, half);
1892 z1 = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
1893 q1 = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
1894 q1 = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one), q1);
1896 q2 = xabs;
1897 z2 = _mm_mul_ps(q2, q2);
1899 z = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
1900 q = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );
1902 z2 = _mm_mul_ps(z, z);
1904 pA = _mm_mul_ps(CC5, z2);
1905 pB = _mm_mul_ps(CC4, z2);
1907 pA = _mm_add_ps(pA, CC3);
1908 pB = _mm_add_ps(pB, CC2);
1910 pA = _mm_mul_ps(pA, z2);
1911 pB = _mm_mul_ps(pB, z2);
1913 pA = _mm_add_ps(pA, CC1);
1914 pA = _mm_mul_ps(pA, z);
1916 z = _mm_add_ps(pA, pB);
1917 z = _mm_mul_ps(z, q);
1918 z = _mm_add_ps(z, q);
1920 q2 = _mm_sub_ps(halfpi, z);
1921 q2 = _mm_sub_ps(q2, z);
1923 z = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );
1925 mask = _mm_cmpgt_ps(xabs, limitlow);
1926 z = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );
1928 z = _mm_xor_ps(z, sign);
1930 return z;
1931 }
1934 static __m256
1935 gmx_mm256_acos_ps(__m256 x)
1936 {
1937 const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
1938 const __m256 one_ps = _mm256_set1_ps(1.0f);
1939 const __m256 half_ps = _mm256_set1_ps(0.5f);
1940 const __m256 pi_ps = _mm256_set1_ps((float)M_PI);
1941 const __m256 halfpi_ps = _mm256_set1_ps((float)M_PI/2.0f);
1943 __m256 mask1;
1944 __m256 mask2;
1945 __m256 xabs;
1946 __m256 z, z1, z2, z3;
1948 xabs = _mm256_and_ps(x, signmask);
1949 mask1 = _mm256_cmp_ps(xabs, half_ps, _CMP_GT_OQ);
1950 mask2 = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_GT_OQ);
1952 z = _mm256_mul_ps(half_ps, _mm256_sub_ps(one_ps, xabs));
1953 z = _mm256_mul_ps(z, gmx_mm256_invsqrt_ps(z));
1954 z = _mm256_andnot_ps(_mm256_cmp_ps(xabs, one_ps, _CMP_EQ_OQ), z);
1956 z = _mm256_blendv_ps(x, z, mask1);
1957 z = gmx_mm256_asin_ps(z);
1959 z2 = _mm256_add_ps(z, z);
1960 z1 = _mm256_sub_ps(pi_ps, z2);
1961 z3 = _mm256_sub_ps(halfpi_ps, z);
1963 z = _mm256_blendv_ps(z1, z2, mask2);
1964 z = _mm256_blendv_ps(z3, z, mask1);
1966 return z;
1967 }
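/*
 * Written out, the identities used above are (for a = |x|):
 *
 *     a <= 0.5 :  acos(x) = pi/2 - asin(x)
 *     x >  0.5 :  acos(x) = 2*asin(sqrt((1-x)/2))
 *     x < -0.5 :  acos(x) = pi - 2*asin(sqrt((1-a)/2))
 *
 * which is why the argument passed to gmx_mm256_asin_ps is either x itself
 * or sqrt((1-a)/2), selected by mask1.
 */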
1969 static __m128
1970 gmx_mm_acos_ps(__m128 x)
1971 {
1972 const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
1973 const __m128 one_ps = _mm_set1_ps(1.0f);
1974 const __m128 half_ps = _mm_set1_ps(0.5f);
1975 const __m128 pi_ps = _mm_set1_ps(M_PI);
1976 const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
1978 __m128 mask1;
1979 __m128 mask2;
1980 __m128 xabs;
1981 __m128 z, z1, z2, z3;
1983 xabs = _mm_and_ps(x, signmask);
1984 mask1 = _mm_cmpgt_ps(xabs, half_ps);
1985 mask2 = _mm_cmpgt_ps(x, _mm_setzero_ps());
1987 z = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
1988 z = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
1989 z = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one_ps), z);
1991 z = _mm_blendv_ps(x, z, mask1);
1992 z = gmx_mm_asin_ps(z);
1994 z2 = _mm_add_ps(z, z);
1995 z1 = _mm_sub_ps(pi_ps, z2);
1996 z3 = _mm_sub_ps(halfpi_ps, z);
1998 z = _mm_blendv_ps(z1, z2, mask2);
1999 z = _mm_blendv_ps(z3, z, mask1);
2001 return z;
2002 }
2005 static __m256
2006 gmx_mm256_atan_ps(__m256 x)
2007 {
2008 const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
2009 const __m256 limit1 = _mm256_set1_ps(0.414213562373095f);
2010 const __m256 limit2 = _mm256_set1_ps(2.414213562373095f);
2011 const __m256 quarterpi = _mm256_set1_ps(0.785398163397448f);
2012 const __m256 halfpi = _mm256_set1_ps(1.570796326794896f);
2013 const __m256 mone = _mm256_set1_ps(-1.0f);
2014 const __m256 CC3 = _mm256_set1_ps(-3.33329491539E-1f);
2015 const __m256 CC5 = _mm256_set1_ps(1.99777106478E-1f);
2016 const __m256 CC7 = _mm256_set1_ps(-1.38776856032E-1f);
2017 const __m256 CC9 = _mm256_set1_ps(8.05374449538e-2f);
2019 __m256 sign;
2020 __m256 mask1, mask2;
2021 __m256 y, z1, z2;
2022 __m256 x2, x4;
2023 __m256 sum1, sum2;
2025 sign = _mm256_andnot_ps(signmask, x);
2026 x = _mm256_and_ps(x, signmask);
2028 mask1 = _mm256_cmp_ps(x, limit1, _CMP_GT_OQ);
2029 mask2 = _mm256_cmp_ps(x, limit2, _CMP_GT_OQ);
2031 z1 = _mm256_mul_ps(_mm256_add_ps(x, mone), gmx_mm256_inv_ps(_mm256_sub_ps(x, mone)));
2032 z2 = _mm256_mul_ps(mone, gmx_mm256_inv_ps(x));
2034 y = _mm256_and_ps(mask1, quarterpi);
2035 y = _mm256_blendv_ps(y, halfpi, mask2);
2037 x = _mm256_blendv_ps(x, z1, mask1);
2038 x = _mm256_blendv_ps(x, z2, mask2);
2040 x2 = _mm256_mul_ps(x, x);
2041 x4 = _mm256_mul_ps(x2, x2);
2043 sum1 = _mm256_mul_ps(CC9, x4);
2044 sum2 = _mm256_mul_ps(CC7, x4);
2045 sum1 = _mm256_add_ps(sum1, CC5);
2046 sum2 = _mm256_add_ps(sum2, CC3);
2047 sum1 = _mm256_mul_ps(sum1, x4);
2048 sum2 = _mm256_mul_ps(sum2, x2);
2050 sum1 = _mm256_add_ps(sum1, sum2);
2051 sum1 = _mm256_sub_ps(sum1, mone);
2052 sum1 = _mm256_mul_ps(sum1, x);
2053 y = _mm256_add_ps(y, sum1);
2055 y = _mm256_xor_ps(y, sign);
2057 return y;
2058 }
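/*
 * The two limits are tan(pi/8) = sqrt(2)-1 and tan(3*pi/8) = sqrt(2)+1,
 * giving the usual cephes-style range reduction:
 *
 *     x > tan(3*pi/8) :  atan(x) = pi/2 - atan(1/x)
 *     x > tan(pi/8)   :  atan(x) = pi/4 + atan((x-1)/(x+1))
 *
 * after which the polynomial is evaluated on the reduced argument and the
 * original sign is restored by the final xor.
 */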
2060 static __m128
2061 gmx_mm_atan_ps(__m128 x)
2062 {
2063 /* Same algorithm as cephes library */
2064 const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
2065 const __m128 limit1 = _mm_set1_ps(0.414213562373095f);
2066 const __m128 limit2 = _mm_set1_ps(2.414213562373095f);
2067 const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
2068 const __m128 halfpi = _mm_set1_ps(1.570796326794896f);
2069 const __m128 mone = _mm_set1_ps(-1.0f);
2070 const __m128 CC3 = _mm_set1_ps(-3.33329491539E-1f);
2071 const __m128 CC5 = _mm_set1_ps(1.99777106478E-1f);
2072 const __m128 CC7 = _mm_set1_ps(-1.38776856032E-1f);
2073 const __m128 CC9 = _mm_set1_ps(8.05374449538e-2f);
2075 __m128 sign;
2076 __m128 mask1, mask2;
2077 __m128 y, z1, z2;
2078 __m128 x2, x4;
2079 __m128 sum1, sum2;
2081 sign = _mm_andnot_ps(signmask, x);
2082 x = _mm_and_ps(x, signmask);
2084 mask1 = _mm_cmpgt_ps(x, limit1);
2085 mask2 = _mm_cmpgt_ps(x, limit2);
2087 z1 = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
2088 z2 = _mm_mul_ps(mone, gmx_mm_inv_ps(x));
2090 y = _mm_and_ps(mask1, quarterpi);
2091 y = _mm_blendv_ps(y, halfpi, mask2);
2093 x = _mm_blendv_ps(x, z1, mask1);
2094 x = _mm_blendv_ps(x, z2, mask2);
2096 x2 = _mm_mul_ps(x, x);
2097 x4 = _mm_mul_ps(x2, x2);
2099 sum1 = _mm_mul_ps(CC9, x4);
2100 sum2 = _mm_mul_ps(CC7, x4);
2101 sum1 = _mm_add_ps(sum1, CC5);
2102 sum2 = _mm_add_ps(sum2, CC3);
2103 sum1 = _mm_mul_ps(sum1, x4);
2104 sum2 = _mm_mul_ps(sum2, x2);
2106 sum1 = _mm_add_ps(sum1, sum2);
2107 sum1 = _mm_sub_ps(sum1, mone);
2108 sum1 = _mm_mul_ps(sum1, x);
2109 y = _mm_add_ps(y, sum1);
2111 y = _mm_xor_ps(y, sign);
2113 return y;
2114 }
2117 static __m256
2118 gmx_mm256_atan2_ps(__m256 y, __m256 x)
2119 {
2120 const __m256 pi = _mm256_set1_ps( (float) M_PI);
2121 const __m256 minuspi = _mm256_set1_ps( (float) -M_PI);
2122 const __m256 halfpi = _mm256_set1_ps( (float) M_PI/2.0f);
2123 const __m256 minushalfpi = _mm256_set1_ps( (float) -M_PI/2.0f);
2125 __m256 z, z1, z3, z4;
2126 __m256 w;
2127 __m256 maskx_lt, maskx_eq;
2128 __m256 masky_lt, masky_eq;
2129 __m256 mask1, mask2, mask3, mask4, maskall;
2131 maskx_lt = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
2132 masky_lt = _mm256_cmp_ps(y, _mm256_setzero_ps(), _CMP_LT_OQ);
2133 maskx_eq = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
2134 masky_eq = _mm256_cmp_ps(y, _mm256_setzero_ps(), _CMP_EQ_OQ);
2136 z = _mm256_mul_ps(y, gmx_mm256_inv_ps(x));
2137 z = gmx_mm256_atan_ps(z);
2139 mask1 = _mm256_and_ps(maskx_eq, masky_lt);
2140 mask2 = _mm256_andnot_ps(maskx_lt, masky_eq);
2141 mask3 = _mm256_andnot_ps( _mm256_or_ps(masky_lt, masky_eq), maskx_eq);
2142 mask4 = _mm256_and_ps(maskx_lt, masky_eq);
2143 maskall = _mm256_or_ps( _mm256_or_ps(mask1, mask2), _mm256_or_ps(mask3, mask4) );
2145 z = _mm256_andnot_ps(maskall, z);
2146 z1 = _mm256_and_ps(mask1, minushalfpi);
2147 z3 = _mm256_and_ps(mask3, halfpi);
2148 z4 = _mm256_and_ps(mask4, pi);
2150 z = _mm256_or_ps( _mm256_or_ps(z, z1), _mm256_or_ps(z3, z4) );
2152 w = _mm256_blendv_ps(pi, minuspi, masky_lt);
2153 w = _mm256_and_ps(w, maskx_lt);
2155 w = _mm256_andnot_ps(maskall, w);
2157 z = _mm256_add_ps(z, w);
2159 return z;
2160 }
2162 static __m128
2163 gmx_mm_atan2_ps(__m128 y, __m128 x)
2164 {
2165 const __m128 pi = _mm_set1_ps(M_PI);
2166 const __m128 minuspi = _mm_set1_ps(-M_PI);
2167 const __m128 halfpi = _mm_set1_ps(M_PI/2.0);
2168 const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
2170 __m128 z, z1, z3, z4;
2171 __m128 w;
2172 __m128 maskx_lt, maskx_eq;
2173 __m128 masky_lt, masky_eq;
2174 __m128 mask1, mask2, mask3, mask4, maskall;
2176 maskx_lt = _mm_cmplt_ps(x, _mm_setzero_ps());
2177 masky_lt = _mm_cmplt_ps(y, _mm_setzero_ps());
2178 maskx_eq = _mm_cmpeq_ps(x, _mm_setzero_ps());
2179 masky_eq = _mm_cmpeq_ps(y, _mm_setzero_ps());
2181 z = _mm_mul_ps(y, gmx_mm_inv_ps(x));
2182 z = gmx_mm_atan_ps(z);
2184 mask1 = _mm_and_ps(maskx_eq, masky_lt);
2185 mask2 = _mm_andnot_ps(maskx_lt, masky_eq);
2186 mask3 = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
2187 mask4 = _mm_and_ps(masky_eq, maskx_lt);
2189 maskall = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );
2191 z = _mm_andnot_ps(maskall, z);
2192 z1 = _mm_and_ps(mask1, minushalfpi);
2193 z3 = _mm_and_ps(mask3, halfpi);
2194 z4 = _mm_and_ps(mask4, pi);
2196 z = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );
2198 mask1 = _mm_andnot_ps(masky_lt, maskx_lt);
2199 mask2 = _mm_and_ps(maskx_lt, masky_lt);
2201 w = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
2202 w = _mm_andnot_ps(maskall, w);
2204 z = _mm_add_ps(z, w);
2206 return z;
2207 }
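/*
 * Special-case summary for both atan2 versions above (z is the result):
 *
 *     x == 0, y <  0  :  -pi/2
 *     x == 0, y >  0  :  +pi/2
 *     x >= 0, y == 0  :   0
 *     x <  0, y == 0  :   pi
 *     x <  0, y != 0  :   atan(y/x) +/- pi  (quadrant correction via w)
 *
 * Illustrative use (hypothetical vectors vx, vy of direction components):
 *
 *     __m128 angle = gmx_mm_atan2_ps(vy, vx);   // per-element atan2(y, x)
 */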
2209 #endif /* _gmx_math_x86_avx_256_single_h_ */