/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _gmx_math_x86_avx_256_single_h_
#define _gmx_math_x86_avx_256_single_h_

#include <math.h>

#include "gmx_x86_avx_256.h"

#ifndef M_PI
#  define M_PI 3.14159265358979323846264338327950288
#endif
/************************
 *                      *
 * Simple math routines *
 *                      *
 ************************/
/* 1.0/sqrt(x), 256-bit wide version */
static gmx_inline __m256
gmx_mm256_invsqrt_ps(__m256 x)
{
    const __m256 half  = _mm256_set1_ps(0.5f);
    const __m256 three = _mm256_set1_ps(3.0f);

    /* One Newton-Raphson iteration refines the ~11-bit rsqrt estimate
     * to nearly full single precision.
     */
    __m256 lu = _mm256_rsqrt_ps(x);

    return _mm256_mul_ps(half,_mm256_mul_ps(_mm256_sub_ps(three,_mm256_mul_ps(_mm256_mul_ps(lu,lu),x)),lu));
}
/* 1.0/sqrt(x), 128-bit wide version */
static gmx_inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half  = _mm_set1_ps(0.5f);
    const __m128 three = _mm_set1_ps(3.0f);

    __m128 lu = _mm_rsqrt_ps(x);

    return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
}
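
/* Example usage (illustrative sketch; the arrays and values here are
 * hypothetical, not part of this header): refine eight reciprocal square
 * roots in a single call.
 *
 *   float  r2[8] = {1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f};
 *   float  rinv[8];
 *   __m256 x     = _mm256_loadu_ps(r2);
 *   _mm256_storeu_ps(rinv,gmx_mm256_invsqrt_ps(x));
 *   // rinv[i] now holds 1.0f/sqrtf(r2[i]) to close to full single precision.
 */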
/* sqrt(x) (256 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
static gmx_inline __m256
gmx_mm256_sqrt_ps(__m256 x)
{
    __m256 mask;
    __m256 res;

    /* rsqrt(0.0) returns infinity, so mask out zero arguments to get sqrt(0.0)=0.0 */
    mask = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_EQ_OQ);
    res  = _mm256_andnot_ps(mask,gmx_mm256_invsqrt_ps(x));

    res  = _mm256_mul_ps(x,res);

    return res;
}
/* sqrt(x) (128 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
static gmx_inline __m128
gmx_mm_sqrt_ps(__m128 x)
{
    __m128 mask;
    __m128 res;

    mask = _mm_cmpeq_ps(x,_mm_setzero_ps());
    res  = _mm_andnot_ps(mask,gmx_mm_invsqrt_ps(x));

    res  = _mm_mul_ps(x,res);

    return res;
}
/* 1.0/x, 256-bit wide */
static gmx_inline __m256
gmx_mm256_inv_ps(__m256 x)
{
    const __m256 two = _mm256_set1_ps(2.0f);

    __m256 lu = _mm256_rcp_ps(x);

    return _mm256_mul_ps(lu,_mm256_sub_ps(two,_mm256_mul_ps(lu,x)));
}
/* 1.0/x, 128-bit wide */
static gmx_inline __m128
gmx_mm_inv_ps(__m128 x)
{
    const __m128 two = _mm_set1_ps(2.0f);

    __m128 lu = _mm_rcp_ps(x);

    return _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,x)));
}
static gmx_inline __m256
gmx_mm256_abs_ps(__m256 x)
{
    const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );

    return _mm256_and_ps(x,signmask);
}
static gmx_inline __m128
gmx_mm_abs_ps(__m128 x)
{
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );

    return _mm_and_ps(x,signmask);
}
static gmx_inline __m256
gmx_mm256_log_ps(__m256 x)
{
    const __m256  expmask    = _mm256_castsi256_ps( _mm256_set1_epi32(0x7F800000) );
    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
    const __m256  half       = _mm256_set1_ps(0.5f);
    const __m256  one        = _mm256_set1_ps(1.0f);
    const __m256  invsq2     = _mm256_set1_ps(1.0f/sqrt(2.0f));
    const __m256  corr1      = _mm256_set1_ps(-2.12194440e-4f);
    const __m256  corr2      = _mm256_set1_ps(0.693359375f);

    const __m256  CA_1       = _mm256_set1_ps(0.070376836292f);
    const __m256  CB_0       = _mm256_set1_ps(1.6714950086782716f);
    const __m256  CB_1       = _mm256_set1_ps(-2.452088066061482f);
    const __m256  CC_0       = _mm256_set1_ps(1.5220770854701728f);
    const __m256  CC_1       = _mm256_set1_ps(-1.3422238433233642f);
    const __m256  CD_0       = _mm256_set1_ps(1.386218787509749f);
    const __m256  CD_1       = _mm256_set1_ps(0.35075468953796346f);
    const __m256  CE_0       = _mm256_set1_ps(1.3429983063133937f);
    const __m256  CE_1       = _mm256_set1_ps(1.807420826584643f);

    __m256        fexp,mask;
    __m256i       iexp,imask;
    __m128i       iexp128a,iexp128b;
    __m128i       imask128a,imask128b;
    __m256        x2,y;
    __m256        pA,pB,pC,pD,pE,tB,tC,tD,tE;

    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
    fexp     = _mm256_and_ps(x,expmask);
    iexp     = _mm256_castps_si256(fexp);

    iexp128b = _mm256_extractf128_si256(iexp,0x1);
    iexp128a = _mm256_castsi256_si128(iexp);

    iexp128a = _mm_srli_epi32(iexp128a,23);
    iexp128b = _mm_srli_epi32(iexp128b,23);
    iexp128a = _mm_sub_epi32(iexp128a,expbase_m1);
    iexp128b = _mm_sub_epi32(iexp128b,expbase_m1);

    x        = _mm256_andnot_ps(expmask,x);
    x        = _mm256_or_ps(x,one);
    x        = _mm256_mul_ps(x,half);

    mask     = _mm256_cmp_ps(x,invsq2,_CMP_LT_OQ);

    x        = _mm256_add_ps(x,_mm256_and_ps(mask,x));
    x        = _mm256_sub_ps(x,one);

    imask    = _mm256_castps_si256(mask);

    imask128b = _mm256_extractf128_si256(imask,0x1);
    imask128a = _mm256_castsi256_si128(imask);

    iexp128a  = _mm_add_epi32(iexp128a,imask128a); /* 0xFFFFFFFF = -1 as int */
    iexp128b  = _mm_add_epi32(iexp128b,imask128b);

    iexp      = _mm256_castsi128_si256(iexp128a);
    iexp      = _mm256_insertf128_si256(iexp,iexp128b,0x1);

    x2       = _mm256_mul_ps(x,x);

    pA       = _mm256_mul_ps(CA_1,x);
    pB       = _mm256_mul_ps(CB_1,x);
    pC       = _mm256_mul_ps(CC_1,x);
    pD       = _mm256_mul_ps(CD_1,x);
    pE       = _mm256_mul_ps(CE_1,x);
    tB       = _mm256_add_ps(CB_0,x2);
    tC       = _mm256_add_ps(CC_0,x2);
    tD       = _mm256_add_ps(CD_0,x2);
    tE       = _mm256_add_ps(CE_0,x2);
    pB       = _mm256_add_ps(pB,tB);
    pC       = _mm256_add_ps(pC,tC);
    pD       = _mm256_add_ps(pD,tD);
    pE       = _mm256_add_ps(pE,tE);

    pA       = _mm256_mul_ps(pA,pB);
    pC       = _mm256_mul_ps(pC,pD);
    pE       = _mm256_mul_ps(pE,x2);
    pA       = _mm256_mul_ps(pA,pC);
    y        = _mm256_mul_ps(pA,pE);

    fexp     = _mm256_cvtepi32_ps(iexp);
    y        = _mm256_add_ps(y,_mm256_mul_ps(fexp,corr1));

    y        = _mm256_sub_ps(y, _mm256_mul_ps(half,x2));
    x2       = _mm256_add_ps(x,y);

    x2       = _mm256_add_ps(x2,_mm256_mul_ps(fexp,corr2));

    return x2;
}
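
/* Scalar sketch of the decomposition above (illustrative only; frexpf()
 * from <math.h> performs the same mantissa/exponent split):
 *
 *   int   e;
 *   float m = frexpf(x,&e);              // x = m * 2^e, with m in [0.5,1)
 *   if (m < (float)(1.0/sqrt(2.0)))
 *   {
 *       m *= 2.0f;                       // keep the polynomial argument m-1
 *       e -= 1;                          // small and roughly symmetric
 *   }
 *   // log(x) = e*log(2) + log(m), with log(m) evaluated via the rational
 *   // polynomial in CA_1..CE_1 above, and e*log(2) added in two parts
 *   // (corr2 and corr1) for accuracy.
 */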
static gmx_inline __m128
gmx_mm_log_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7F800000) );
    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
    const __m128  half       = _mm_set1_ps(0.5f);
    const __m128  one        = _mm_set1_ps(1.0f);
    const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
    const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
    const __m128  corr2      = _mm_set1_ps(0.693359375f);

    const __m128  CA_1       = _mm_set1_ps(0.070376836292f);
    const __m128  CB_0       = _mm_set1_ps(1.6714950086782716f);
    const __m128  CB_1       = _mm_set1_ps(-2.452088066061482f);
    const __m128  CC_0       = _mm_set1_ps(1.5220770854701728f);
    const __m128  CC_1       = _mm_set1_ps(-1.3422238433233642f);
    const __m128  CD_0       = _mm_set1_ps(1.386218787509749f);
    const __m128  CD_1       = _mm_set1_ps(0.35075468953796346f);
    const __m128  CE_0       = _mm_set1_ps(1.3429983063133937f);
    const __m128  CE_1       = _mm_set1_ps(1.807420826584643f);

    __m128        fexp,mask;
    __m128i       iexp;
    __m128        x2,y;
    __m128        pA,pB,pC,pD,pE,tB,tC,tD,tE;

    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
    fexp  = _mm_and_ps(x,expmask);
    iexp  = gmx_mm_castps_si128(fexp);
    iexp  = _mm_srli_epi32(iexp,23);
    iexp  = _mm_sub_epi32(iexp,expbase_m1);

    x     = _mm_andnot_ps(expmask,x);
    x     = _mm_or_ps(x,one);
    x     = _mm_mul_ps(x,half);

    mask  = _mm_cmplt_ps(x,invsq2);

    x     = _mm_add_ps(x,_mm_and_ps(mask,x));
    x     = _mm_sub_ps(x,one);
    iexp  = _mm_add_epi32(iexp,gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */

    x2    = _mm_mul_ps(x,x);

    pA    = _mm_mul_ps(CA_1,x);
    pB    = _mm_mul_ps(CB_1,x);
    pC    = _mm_mul_ps(CC_1,x);
    pD    = _mm_mul_ps(CD_1,x);
    pE    = _mm_mul_ps(CE_1,x);
    tB    = _mm_add_ps(CB_0,x2);
    tC    = _mm_add_ps(CC_0,x2);
    tD    = _mm_add_ps(CD_0,x2);
    tE    = _mm_add_ps(CE_0,x2);
    pB    = _mm_add_ps(pB,tB);
    pC    = _mm_add_ps(pC,tC);
    pD    = _mm_add_ps(pD,tD);
    pE    = _mm_add_ps(pE,tE);

    pA    = _mm_mul_ps(pA,pB);
    pC    = _mm_mul_ps(pC,pD);
    pE    = _mm_mul_ps(pE,x2);
    pA    = _mm_mul_ps(pA,pC);
    y     = _mm_mul_ps(pA,pE);

    fexp  = _mm_cvtepi32_ps(iexp);
    y     = _mm_add_ps(y,_mm_mul_ps(fexp,corr1));

    y     = _mm_sub_ps(y, _mm_mul_ps(half,x2));
    x2    = _mm_add_ps(x,y);

    x2    = _mm_add_ps(x2,_mm_mul_ps(fexp,corr2));

    return x2;
}
/* 2^x function, 256-bit wide
 *
 * The 2^w term is calculated from a (6,0)-th order (no denominator) minimax polynomial on the interval
 * [-0.5,0.5]. The coefficients were derived in Mathematica using the command:
 *
 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 *
 * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 * number instead. That would take a few extra cycles and not really help, since something is
 * wrong if you are using single precision to work with numbers that cannot really be represented
 * in single precision.
 *
 * The accuracy is at least 23 bits.
 */
static gmx_inline __m256
gmx_mm256_exp2_ps(__m256 x)
{
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m256  arglimit = _mm256_set1_ps(126.0f);

    const __m128i expbase  = _mm_set1_epi32(127);
    const __m256  CC6      = _mm256_set1_ps(1.535336188319500E-004);
    const __m256  CC5      = _mm256_set1_ps(1.339887440266574E-003);
    const __m256  CC4      = _mm256_set1_ps(9.618437357674640E-003);
    const __m256  CC3      = _mm256_set1_ps(5.550332471162809E-002);
    const __m256  CC2      = _mm256_set1_ps(2.402264791363012E-001);
    const __m256  CC1      = _mm256_set1_ps(6.931472028550421E-001);
    const __m256  CC0      = _mm256_set1_ps(1.0f);

    __m256i       iexppart;
    __m128i       iexppart128a,iexppart128b;
    __m256        fexppart,intpart,valuemask;
    __m256        x2;
    __m256        p0,p1;

    iexppart  = _mm256_cvtps_epi32(x);
    intpart   = _mm256_round_ps(x,_MM_FROUND_TO_NEAREST_INT);

    iexppart128b = _mm256_extractf128_si256(iexppart,0x1);
    iexppart128a = _mm256_castsi256_si128(iexppart);

    iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a,expbase),23);
    iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b,expbase),23);

    iexppart  = _mm256_castsi128_si256(iexppart128a);
    iexppart  = _mm256_insertf128_si256(iexppart,iexppart128b,0x1);
    valuemask = _mm256_cmp_ps(arglimit,gmx_mm256_abs_ps(x),_CMP_GE_OQ);
    fexppart  = _mm256_and_ps(valuemask,_mm256_castsi256_ps(iexppart));

    x         = _mm256_sub_ps(x,intpart);
    x2        = _mm256_mul_ps(x,x);

    p0        = _mm256_mul_ps(CC6,x2);
    p1        = _mm256_mul_ps(CC5,x2);
    p0        = _mm256_add_ps(p0,CC4);
    p1        = _mm256_add_ps(p1,CC3);
    p0        = _mm256_mul_ps(p0,x2);
    p1        = _mm256_mul_ps(p1,x2);
    p0        = _mm256_add_ps(p0,CC2);
    p1        = _mm256_add_ps(p1,CC1);
    p0        = _mm256_mul_ps(p0,x2);
    p1        = _mm256_mul_ps(p1,x);
    p0        = _mm256_add_ps(p0,CC0);
    p0        = _mm256_add_ps(p0,p1);
    x         = _mm256_mul_ps(p0,fexppart);

    return x;
}
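
/* Scalar sketch of the exponent construction used above (illustrative):
 * adding the IEEE bias (127) and shifting into the exponent field builds
 * the float 2^n directly from the integer n:
 *
 *   union { int i; float f; } u;
 *   u.i = (n + 127) << 23;    // valid for -126 <= n <= 127
 *   // u.f == 2^n
 */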
/* 2^x, 128 bit wide */
static gmx_inline __m128
gmx_mm_exp2_ps(__m128 x)
{
    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
    const __m128  arglimit = _mm_set1_ps(126.0f);

    const __m128i expbase  = _mm_set1_epi32(127);
    const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
    const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
    const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
    const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
    const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
    const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
    const __m128  CA0      = _mm_set1_ps(1.0f);

    __m128i       iexppart;
    __m128        fexppart,intpart,valuemask;
    __m128        x2;
    __m128        p0,p1;

    iexppart  = _mm_cvtps_epi32(x);
    intpart   = _mm_round_ps(x,_MM_FROUND_TO_NEAREST_INT);
    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
    valuemask = _mm_cmpge_ps(arglimit,gmx_mm_abs_ps(x));
    fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));

    x         = _mm_sub_ps(x,intpart);
    x2        = _mm_mul_ps(x,x);

    p0        = _mm_mul_ps(CA6,x2);
    p1        = _mm_mul_ps(CA5,x2);
    p0        = _mm_add_ps(p0,CA4);
    p1        = _mm_add_ps(p1,CA3);
    p0        = _mm_mul_ps(p0,x2);
    p1        = _mm_mul_ps(p1,x2);
    p0        = _mm_add_ps(p0,CA2);
    p1        = _mm_add_ps(p1,CA1);
    p0        = _mm_mul_ps(p0,x2);
    p1        = _mm_mul_ps(p1,x);
    p0        = _mm_add_ps(p0,CA0);
    p0        = _mm_add_ps(p0,p1);
    x         = _mm_mul_ps(p0,fexppart);

    return x;
}
/* Exponential function, 256 bit wide. This could be calculated from 2^x as Exp(x)=2^(y),
 * where y=log2(e)*x, but there will then be a small rounding error since we lose some
 * precision due to the multiplication. This will then be magnified a lot by the exponential.
 *
 * Instead, we calculate the fractional part directly as a minimax approximation of
 * Exp(z) on [-0.5,0.5]. We use extended precision arithmetic to calculate the fraction
 * remaining after 2^y, which avoids the precision loss.
 * The final result is correct to within 1 LSB over the entire argument range.
 */
static gmx_inline __m256
gmx_mm256_exp_ps(__m256 exparg)
{
    const __m256  argscale     = _mm256_set1_ps(1.44269504088896341f);
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m256  arglimit     = _mm256_set1_ps(126.0f);
    const __m128i expbase      = _mm_set1_epi32(127);

    const __m256  invargscale0 = _mm256_set1_ps(0.693359375f);
    const __m256  invargscale1 = _mm256_set1_ps(-2.12194440e-4f);

    const __m256  CE5          = _mm256_set1_ps(1.9875691500e-4f);
    const __m256  CE4          = _mm256_set1_ps(1.3981999507e-3f);
    const __m256  CE3          = _mm256_set1_ps(8.3334519073e-3f);
    const __m256  CE2          = _mm256_set1_ps(4.1665795894e-2f);
    const __m256  CE1          = _mm256_set1_ps(1.6666665459e-1f);
    const __m256  CE0          = _mm256_set1_ps(5.0000001201e-1f);
    const __m256  one          = _mm256_set1_ps(1.0f);

    __m256        exparg2,exp2arg;
    __m256i       iexppart;
    __m128i       iexppart128a,iexppart128b;
    __m256        fexppart,intpart,valuemask;
    __m256        pE0,pE1;

    exp2arg = _mm256_mul_ps(exparg,argscale);

    iexppart = _mm256_cvtps_epi32(exp2arg);
    intpart  = _mm256_round_ps(exp2arg,_MM_FROUND_TO_NEAREST_INT);

    iexppart128b = _mm256_extractf128_si256(iexppart,0x1);
    iexppart128a = _mm256_castsi256_si128(iexppart);

    iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a,expbase),23);
    iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b,expbase),23);

    iexppart  = _mm256_castsi128_si256(iexppart128a);
    iexppart  = _mm256_insertf128_si256(iexppart,iexppart128b,0x1);
    valuemask = _mm256_cmp_ps(arglimit,gmx_mm256_abs_ps(exp2arg),_CMP_GE_OQ);
    fexppart  = _mm256_and_ps(valuemask,_mm256_castsi256_ps(iexppart));

    /* Extended precision arithmetic */
    exparg  = _mm256_sub_ps(exparg,_mm256_mul_ps(invargscale0,intpart));
    exparg  = _mm256_sub_ps(exparg,_mm256_mul_ps(invargscale1,intpart));

    exparg2 = _mm256_mul_ps(exparg,exparg);

    pE1     = _mm256_mul_ps(CE5,exparg2);
    pE0     = _mm256_mul_ps(CE4,exparg2);
    pE1     = _mm256_add_ps(pE1,CE3);
    pE0     = _mm256_add_ps(pE0,CE2);
    pE1     = _mm256_mul_ps(pE1,exparg2);
    pE0     = _mm256_mul_ps(pE0,exparg2);
    pE1     = _mm256_add_ps(pE1,CE1);
    pE0     = _mm256_add_ps(pE0,CE0);
    pE1     = _mm256_mul_ps(pE1,exparg);
    pE0     = _mm256_add_ps(pE0,pE1);
    pE0     = _mm256_mul_ps(pE0,exparg2);
    exparg  = _mm256_add_ps(exparg,one);
    exparg  = _mm256_add_ps(exparg,pE0);

    exparg  = _mm256_mul_ps(exparg,fexppart);

    return exparg;
}
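
/* Scalar sketch of the extended-precision argument reduction above
 * (illustrative only): ln(2) is split into an exactly representable
 * high part (0.693359375) and a small correction (-2.12194440e-4), so
 * the remainder survives the subtraction with little rounding error:
 *
 *   float n = rintf(x*1.44269504088896341f);  // nearest integer to x/ln(2)
 *   float r = x - n*0.693359375f;
 *   r       = r - n*(-2.12194440e-4f);
 *   // exp(x) = 2^n * exp(r), with exp(r) from the CE0..CE5 polynomial.
 */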
/* exp(), 128 bit wide. */
static gmx_inline __m128
gmx_mm_exp_ps(__m128 x)
{
    const __m128  argscale     = _mm_set1_ps(1.44269504088896341f);
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m128  arglimit     = _mm_set1_ps(126.0f);
    const __m128i expbase      = _mm_set1_epi32(127);

    const __m128  invargscale0 = _mm_set1_ps(0.693359375f);
    const __m128  invargscale1 = _mm_set1_ps(-2.12194440e-4f);

    const __m128  CC5          = _mm_set1_ps(1.9875691500e-4f);
    const __m128  CC4          = _mm_set1_ps(1.3981999507e-3f);
    const __m128  CC3          = _mm_set1_ps(8.3334519073e-3f);
    const __m128  CC2          = _mm_set1_ps(4.1665795894e-2f);
    const __m128  CC1          = _mm_set1_ps(1.6666665459e-1f);
    const __m128  CC0          = _mm_set1_ps(5.0000001201e-1f);
    const __m128  one          = _mm_set1_ps(1.0f);

    __m128i       iexppart;
    __m128        fexppart,intpart,valuemask;
    __m128        y,x2;
    __m128        p0,p1;

    y = _mm_mul_ps(x,argscale);

    iexppart  = _mm_cvtps_epi32(y);
    intpart   = _mm_round_ps(y,_MM_FROUND_TO_NEAREST_INT);

    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
    valuemask = _mm_cmpge_ps(arglimit,gmx_mm_abs_ps(y));
    fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));

    /* Extended precision arithmetic */
    x  = _mm_sub_ps(x,_mm_mul_ps(invargscale0,intpart));
    x  = _mm_sub_ps(x,_mm_mul_ps(invargscale1,intpart));

    x2 = _mm_mul_ps(x,x);

    p1 = _mm_mul_ps(CC5,x2);
    p0 = _mm_mul_ps(CC4,x2);
    p1 = _mm_add_ps(p1,CC3);
    p0 = _mm_add_ps(p0,CC2);
    p1 = _mm_mul_ps(p1,x2);
    p0 = _mm_mul_ps(p0,x2);
    p1 = _mm_add_ps(p1,CC1);
    p0 = _mm_add_ps(p0,CC0);
    p1 = _mm_mul_ps(p1,x);
    p0 = _mm_add_ps(p0,p1);
    p0 = _mm_mul_ps(p0,x2);
    x  = _mm_add_ps(x,one);
    x  = _mm_add_ps(x,p0);

    x  = _mm_mul_ps(x,fexppart);

    return x;
}
/* FULL precision erf(), 256-bit wide. Only errors in LSB */
static __m256
gmx_mm256_erf_ps(__m256 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m256  CA6      = _mm256_set1_ps(7.853861353153693e-5f);
    const __m256  CA5      = _mm256_set1_ps(-8.010193625184903e-4f);
    const __m256  CA4      = _mm256_set1_ps(5.188327685732524e-3f);
    const __m256  CA3      = _mm256_set1_ps(-2.685381193529856e-2f);
    const __m256  CA2      = _mm256_set1_ps(1.128358514861418e-1f);
    const __m256  CA1      = _mm256_set1_ps(-3.761262582423300e-1f);
    const __m256  CA0      = _mm256_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m256  CB9      = _mm256_set1_ps(-0.0018629930017603923f);
    const __m256  CB8      = _mm256_set1_ps(0.003909821287598495f);
    const __m256  CB7      = _mm256_set1_ps(-0.0052094582210355615f);
    const __m256  CB6      = _mm256_set1_ps(0.005685614362160572f);
    const __m256  CB5      = _mm256_set1_ps(-0.0025367682853477272f);
    const __m256  CB4      = _mm256_set1_ps(-0.010199799682318782f);
    const __m256  CB3      = _mm256_set1_ps(0.04369575504816542f);
    const __m256  CB2      = _mm256_set1_ps(-0.11884063474674492f);
    const __m256  CB1      = _mm256_set1_ps(0.2732120154030589f);
    const __m256  CB0      = _mm256_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m256  CC10     = _mm256_set1_ps(-0.0445555913112064f);
    const __m256  CC9      = _mm256_set1_ps(0.21376355144663348f);
    const __m256  CC8      = _mm256_set1_ps(-0.3473187200259257f);
    const __m256  CC7      = _mm256_set1_ps(0.016690861551248114f);
    const __m256  CC6      = _mm256_set1_ps(0.7560973182491192f);
    const __m256  CC5      = _mm256_set1_ps(-1.2137903600145787f);
    const __m256  CC4      = _mm256_set1_ps(0.8411872321232948f);
    const __m256  CC3      = _mm256_set1_ps(-0.08670413896296343f);
    const __m256  CC2      = _mm256_set1_ps(-0.27124782687240334f);
    const __m256  CC1      = _mm256_set1_ps(-0.0007502488047806069f);
    const __m256  CC0      = _mm256_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m256  CD2      = _mm256_set1_ps(0.5000066608081202f);
    const __m256  CD3      = _mm256_set1_ps(0.1664795422874624f);
    const __m256  CD4      = _mm256_set1_ps(0.04379839977652482f);

    const __m256  sieve    = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
    const __m256  signbit  = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
    const __m256  one      = _mm256_set1_ps(1.0f);
    const __m256  two      = _mm256_set1_ps(2.0f);

    __m256 x2,x4,y;
    __m256 z,q,t,t2,w,w2;
    __m256 pA0,pA1,pB0,pB1,pC0,pC1;
    __m256 expmx2,corr;
    __m256 res_erf,res_erfc,res;
    __m256 mask;

    /* Calculate erf() */
    x2   = _mm256_mul_ps(x,x);
    x4   = _mm256_mul_ps(x2,x2);

    pA0  = _mm256_mul_ps(CA6,x4);
    pA1  = _mm256_mul_ps(CA5,x4);
    pA0  = _mm256_add_ps(pA0,CA4);
    pA1  = _mm256_add_ps(pA1,CA3);
    pA0  = _mm256_mul_ps(pA0,x4);
    pA1  = _mm256_mul_ps(pA1,x4);
    pA0  = _mm256_add_ps(pA0,CA2);
    pA1  = _mm256_add_ps(pA1,CA1);
    pA0  = _mm256_mul_ps(pA0,x4);
    pA1  = _mm256_mul_ps(pA1,x2);
    pA0  = _mm256_add_ps(pA0,pA1);
    pA0  = _mm256_add_ps(pA0,CA0);

    res_erf = _mm256_mul_ps(x,pA0);

    /* Calculate erfc */
    y    = gmx_mm256_abs_ps(x);
    t    = gmx_mm256_inv_ps(y);
    w    = _mm256_sub_ps(t,one);
    t2   = _mm256_mul_ps(t,t);
    w2   = _mm256_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */
    z    = _mm256_and_ps(y,sieve);
    q    = _mm256_mul_ps( _mm256_sub_ps(z,y) , _mm256_add_ps(z,y) );

    corr = _mm256_mul_ps(CD4,q);
    corr = _mm256_add_ps(corr,CD3);
    corr = _mm256_mul_ps(corr,q);
    corr = _mm256_add_ps(corr,CD2);
    corr = _mm256_mul_ps(corr,q);
    corr = _mm256_add_ps(corr,one);
    corr = _mm256_mul_ps(corr,q);
    corr = _mm256_add_ps(corr,one);

    expmx2 = gmx_mm256_exp_ps( _mm256_or_ps( signbit , _mm256_mul_ps(z,z) ) );
    expmx2 = _mm256_mul_ps(expmx2,corr);

    pB1  = _mm256_mul_ps(CB9,w2);
    pB0  = _mm256_mul_ps(CB8,w2);
    pB1  = _mm256_add_ps(pB1,CB7);
    pB0  = _mm256_add_ps(pB0,CB6);
    pB1  = _mm256_mul_ps(pB1,w2);
    pB0  = _mm256_mul_ps(pB0,w2);
    pB1  = _mm256_add_ps(pB1,CB5);
    pB0  = _mm256_add_ps(pB0,CB4);
    pB1  = _mm256_mul_ps(pB1,w2);
    pB0  = _mm256_mul_ps(pB0,w2);
    pB1  = _mm256_add_ps(pB1,CB3);
    pB0  = _mm256_add_ps(pB0,CB2);
    pB1  = _mm256_mul_ps(pB1,w2);
    pB0  = _mm256_mul_ps(pB0,w2);
    pB1  = _mm256_add_ps(pB1,CB1);
    pB1  = _mm256_mul_ps(pB1,w);
    pB0  = _mm256_add_ps(pB0,pB1);
    pB0  = _mm256_add_ps(pB0,CB0);

    pC0  = _mm256_mul_ps(CC10,t2);
    pC1  = _mm256_mul_ps(CC9,t2);
    pC0  = _mm256_add_ps(pC0,CC8);
    pC1  = _mm256_add_ps(pC1,CC7);
    pC0  = _mm256_mul_ps(pC0,t2);
    pC1  = _mm256_mul_ps(pC1,t2);
    pC0  = _mm256_add_ps(pC0,CC6);
    pC1  = _mm256_add_ps(pC1,CC5);
    pC0  = _mm256_mul_ps(pC0,t2);
    pC1  = _mm256_mul_ps(pC1,t2);
    pC0  = _mm256_add_ps(pC0,CC4);
    pC1  = _mm256_add_ps(pC1,CC3);
    pC0  = _mm256_mul_ps(pC0,t2);
    pC1  = _mm256_mul_ps(pC1,t2);
    pC0  = _mm256_add_ps(pC0,CC2);
    pC1  = _mm256_add_ps(pC1,CC1);
    pC0  = _mm256_mul_ps(pC0,t2);
    pC1  = _mm256_mul_ps(pC1,t);
    pC0  = _mm256_add_ps(pC0,pC1);
    pC0  = _mm256_add_ps(pC0,CC0);
    pC0  = _mm256_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm256_cmp_ps(two,y,_CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm256_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(res_erfc,_mm256_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm256_cmp_ps(y,_mm256_set1_ps(0.75f),_CMP_LT_OQ);
    res  = _mm256_blendv_ps(_mm256_sub_ps(one,res_erfc),res_erf,mask);

    return res;
}
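
/* Scalar sketch of the "sieve" split above (illustrative only):
 *
 *   union { unsigned int i; float f; } u;
 *   u.f  = y;
 *   u.i &= 0xfffff000u;        // z = y with the low 12 mantissa bits cleared
 *   float z = u.f;
 *   float q = (z-y)*(z+y);     // q = z^2 - y^2, small by construction
 *   // exp(-y^2) = exp(-z^2)*exp(q); z*z is exactly representable since z
 *   // carries at most 12 significant mantissa bits, and exp(q) comes from
 *   // the short CD series.
 */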
/* erf(), 128 bit wide */
static __m128
gmx_mm_erf_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);

    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128  one      = _mm_set1_ps(1.0f);
    const __m128  two      = _mm_set1_ps(2.0f);

    __m128 x2,x4,y;
    __m128 z,q,t,t2,w,w2;
    __m128 pA0,pA1,pB0,pB1,pC0,pC1;
    __m128 expmx2,corr;
    __m128 res_erf,res_erfc,res;
    __m128 mask;

    /* Calculate erf() */
    x2   = _mm_mul_ps(x,x);
    x4   = _mm_mul_ps(x2,x2);

    pA0  = _mm_mul_ps(CA6,x4);
    pA1  = _mm_mul_ps(CA5,x4);
    pA0  = _mm_add_ps(pA0,CA4);
    pA1  = _mm_add_ps(pA1,CA3);
    pA0  = _mm_mul_ps(pA0,x4);
    pA1  = _mm_mul_ps(pA1,x4);
    pA0  = _mm_add_ps(pA0,CA2);
    pA1  = _mm_add_ps(pA1,CA1);
    pA0  = _mm_mul_ps(pA0,x4);
    pA1  = _mm_mul_ps(pA1,x2);
    pA0  = _mm_add_ps(pA0,pA1);
    pA0  = _mm_add_ps(pA0,CA0);

    res_erf = _mm_mul_ps(x,pA0);

    /* Calculate erfc */
    y    = gmx_mm_abs_ps(x);
    t    = gmx_mm_inv_ps(y);
    w    = _mm_sub_ps(t,one);
    t2   = _mm_mul_ps(t,t);
    w2   = _mm_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */
    z    = _mm_and_ps(y,sieve);
    q    = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );

    corr = _mm_mul_ps(CD4,q);
    corr = _mm_add_ps(corr,CD3);
    corr = _mm_mul_ps(corr,q);
    corr = _mm_add_ps(corr,CD2);
    corr = _mm_mul_ps(corr,q);
    corr = _mm_add_ps(corr,one);
    corr = _mm_mul_ps(corr,q);
    corr = _mm_add_ps(corr,one);

    expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
    expmx2 = _mm_mul_ps(expmx2,corr);

    pB1  = _mm_mul_ps(CB9,w2);
    pB0  = _mm_mul_ps(CB8,w2);
    pB1  = _mm_add_ps(pB1,CB7);
    pB0  = _mm_add_ps(pB0,CB6);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB5);
    pB0  = _mm_add_ps(pB0,CB4);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB3);
    pB0  = _mm_add_ps(pB0,CB2);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB1);
    pB1  = _mm_mul_ps(pB1,w);
    pB0  = _mm_add_ps(pB0,pB1);
    pB0  = _mm_add_ps(pB0,CB0);

    pC0  = _mm_mul_ps(CC10,t2);
    pC1  = _mm_mul_ps(CC9,t2);
    pC0  = _mm_add_ps(pC0,CC8);
    pC1  = _mm_add_ps(pC1,CC7);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC6);
    pC1  = _mm_add_ps(pC1,CC5);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC4);
    pC1  = _mm_add_ps(pC1,CC3);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC2);
    pC1  = _mm_add_ps(pC1,CC1);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t);
    pC0  = _mm_add_ps(pC0,pC1);
    pC0  = _mm_add_ps(pC0,CC0);
    pC0  = _mm_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmplt_ps(two,y);
    res_erfc = _mm_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmplt_ps(x,_mm_setzero_ps());
    res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y,_mm_set1_ps(0.75f));
    res  = _mm_blendv_ps(_mm_sub_ps(one,res_erfc),res_erf,mask);

    return res;
}
/* FULL precision erfc(), 256 bit wide. Only errors in LSB */
static __m256
gmx_mm256_erfc_ps(__m256 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m256  CA6      = _mm256_set1_ps(7.853861353153693e-5f);
    const __m256  CA5      = _mm256_set1_ps(-8.010193625184903e-4f);
    const __m256  CA4      = _mm256_set1_ps(5.188327685732524e-3f);
    const __m256  CA3      = _mm256_set1_ps(-2.685381193529856e-2f);
    const __m256  CA2      = _mm256_set1_ps(1.128358514861418e-1f);
    const __m256  CA1      = _mm256_set1_ps(-3.761262582423300e-1f);
    const __m256  CA0      = _mm256_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m256  CB9      = _mm256_set1_ps(-0.0018629930017603923f);
    const __m256  CB8      = _mm256_set1_ps(0.003909821287598495f);
    const __m256  CB7      = _mm256_set1_ps(-0.0052094582210355615f);
    const __m256  CB6      = _mm256_set1_ps(0.005685614362160572f);
    const __m256  CB5      = _mm256_set1_ps(-0.0025367682853477272f);
    const __m256  CB4      = _mm256_set1_ps(-0.010199799682318782f);
    const __m256  CB3      = _mm256_set1_ps(0.04369575504816542f);
    const __m256  CB2      = _mm256_set1_ps(-0.11884063474674492f);
    const __m256  CB1      = _mm256_set1_ps(0.2732120154030589f);
    const __m256  CB0      = _mm256_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m256  CC10     = _mm256_set1_ps(-0.0445555913112064f);
    const __m256  CC9      = _mm256_set1_ps(0.21376355144663348f);
    const __m256  CC8      = _mm256_set1_ps(-0.3473187200259257f);
    const __m256  CC7      = _mm256_set1_ps(0.016690861551248114f);
    const __m256  CC6      = _mm256_set1_ps(0.7560973182491192f);
    const __m256  CC5      = _mm256_set1_ps(-1.2137903600145787f);
    const __m256  CC4      = _mm256_set1_ps(0.8411872321232948f);
    const __m256  CC3      = _mm256_set1_ps(-0.08670413896296343f);
    const __m256  CC2      = _mm256_set1_ps(-0.27124782687240334f);
    const __m256  CC1      = _mm256_set1_ps(-0.0007502488047806069f);
    const __m256  CC0      = _mm256_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m256  CD2      = _mm256_set1_ps(0.5000066608081202f);
    const __m256  CD3      = _mm256_set1_ps(0.1664795422874624f);
    const __m256  CD4      = _mm256_set1_ps(0.04379839977652482f);

    const __m256  sieve    = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
    const __m256  signbit  = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
    const __m256  one      = _mm256_set1_ps(1.0f);
    const __m256  two      = _mm256_set1_ps(2.0f);

    __m256 x2,x4,y;
    __m256 z,q,t,t2,w,w2;
    __m256 pA0,pA1,pB0,pB1,pC0,pC1;
    __m256 expmx2,corr;
    __m256 res_erf,res_erfc,res;
    __m256 mask;

    /* Calculate erf() */
    x2   = _mm256_mul_ps(x,x);
    x4   = _mm256_mul_ps(x2,x2);

    pA0  = _mm256_mul_ps(CA6,x4);
    pA1  = _mm256_mul_ps(CA5,x4);
    pA0  = _mm256_add_ps(pA0,CA4);
    pA1  = _mm256_add_ps(pA1,CA3);
    pA0  = _mm256_mul_ps(pA0,x4);
    pA1  = _mm256_mul_ps(pA1,x4);
    pA0  = _mm256_add_ps(pA0,CA2);
    pA1  = _mm256_add_ps(pA1,CA1);
    pA0  = _mm256_mul_ps(pA0,x4);
    pA1  = _mm256_mul_ps(pA1,x2);
    pA0  = _mm256_add_ps(pA0,pA1);
    pA0  = _mm256_add_ps(pA0,CA0);

    res_erf = _mm256_mul_ps(x,pA0);

    /* Calculate erfc */
    y    = gmx_mm256_abs_ps(x);
    t    = gmx_mm256_inv_ps(y);
    w    = _mm256_sub_ps(t,one);
    t2   = _mm256_mul_ps(t,t);
    w2   = _mm256_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */
    z    = _mm256_and_ps(y,sieve);
    q    = _mm256_mul_ps( _mm256_sub_ps(z,y) , _mm256_add_ps(z,y) );

    corr = _mm256_mul_ps(CD4,q);
    corr = _mm256_add_ps(corr,CD3);
    corr = _mm256_mul_ps(corr,q);
    corr = _mm256_add_ps(corr,CD2);
    corr = _mm256_mul_ps(corr,q);
    corr = _mm256_add_ps(corr,one);
    corr = _mm256_mul_ps(corr,q);
    corr = _mm256_add_ps(corr,one);

    expmx2 = gmx_mm256_exp_ps( _mm256_or_ps( signbit , _mm256_mul_ps(z,z) ) );
    expmx2 = _mm256_mul_ps(expmx2,corr);

    pB1  = _mm256_mul_ps(CB9,w2);
    pB0  = _mm256_mul_ps(CB8,w2);
    pB1  = _mm256_add_ps(pB1,CB7);
    pB0  = _mm256_add_ps(pB0,CB6);
    pB1  = _mm256_mul_ps(pB1,w2);
    pB0  = _mm256_mul_ps(pB0,w2);
    pB1  = _mm256_add_ps(pB1,CB5);
    pB0  = _mm256_add_ps(pB0,CB4);
    pB1  = _mm256_mul_ps(pB1,w2);
    pB0  = _mm256_mul_ps(pB0,w2);
    pB1  = _mm256_add_ps(pB1,CB3);
    pB0  = _mm256_add_ps(pB0,CB2);
    pB1  = _mm256_mul_ps(pB1,w2);
    pB0  = _mm256_mul_ps(pB0,w2);
    pB1  = _mm256_add_ps(pB1,CB1);
    pB1  = _mm256_mul_ps(pB1,w);
    pB0  = _mm256_add_ps(pB0,pB1);
    pB0  = _mm256_add_ps(pB0,CB0);

    pC0  = _mm256_mul_ps(CC10,t2);
    pC1  = _mm256_mul_ps(CC9,t2);
    pC0  = _mm256_add_ps(pC0,CC8);
    pC1  = _mm256_add_ps(pC1,CC7);
    pC0  = _mm256_mul_ps(pC0,t2);
    pC1  = _mm256_mul_ps(pC1,t2);
    pC0  = _mm256_add_ps(pC0,CC6);
    pC1  = _mm256_add_ps(pC1,CC5);
    pC0  = _mm256_mul_ps(pC0,t2);
    pC1  = _mm256_mul_ps(pC1,t2);
    pC0  = _mm256_add_ps(pC0,CC4);
    pC1  = _mm256_add_ps(pC1,CC3);
    pC0  = _mm256_mul_ps(pC0,t2);
    pC1  = _mm256_mul_ps(pC1,t2);
    pC0  = _mm256_add_ps(pC0,CC2);
    pC1  = _mm256_add_ps(pC1,CC1);
    pC0  = _mm256_mul_ps(pC0,t2);
    pC1  = _mm256_mul_ps(pC1,t);
    pC0  = _mm256_add_ps(pC0,pC1);
    pC0  = _mm256_add_ps(pC0,CC0);
    pC0  = _mm256_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm256_cmp_ps(two,y,_CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm256_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_LT_OQ);
    res_erfc = _mm256_blendv_ps(res_erfc,_mm256_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm256_cmp_ps(y,_mm256_set1_ps(0.75f),_CMP_LT_OQ);
    res  = _mm256_blendv_ps(res_erfc,_mm256_sub_ps(one,res_erf),mask);

    return res;
}
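
/* Example usage (illustrative sketch; rsq and beta are hypothetical
 * inputs): the direct-space Ewald/PME potential factor erfc(beta*r)/r
 * for eight packed squared distances:
 *
 *   __m256 rinv  = gmx_mm256_invsqrt_ps(rsq);
 *   __m256 r     = _mm256_mul_ps(rsq,rinv);
 *   __m256 velec = _mm256_mul_ps(gmx_mm256_erfc_ps(_mm256_mul_ps(beta,r)),rinv);
 */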
/* erfc(), 128 bit wide */
static __m128
gmx_mm_erfc_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);

    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128  one      = _mm_set1_ps(1.0f);
    const __m128  two      = _mm_set1_ps(2.0f);

    __m128 x2,x4,y;
    __m128 z,q,t,t2,w,w2;
    __m128 pA0,pA1,pB0,pB1,pC0,pC1;
    __m128 expmx2,corr;
    __m128 res_erf,res_erfc,res;
    __m128 mask;

    /* Calculate erf() */
    x2   = _mm_mul_ps(x,x);
    x4   = _mm_mul_ps(x2,x2);

    pA0  = _mm_mul_ps(CA6,x4);
    pA1  = _mm_mul_ps(CA5,x4);
    pA0  = _mm_add_ps(pA0,CA4);
    pA1  = _mm_add_ps(pA1,CA3);
    pA0  = _mm_mul_ps(pA0,x4);
    pA1  = _mm_mul_ps(pA1,x4);
    pA0  = _mm_add_ps(pA0,CA2);
    pA1  = _mm_add_ps(pA1,CA1);
    pA0  = _mm_mul_ps(pA0,x4);
    pA1  = _mm_mul_ps(pA1,x2);
    pA0  = _mm_add_ps(pA0,pA1);
    pA0  = _mm_add_ps(pA0,CA0);

    res_erf = _mm_mul_ps(x,pA0);

    /* Calculate erfc */
    y    = gmx_mm_abs_ps(x);
    t    = gmx_mm_inv_ps(y);
    w    = _mm_sub_ps(t,one);
    t2   = _mm_mul_ps(t,t);
    w2   = _mm_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */
    z    = _mm_and_ps(y,sieve);
    q    = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );

    corr = _mm_mul_ps(CD4,q);
    corr = _mm_add_ps(corr,CD3);
    corr = _mm_mul_ps(corr,q);
    corr = _mm_add_ps(corr,CD2);
    corr = _mm_mul_ps(corr,q);
    corr = _mm_add_ps(corr,one);
    corr = _mm_mul_ps(corr,q);
    corr = _mm_add_ps(corr,one);

    expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
    expmx2 = _mm_mul_ps(expmx2,corr);

    pB1  = _mm_mul_ps(CB9,w2);
    pB0  = _mm_mul_ps(CB8,w2);
    pB1  = _mm_add_ps(pB1,CB7);
    pB0  = _mm_add_ps(pB0,CB6);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB5);
    pB0  = _mm_add_ps(pB0,CB4);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB3);
    pB0  = _mm_add_ps(pB0,CB2);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB1);
    pB1  = _mm_mul_ps(pB1,w);
    pB0  = _mm_add_ps(pB0,pB1);
    pB0  = _mm_add_ps(pB0,CB0);

    pC0  = _mm_mul_ps(CC10,t2);
    pC1  = _mm_mul_ps(CC9,t2);
    pC0  = _mm_add_ps(pC0,CC8);
    pC1  = _mm_add_ps(pC1,CC7);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC6);
    pC1  = _mm_add_ps(pC1,CC5);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC4);
    pC1  = _mm_add_ps(pC1,CC3);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC2);
    pC1  = _mm_add_ps(pC1,CC1);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t);
    pC0  = _mm_add_ps(pC0,pC1);
    pC0  = _mm_add_ps(pC0,CC0);
    pC0  = _mm_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmplt_ps(two,y);
    res_erfc = _mm_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmplt_ps(x,_mm_setzero_ps());
    res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y,_mm_set1_ps(0.75f));
    res  = _mm_blendv_ps(res_erfc,_mm_sub_ps(one,res_erf),mask);

    return res;
}
/* Calculate the force correction due to PME analytically.
 *
 * This routine is meant to enable analytical evaluation of the
 * direct-space PME electrostatic force to avoid tables.
 *
 * The direct-space potential should be Erfc(beta*r)/r, but there
 * are some problems evaluating that:
 *
 * First, the error function is difficult (read: expensive) to
 * approximate accurately for intermediate to large arguments, and
 * this happens already in ranges of beta*r that occur in simulations.
 * Second, we now try to avoid calculating potentials in Gromacs but
 * use forces directly.
 *
 * We can simplify things slightly by noting that the PME part is really
 * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
 *
 * V= 1/r - Erf(beta*r)/r
 *
 * The first term we already have from the inverse square root, so
 * that we can leave out of this routine.
 *
 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
 * the argument beta*r will be in the range 0.15 to ~4. Use your
 * favorite plotting program to realize how well-behaved Erf(z)/z is
 * in this range!
 *
 * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
 * However, it turns out it is more efficient to approximate f(z)/z and
 * then only use even powers. This is another minor optimization, since
 * we actually WANT f(z)/z, because it is going to be multiplied by
 * the vector between the two atoms to get the vectorial force. The
 * fastest flops are the ones we can avoid calculating!
 *
 * So, here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *       2*exp(-z^2)     erf(z)
 *       ------------ - --------
 *       sqrt(Pi)*z^2      z^3
 *
 * 5. Multiply the entire expression by beta^3. This will get you
 *
 *       beta^3*2*exp(-z^2)     beta^3*erf(z)
 *       ------------------  - ---------------
 *       sqrt(Pi)*(beta*r)^2      (beta*r)^3
 *
 *    or, switching back to r (z=r*beta):
 *
 *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
 *       ----------------------- - -----------
 *       sqrt(Pi)*r^2                  r^3
 *
 *    With a bit of math exercise you should be able to confirm that
 *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
 *
 * 6. Add the result to 1/r^3, multiply by the product of the charges,
 *    and you have your force (divided by r). A final multiplication
 *    with the vector connecting the two particles and you have your
 *    vectorial force to add to the particles.
 */
static __m256
gmx_mm256_pmecorrF_ps(__m256 z2)
{
    const __m256  FN6      = _mm256_set1_ps(-1.7357322914161492954e-8f);
    const __m256  FN5      = _mm256_set1_ps(1.4703624142580877519e-6f);
    const __m256  FN4      = _mm256_set1_ps(-0.000053401640219807709149f);
    const __m256  FN3      = _mm256_set1_ps(0.0010054721316683106153f);
    const __m256  FN2      = _mm256_set1_ps(-0.019278317264888380590f);
    const __m256  FN1      = _mm256_set1_ps(0.069670166153766424023f);
    const __m256  FN0      = _mm256_set1_ps(-0.75225204789749321333f);

    const __m256  FD4      = _mm256_set1_ps(0.0011193462567257629232f);
    const __m256  FD3      = _mm256_set1_ps(0.014866955030185295499f);
    const __m256  FD2      = _mm256_set1_ps(0.11583842382862377919f);
    const __m256  FD1      = _mm256_set1_ps(0.50736591960530292870f);
    const __m256  FD0      = _mm256_set1_ps(1.0f);

    __m256 z4;
    __m256 polyFN0,polyFN1,polyFD0,polyFD1;

    z4 = _mm256_mul_ps(z2,z2);

    polyFD0 = _mm256_mul_ps(FD4,z4);
    polyFD1 = _mm256_mul_ps(FD3,z4);
    polyFD0 = _mm256_add_ps(polyFD0,FD2);
    polyFD1 = _mm256_add_ps(polyFD1,FD1);
    polyFD0 = _mm256_mul_ps(polyFD0,z4);
    polyFD1 = _mm256_mul_ps(polyFD1,z2);
    polyFD0 = _mm256_add_ps(polyFD0,FD0);
    polyFD0 = _mm256_add_ps(polyFD0,polyFD1);

    polyFD0 = gmx_mm256_inv_ps(polyFD0);

    polyFN0 = _mm256_mul_ps(FN6,z4);
    polyFN1 = _mm256_mul_ps(FN5,z4);
    polyFN0 = _mm256_add_ps(polyFN0,FN4);
    polyFN1 = _mm256_add_ps(polyFN1,FN3);
    polyFN0 = _mm256_mul_ps(polyFN0,z4);
    polyFN1 = _mm256_mul_ps(polyFN1,z4);
    polyFN0 = _mm256_add_ps(polyFN0,FN2);
    polyFN1 = _mm256_add_ps(polyFN1,FN1);
    polyFN0 = _mm256_mul_ps(polyFN0,z4);
    polyFN1 = _mm256_mul_ps(polyFN1,z2);
    polyFN0 = _mm256_add_ps(polyFN0,FN0);
    polyFN0 = _mm256_add_ps(polyFN0,polyFN1);

    return _mm256_mul_ps(polyFN0,polyFD0);
}
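
/* Example usage (illustrative sketch; rsq, beta2, beta3, rinv and qq are
 * hypothetical inputs, following steps 1-6 in the comment above):
 *
 *   __m256 zsq   = _mm256_mul_ps(beta2,rsq);      // step 2: z^2 = beta^2*r^2
 *   __m256 corr  = gmx_mm256_pmecorrF_ps(zsq);    // steps 3-4
 *   corr         = _mm256_mul_ps(corr,beta3);     // step 5
 *   __m256 rinv3 = _mm256_mul_ps(rinv,_mm256_mul_ps(rinv,rinv));
 *   __m256 fscal = _mm256_mul_ps(qq,_mm256_add_ps(rinv3,corr)); // step 6: force/r
 */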
static __m128
gmx_mm_pmecorrF_ps(__m128 z2)
{
    const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
    const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
    const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
    const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
    const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
    const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
    const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);

    const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
    const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
    const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
    const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
    const __m128  FD0      = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyFN0,polyFN1,polyFD0,polyFD1;

    z4 = _mm_mul_ps(z2,z2);

    polyFD0 = _mm_mul_ps(FD4,z4);
    polyFD1 = _mm_mul_ps(FD3,z4);
    polyFD0 = _mm_add_ps(polyFD0,FD2);
    polyFD1 = _mm_add_ps(polyFD1,FD1);
    polyFD0 = _mm_mul_ps(polyFD0,z4);
    polyFD1 = _mm_mul_ps(polyFD1,z2);
    polyFD0 = _mm_add_ps(polyFD0,FD0);
    polyFD0 = _mm_add_ps(polyFD0,polyFD1);

    polyFD0 = gmx_mm_inv_ps(polyFD0);

    polyFN0 = _mm_mul_ps(FN6,z4);
    polyFN1 = _mm_mul_ps(FN5,z4);
    polyFN0 = _mm_add_ps(polyFN0,FN4);
    polyFN1 = _mm_add_ps(polyFN1,FN3);
    polyFN0 = _mm_mul_ps(polyFN0,z4);
    polyFN1 = _mm_mul_ps(polyFN1,z4);
    polyFN0 = _mm_add_ps(polyFN0,FN2);
    polyFN1 = _mm_add_ps(polyFN1,FN1);
    polyFN0 = _mm_mul_ps(polyFN0,z4);
    polyFN1 = _mm_mul_ps(polyFN1,z2);
    polyFN0 = _mm_add_ps(polyFN0,FN0);
    polyFN0 = _mm_add_ps(polyFN0,polyFN1);

    return _mm_mul_ps(polyFN0,polyFD0);
}
/* Calculate the potential correction due to PME analytically.
 *
 * See gmx_mm256_pmecorrF_ps() for details about the approximation.
 *
 * This routine calculates Erf(z)/z, although you should provide z^2
 * as the input argument.
 *
 * Here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *        erf(z)
 *       --------
 *          z
 *
 * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
 *
 *       erf(r*beta)
 *       -----------
 *            r
 *
 * 6. Subtract the result from 1/r, multiply by the product of the charges,
 *    and you have your potential.
 */
static __m256
gmx_mm256_pmecorrV_ps(__m256 z2)
{
    const __m256  VN6      = _mm256_set1_ps(1.9296833005951166339e-8f);
    const __m256  VN5      = _mm256_set1_ps(-1.4213390571557850962e-6f);
    const __m256  VN4      = _mm256_set1_ps(0.000041603292906656984871f);
    const __m256  VN3      = _mm256_set1_ps(-0.00013134036773265025626f);
    const __m256  VN2      = _mm256_set1_ps(0.038657983986041781264f);
    const __m256  VN1      = _mm256_set1_ps(0.11285044772717598220f);
    const __m256  VN0      = _mm256_set1_ps(1.1283802385263030286f);

    const __m256  VD3      = _mm256_set1_ps(0.0066752224023576045451f);
    const __m256  VD2      = _mm256_set1_ps(0.078647795836373922256f);
    const __m256  VD1      = _mm256_set1_ps(0.43336185284710920150f);
    const __m256  VD0      = _mm256_set1_ps(1.0f);

    __m256 z4;
    __m256 polyVN0,polyVN1,polyVD0,polyVD1;

    z4 = _mm256_mul_ps(z2,z2);

    polyVD1 = _mm256_mul_ps(VD3,z4);
    polyVD0 = _mm256_mul_ps(VD2,z4);
    polyVD1 = _mm256_add_ps(polyVD1,VD1);
    polyVD0 = _mm256_add_ps(polyVD0,VD0);
    polyVD1 = _mm256_mul_ps(polyVD1,z2);
    polyVD0 = _mm256_add_ps(polyVD0,polyVD1);

    polyVD0 = gmx_mm256_inv_ps(polyVD0);

    polyVN0 = _mm256_mul_ps(VN6,z4);
    polyVN1 = _mm256_mul_ps(VN5,z4);
    polyVN0 = _mm256_add_ps(polyVN0,VN4);
    polyVN1 = _mm256_add_ps(polyVN1,VN3);
    polyVN0 = _mm256_mul_ps(polyVN0,z4);
    polyVN1 = _mm256_mul_ps(polyVN1,z4);
    polyVN0 = _mm256_add_ps(polyVN0,VN2);
    polyVN1 = _mm256_add_ps(polyVN1,VN1);
    polyVN0 = _mm256_mul_ps(polyVN0,z4);
    polyVN1 = _mm256_mul_ps(polyVN1,z2);
    polyVN0 = _mm256_add_ps(polyVN0,VN0);
    polyVN0 = _mm256_add_ps(polyVN0,polyVN1);

    return _mm256_mul_ps(polyVN0,polyVD0);
}
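
/* Example usage (illustrative sketch; rsq, beta, beta2, rinv and qq are
 * hypothetical inputs, following the steps in the comment above):
 *
 *   __m256 zsq   = _mm256_mul_ps(beta2,rsq);      // z^2 = beta^2*r^2
 *   __m256 corr  = gmx_mm256_pmecorrV_ps(zsq);    // erf(z)/z
 *   corr         = _mm256_mul_ps(corr,beta);      // now erf(beta*r)/r
 *   __m256 velec = _mm256_mul_ps(qq,_mm256_sub_ps(rinv,corr));
 */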
static __m128
gmx_mm_pmecorrV_ps(__m128 z2)
{
    const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
    const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
    const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
    const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
    const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
    const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
    const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);

    const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
    const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
    const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
    const __m128  VD0      = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyVN0,polyVN1,polyVD0,polyVD1;

    z4 = _mm_mul_ps(z2,z2);

    polyVD1 = _mm_mul_ps(VD3,z4);
    polyVD0 = _mm_mul_ps(VD2,z4);
    polyVD1 = _mm_add_ps(polyVD1,VD1);
    polyVD0 = _mm_add_ps(polyVD0,VD0);
    polyVD1 = _mm_mul_ps(polyVD1,z2);
    polyVD0 = _mm_add_ps(polyVD0,polyVD1);

    polyVD0 = gmx_mm_inv_ps(polyVD0);

    polyVN0 = _mm_mul_ps(VN6,z4);
    polyVN1 = _mm_mul_ps(VN5,z4);
    polyVN0 = _mm_add_ps(polyVN0,VN4);
    polyVN1 = _mm_add_ps(polyVN1,VN3);
    polyVN0 = _mm_mul_ps(polyVN0,z4);
    polyVN1 = _mm_mul_ps(polyVN1,z4);
    polyVN0 = _mm_add_ps(polyVN0,VN2);
    polyVN1 = _mm_add_ps(polyVN1,VN1);
    polyVN0 = _mm_mul_ps(polyVN0,z4);
    polyVN1 = _mm_mul_ps(polyVN1,z2);
    polyVN0 = _mm_add_ps(polyVN0,VN0);
    polyVN0 = _mm_add_ps(polyVN0,polyVN1);

    return _mm_mul_ps(polyVN0,polyVD0);
}
static gmx_inline void
gmx_mm256_sincos_ps(__m256 x,
                    __m256 *sinval,
                    __m256 *cosval)
{
    const __m256  two_over_pi = _mm256_set1_ps(2.0f/(float)M_PI);
    const __m256  half        = _mm256_set1_ps(0.5f);
    const __m256  one         = _mm256_set1_ps(1.0f);
    const __m256  zero        = _mm256_setzero_ps();

    const __m128i ione        = _mm_set1_epi32(1);

    const __m256  mask_one    = _mm256_castsi256_ps(_mm256_set1_epi32(1));
    const __m256  mask_two    = _mm256_castsi256_ps(_mm256_set1_epi32(2));
    const __m256  mask_three  = _mm256_castsi256_ps(_mm256_set1_epi32(3));

    const __m256  CA1         = _mm256_set1_ps(1.5703125f);
    const __m256  CA2         = _mm256_set1_ps(4.837512969970703125e-4f);
    const __m256  CA3         = _mm256_set1_ps(7.54978995489188216e-8f);

    const __m256  CC0         = _mm256_set1_ps(-0.0013602249f);
    const __m256  CC1         = _mm256_set1_ps(0.0416566950f);
    const __m256  CC2         = _mm256_set1_ps(-0.4999990225f);
    const __m256  CS0         = _mm256_set1_ps(-0.0001950727f);
    const __m256  CS1         = _mm256_set1_ps(0.0083320758f);
    const __m256  CS2         = _mm256_set1_ps(-0.1666665247f);

    const __m256  signbit     = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );

    __m256        y,y2,z;
    __m256        tmp1,tmp2;
    __m256i       iz;
    __m128i       iz_high,iz_low;
    __m256        offset_sin,offset_cos;
    __m256        mask_sin,mask_cos;
    __m256        tmp_sin,tmp_cos;

    y = _mm256_mul_ps(x,two_over_pi);
    y = _mm256_add_ps(y,_mm256_or_ps(_mm256_and_ps(y,signbit),half));

    iz = _mm256_cvttps_epi32(y);
    z  = _mm256_round_ps(y,_MM_FROUND_TO_ZERO);

    offset_sin = _mm256_and_ps(_mm256_castsi256_ps(iz),mask_three);

    iz_high = _mm256_extractf128_si256(iz,0x1);
    iz_low  = _mm256_castsi256_si128(iz);
    iz_low  = _mm_add_epi32(iz_low,ione);
    iz_high = _mm_add_epi32(iz_high,ione);
    iz      = _mm256_castsi128_si256(iz_low);
    iz      = _mm256_insertf128_si256(iz,iz_high,0x1);
    offset_cos = _mm256_castsi256_ps(iz);

    /* Extended precision arithmetic to achieve full precision */
    y    = _mm256_mul_ps(z,CA1);
    tmp1 = _mm256_mul_ps(z,CA2);
    tmp2 = _mm256_mul_ps(z,CA3);
    y    = _mm256_sub_ps(x,y);
    y    = _mm256_sub_ps(y,tmp1);
    y    = _mm256_sub_ps(y,tmp2);

    y2   = _mm256_mul_ps(y,y);

    tmp1 = _mm256_mul_ps(CC0,y2);
    tmp1 = _mm256_add_ps(tmp1,CC1);
    tmp2 = _mm256_mul_ps(CS0,y2);
    tmp2 = _mm256_add_ps(tmp2,CS1);
    tmp1 = _mm256_mul_ps(tmp1,y2);
    tmp1 = _mm256_add_ps(tmp1,CC2);
    tmp2 = _mm256_mul_ps(tmp2,y2);
    tmp2 = _mm256_add_ps(tmp2,CS2);

    tmp1 = _mm256_mul_ps(tmp1,y2);
    tmp1 = _mm256_add_ps(tmp1,one);

    tmp2 = _mm256_mul_ps(tmp2,_mm256_mul_ps(y,y2));
    tmp2 = _mm256_add_ps(tmp2,y);

#ifdef __INTEL_COMPILER
    /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
    mask_sin = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin,mask_one))),zero,_CMP_EQ_OQ);
    mask_cos = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos,mask_one))),zero,_CMP_EQ_OQ);
#else
    mask_sin = _mm256_cmp_ps( _mm256_and_ps(offset_sin,mask_one), zero, _CMP_EQ_OQ);
    mask_cos = _mm256_cmp_ps( _mm256_and_ps(offset_cos,mask_one), zero, _CMP_EQ_OQ);
#endif
    tmp_sin = _mm256_blendv_ps(tmp1,tmp2,mask_sin);
    tmp_cos = _mm256_blendv_ps(tmp1,tmp2,mask_cos);

    tmp1 = _mm256_xor_ps(signbit,tmp_sin);
    tmp2 = _mm256_xor_ps(signbit,tmp_cos);

#ifdef __INTEL_COMPILER
    /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
    mask_sin = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin,mask_two))),zero,_CMP_EQ_OQ);
    mask_cos = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos,mask_two))),zero,_CMP_EQ_OQ);
#else
    mask_sin = _mm256_cmp_ps( _mm256_and_ps(offset_sin,mask_two), zero, _CMP_EQ_OQ);
    mask_cos = _mm256_cmp_ps( _mm256_and_ps(offset_cos,mask_two), zero, _CMP_EQ_OQ);
#endif

    *sinval = _mm256_blendv_ps(tmp1,tmp_sin,mask_sin);
    *cosval = _mm256_blendv_ps(tmp2,tmp_cos,mask_cos);
}

static gmx_inline void
gmx_mm_sincos_ps(__m128 x,
                 __m128 *sinval,
                 __m128 *cosval)
{
    const __m128 two_over_pi = _mm_set1_ps(2.0/M_PI);
    const __m128 half = _mm_set1_ps(0.5);
    const __m128 one = _mm_set1_ps(1.0);

    const __m128i izero  = _mm_set1_epi32(0);
    const __m128i ione   = _mm_set1_epi32(1);
    const __m128i itwo   = _mm_set1_epi32(2);
    const __m128i ithree = _mm_set1_epi32(3);
    const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );

    const __m128 CA1 = _mm_set1_ps(1.5703125f);
    const __m128 CA2 = _mm_set1_ps(4.837512969970703125e-4f);
    const __m128 CA3 = _mm_set1_ps(7.54978995489188216e-8f);

    const __m128 CC0 = _mm_set1_ps(-0.0013602249f);
    const __m128 CC1 = _mm_set1_ps(0.0416566950f);
    const __m128 CC2 = _mm_set1_ps(-0.4999990225f);
    const __m128 CS0 = _mm_set1_ps(-0.0001950727f);
    const __m128 CS1 = _mm_set1_ps(0.0083320758f);
    const __m128 CS2 = _mm_set1_ps(-0.1666665247f);

    __m128  y,y2;
    __m128  z;
    __m128i iz;
    __m128i offset_sin,offset_cos;
    __m128  tmp1,tmp2;
    __m128  mask_sin,mask_cos;
    __m128  tmp_sin,tmp_cos;

    y = _mm_mul_ps(x,two_over_pi);
    y = _mm_add_ps(y,_mm_or_ps(_mm_and_ps(y,signbit),half));

    iz = _mm_cvttps_epi32(y);
    z  = _mm_round_ps(y,_MM_FROUND_TO_ZERO);

    offset_sin = _mm_and_si128(iz,ithree);
    offset_cos = _mm_add_epi32(iz,ione);

    /* Extended precision arithmetic to achieve full precision */
    y    = _mm_mul_ps(z,CA1);
    tmp1 = _mm_mul_ps(z,CA2);
    tmp2 = _mm_mul_ps(z,CA3);
    y    = _mm_sub_ps(x,y);
    y    = _mm_sub_ps(y,tmp1);
    y    = _mm_sub_ps(y,tmp2);

    y2 = _mm_mul_ps(y,y);

    tmp1 = _mm_mul_ps(CC0,y2);
    tmp1 = _mm_add_ps(tmp1,CC1);
    tmp2 = _mm_mul_ps(CS0,y2);
    tmp2 = _mm_add_ps(tmp2,CS1);
    tmp1 = _mm_mul_ps(tmp1,y2);
    tmp1 = _mm_add_ps(tmp1,CC2);
    tmp2 = _mm_mul_ps(tmp2,y2);
    tmp2 = _mm_add_ps(tmp2,CS2);

    tmp1 = _mm_mul_ps(tmp1,y2);
    tmp1 = _mm_add_ps(tmp1,one);

    tmp2 = _mm_mul_ps(tmp2,_mm_mul_ps(y,y2));
    tmp2 = _mm_add_ps(tmp2,y);

    mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,ione), izero));
    mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,ione), izero));

    tmp_sin = _mm_blendv_ps(tmp1,tmp2,mask_sin);
    tmp_cos = _mm_blendv_ps(tmp1,tmp2,mask_cos);

    mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,itwo), izero));
    mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,itwo), izero));

    tmp1 = _mm_xor_ps(signbit,tmp_sin);
    tmp2 = _mm_xor_ps(signbit,tmp_cos);

    *sinval = _mm_blendv_ps(tmp1,tmp_sin,mask_sin);
    *cosval = _mm_blendv_ps(tmp2,tmp_cos,mask_cos);
}

/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static gmx_inline __m256
gmx_mm256_sin_ps(__m256 x)
{
    __m256 s,c;

    gmx_mm256_sincos_ps(x,&s,&c);
    return s;
}

static gmx_inline __m128
gmx_mm_sin_ps(__m128 x)
{
    __m128 s,c;

    gmx_mm_sincos_ps(x,&s,&c);
    return s;
}
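
/* Example (added; illustrative only): the warning above in practice. To
 * rotate 2D vectors (vx,vy) by packed angles, a single sincos call supplies
 * both the sine and the cosine; calling gmx_mm256_sin_ps() followed by
 * gmx_mm256_cos_ps() would run the whole routine twice for the same input.
 */
static void
gmx_mm256_rotate_2d_example(__m256 angle, __m256 *vx, __m256 *vy)
{
    __m256 s,c,xnew;

    gmx_mm256_sincos_ps(angle,&s,&c);

    /* Standard 2D rotation: x' = c*x - s*y, y' = s*x + c*y */
    xnew = _mm256_sub_ps(_mm256_mul_ps(c,*vx),_mm256_mul_ps(s,*vy));
    *vy  = _mm256_add_ps(_mm256_mul_ps(s,*vx),_mm256_mul_ps(c,*vy));
    *vx  = xnew;
}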

/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static gmx_inline __m256
gmx_mm256_cos_ps(__m256 x)
{
    __m256 s,c;

    gmx_mm256_sincos_ps(x,&s,&c);
    return c;
}

static gmx_inline __m128
gmx_mm_cos_ps(__m128 x)
{
    __m128 s,c;

    gmx_mm_sincos_ps(x,&s,&c);
    return c;
}

static gmx_inline __m256
gmx_mm256_tan_ps(__m256 x)
{
    __m256 sinval,cosval;
    __m256 tanval;

    gmx_mm256_sincos_ps(x,&sinval,&cosval);

    tanval = _mm256_mul_ps(sinval,gmx_mm256_inv_ps(cosval));

    return tanval;
}

static gmx_inline __m128
gmx_mm_tan_ps(__m128 x)
{
    __m128 sinval,cosval;
    __m128 tanval;

    gmx_mm_sincos_ps(x,&sinval,&cosval);

    tanval = _mm_mul_ps(sinval,gmx_mm_inv_ps(cosval));

    return tanval;
}
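
/* Note (added; illustrative only): tan is formed as sin(x)*(1/cos(x)) using
 * the Newton-refined approximate reciprocal, so accuracy degrades near odd
 * multiples of pi/2 where cos(x) approaches zero. A hypothetical helper for
 * pushing a single scalar through the SSE path:
 */
static float
gmx_mm_tan_single_example(float x)
{
    float result;

    /* Broadcast x to all four lanes and store the lowest result lane */
    _mm_store_ss(&result,gmx_mm_tan_ps(_mm_set1_ps(x)));

    return result;
}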

static gmx_inline __m256
gmx_mm256_asin_ps(__m256 x)
{
    /* Same algorithm as cephes library */
    const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
    const __m256 limitlow = _mm256_set1_ps(1e-4f);
    const __m256 half = _mm256_set1_ps(0.5f);
    const __m256 one = _mm256_set1_ps(1.0f);
    const __m256 halfpi = _mm256_set1_ps((float)M_PI/2.0f);

    const __m256 CC5 = _mm256_set1_ps(4.2163199048E-2f);
    const __m256 CC4 = _mm256_set1_ps(2.4181311049E-2f);
    const __m256 CC3 = _mm256_set1_ps(4.5470025998E-2f);
    const __m256 CC2 = _mm256_set1_ps(7.4953002686E-2f);
    const __m256 CC1 = _mm256_set1_ps(1.6666752422E-1f);

    __m256 xabs;
    __m256 z,z1,z2,q,q1,q2;
    __m256 pA,pB;
    __m256 mask;
    __m256 sign;

    sign = _mm256_andnot_ps(signmask,x);
    xabs = _mm256_and_ps(x,signmask);

    mask = _mm256_cmp_ps(xabs,half,_CMP_GT_OQ);

    z1 = _mm256_mul_ps(half, _mm256_sub_ps(one,xabs));
    q1 = _mm256_mul_ps(z1,gmx_mm256_invsqrt_ps(z1));
    q1 = _mm256_andnot_ps(_mm256_cmp_ps(xabs,one,_CMP_EQ_OQ),q1);

    q2 = xabs;
    z2 = _mm256_mul_ps(q2,q2);

    z = _mm256_blendv_ps(z2,z1,mask);
    q = _mm256_blendv_ps(q2,q1,mask);

    z2 = _mm256_mul_ps(z,z);

    pA = _mm256_mul_ps(CC5,z2);
    pB = _mm256_mul_ps(CC4,z2);

    pA = _mm256_add_ps(pA,CC3);
    pB = _mm256_add_ps(pB,CC2);

    pA = _mm256_mul_ps(pA,z2);
    pB = _mm256_mul_ps(pB,z2);

    pA = _mm256_add_ps(pA,CC1);
    pA = _mm256_mul_ps(pA,z);

    z = _mm256_add_ps(pA,pB);
    z = _mm256_mul_ps(z,q);
    z = _mm256_add_ps(z,q);

    q2 = _mm256_sub_ps(halfpi,z);
    q2 = _mm256_sub_ps(q2,z);

    z = _mm256_blendv_ps(z,q2,mask);

    mask = _mm256_cmp_ps(xabs,limitlow,_CMP_GT_OQ);
    z = _mm256_blendv_ps(xabs,z,mask);

    z = _mm256_xor_ps(z,sign);

    return z;
}

static gmx_inline __m128
gmx_mm_asin_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limitlow = _mm_set1_ps(1e-4f);
    const __m128 half = _mm_set1_ps(0.5f);
    const __m128 one = _mm_set1_ps(1.0f);
    const __m128 halfpi = _mm_set1_ps(M_PI/2.0f);

    const __m128 CC5 = _mm_set1_ps(4.2163199048E-2f);
    const __m128 CC4 = _mm_set1_ps(2.4181311049E-2f);
    const __m128 CC3 = _mm_set1_ps(4.5470025998E-2f);
    const __m128 CC2 = _mm_set1_ps(7.4953002686E-2f);
    const __m128 CC1 = _mm_set1_ps(1.6666752422E-1f);

    __m128 xabs;
    __m128 z,z1,z2,q,q1,q2;
    __m128 pA,pB;
    __m128 mask;
    __m128 sign;

    sign = _mm_andnot_ps(signmask,x);
    xabs = _mm_and_ps(x,signmask);

    mask = _mm_cmpgt_ps(xabs,half);

    z1 = _mm_mul_ps(half, _mm_sub_ps(one,xabs));
    q1 = _mm_mul_ps(z1,gmx_mm_invsqrt_ps(z1));
    q1 = _mm_andnot_ps(_mm_cmpeq_ps(xabs,one),q1);

    q2 = xabs;
    z2 = _mm_mul_ps(q2,q2);

    z = _mm_or_ps( _mm_and_ps(mask,z1) , _mm_andnot_ps(mask,z2) );
    q = _mm_or_ps( _mm_and_ps(mask,q1) , _mm_andnot_ps(mask,q2) );

    z2 = _mm_mul_ps(z,z);

    pA = _mm_mul_ps(CC5,z2);
    pB = _mm_mul_ps(CC4,z2);

    pA = _mm_add_ps(pA,CC3);
    pB = _mm_add_ps(pB,CC2);

    pA = _mm_mul_ps(pA,z2);
    pB = _mm_mul_ps(pB,z2);

    pA = _mm_add_ps(pA,CC1);
    pA = _mm_mul_ps(pA,z);

    z = _mm_add_ps(pA,pB);
    z = _mm_mul_ps(z,q);
    z = _mm_add_ps(z,q);

    q2 = _mm_sub_ps(halfpi,z);
    q2 = _mm_sub_ps(q2,z);

    z = _mm_or_ps( _mm_and_ps(mask,q2) , _mm_andnot_ps(mask,z) );

    mask = _mm_cmpgt_ps(xabs,limitlow);
    z = _mm_or_ps( _mm_and_ps(mask,z) , _mm_andnot_ps(mask,xabs) );

    z = _mm_xor_ps(z,sign);

    return z;
}
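
/* Example (added; illustrative only): like the cephes routine, these asin
 * approximations assume |x| <= 1 and do not signal a domain error. Callers
 * whose inputs can drift slightly outside [-1,1] - e.g. a dot product of
 * two normalized vectors - might clamp first, as in this hypothetical
 * wrapper:
 */
static gmx_inline __m128
gmx_mm_asin_clamped_example(__m128 x)
{
    const __m128 one  = _mm_set1_ps(1.0f);
    const __m128 mone = _mm_set1_ps(-1.0f);

    /* Clamp x into [-1,1] before evaluating the approximation */
    return gmx_mm_asin_ps(_mm_max_ps(mone,_mm_min_ps(one,x)));
}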

static gmx_inline __m256
gmx_mm256_acos_ps(__m256 x)
{
    const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
    const __m256 one_ps = _mm256_set1_ps(1.0f);
    const __m256 half_ps = _mm256_set1_ps(0.5f);
    const __m256 pi_ps = _mm256_set1_ps((float)M_PI);
    const __m256 halfpi_ps = _mm256_set1_ps((float)M_PI/2.0f);

    __m256 xabs;
    __m256 mask1,mask2;
    __m256 z,z1,z2,z3;

    xabs  = _mm256_and_ps(x,signmask);
    mask1 = _mm256_cmp_ps(xabs,half_ps,_CMP_GT_OQ);
    mask2 = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_GT_OQ);

    z = _mm256_mul_ps(half_ps,_mm256_sub_ps(one_ps,xabs));
    z = _mm256_mul_ps(z,gmx_mm256_invsqrt_ps(z));
    z = _mm256_andnot_ps(_mm256_cmp_ps(xabs,one_ps,_CMP_EQ_OQ),z);

    z = _mm256_blendv_ps(x,z,mask1);
    z = gmx_mm256_asin_ps(z);

    z2 = _mm256_add_ps(z,z);
    z1 = _mm256_sub_ps(pi_ps,z2);
    z3 = _mm256_sub_ps(halfpi_ps,z);

    z = _mm256_blendv_ps(z1,z2,mask2);
    z = _mm256_blendv_ps(z3,z,mask1);

    return z;
}

static gmx_inline __m128
gmx_mm_acos_ps(__m128 x)
{
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 one_ps = _mm_set1_ps(1.0f);
    const __m128 half_ps = _mm_set1_ps(0.5f);
    const __m128 pi_ps = _mm_set1_ps(M_PI);
    const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);

    __m128 xabs;
    __m128 mask1,mask2;
    __m128 z,z1,z2,z3;

    xabs  = _mm_and_ps(x,signmask);
    mask1 = _mm_cmpgt_ps(xabs,half_ps);
    mask2 = _mm_cmpgt_ps(x,_mm_setzero_ps());

    z = _mm_mul_ps(half_ps,_mm_sub_ps(one_ps,xabs));
    z = _mm_mul_ps(z,gmx_mm_invsqrt_ps(z));
    z = _mm_andnot_ps(_mm_cmpeq_ps(xabs,one_ps),z);

    z = _mm_blendv_ps(x,z,mask1);
    z = gmx_mm_asin_ps(z);

    z2 = _mm_add_ps(z,z);
    z1 = _mm_sub_ps(pi_ps,z2);
    z3 = _mm_sub_ps(halfpi_ps,z);

    z = _mm_blendv_ps(z1,z2,mask2);
    z = _mm_blendv_ps(z3,z,mask1);

    return z;
}
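
/* Note (added for clarity): the acos routines above implement the standard
 * identities
 *   acos(x) = pi/2 - asin(x)               for |x| <= 0.5
 *   acos(x) = 2*asin(sqrt((1-x)/2))        for   x >  0.5
 *   acos(x) = pi - 2*asin(sqrt((1-|x|)/2)) for   x < -0.5
 * and select among the three results with mask1 and mask2. A hypothetical
 * scalar wrapper through the SSE path:
 */
static float
gmx_mm_acos_single_example(float x)
{
    float result;

    _mm_store_ss(&result,gmx_mm_acos_ps(_mm_set1_ps(x)));

    return result;
}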

static gmx_inline __m256
gmx_mm256_atan_ps(__m256 x)
{
    /* Same algorithm as cephes library */
    const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
    const __m256 limit1 = _mm256_set1_ps(0.414213562373095f);
    const __m256 limit2 = _mm256_set1_ps(2.414213562373095f);
    const __m256 quarterpi = _mm256_set1_ps(0.785398163397448f);
    const __m256 halfpi = _mm256_set1_ps(1.570796326794896f);
    const __m256 mone = _mm256_set1_ps(-1.0f);
    const __m256 CC3 = _mm256_set1_ps(-3.33329491539E-1f);
    const __m256 CC5 = _mm256_set1_ps(1.99777106478E-1f);
    const __m256 CC7 = _mm256_set1_ps(-1.38776856032E-1f);
    const __m256 CC9 = _mm256_set1_ps(8.05374449538e-2f);

    __m256 sign;
    __m256 mask1,mask2;
    __m256 y,z1,z2;
    __m256 x2,x4;
    __m256 sum1,sum2;

    sign = _mm256_andnot_ps(signmask,x);
    x    = _mm256_and_ps(x,signmask);

    mask1 = _mm256_cmp_ps(x,limit1,_CMP_GT_OQ);
    mask2 = _mm256_cmp_ps(x,limit2,_CMP_GT_OQ);

    z1 = _mm256_mul_ps(_mm256_add_ps(x,mone),gmx_mm256_inv_ps(_mm256_sub_ps(x,mone)));
    z2 = _mm256_mul_ps(mone,gmx_mm256_inv_ps(x));

    y = _mm256_and_ps(mask1,quarterpi);
    y = _mm256_blendv_ps(y,halfpi,mask2);

    x = _mm256_blendv_ps(x,z1,mask1);
    x = _mm256_blendv_ps(x,z2,mask2);

    x2 = _mm256_mul_ps(x,x);
    x4 = _mm256_mul_ps(x2,x2);

    sum1 = _mm256_mul_ps(CC9,x4);
    sum2 = _mm256_mul_ps(CC7,x4);
    sum1 = _mm256_add_ps(sum1,CC5);
    sum2 = _mm256_add_ps(sum2,CC3);
    sum1 = _mm256_mul_ps(sum1,x4);
    sum2 = _mm256_mul_ps(sum2,x2);

    sum1 = _mm256_add_ps(sum1,sum2);
    sum1 = _mm256_sub_ps(sum1,mone);
    sum1 = _mm256_mul_ps(sum1,x);
    y    = _mm256_add_ps(y,sum1);

    y = _mm256_xor_ps(y,sign);

    return y;
}

static gmx_inline __m128
gmx_mm_atan_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limit1 = _mm_set1_ps(0.414213562373095f);
    const __m128 limit2 = _mm_set1_ps(2.414213562373095f);
    const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
    const __m128 halfpi = _mm_set1_ps(1.570796326794896f);
    const __m128 mone = _mm_set1_ps(-1.0f);
    const __m128 CC3 = _mm_set1_ps(-3.33329491539E-1f);
    const __m128 CC5 = _mm_set1_ps(1.99777106478E-1f);
    const __m128 CC7 = _mm_set1_ps(-1.38776856032E-1f);
    const __m128 CC9 = _mm_set1_ps(8.05374449538e-2f);

    __m128 sign;
    __m128 mask1,mask2;
    __m128 y,z1,z2;
    __m128 x2,x4;
    __m128 sum1,sum2;

    sign = _mm_andnot_ps(signmask,x);
    x    = _mm_and_ps(x,signmask);

    mask1 = _mm_cmpgt_ps(x,limit1);
    mask2 = _mm_cmpgt_ps(x,limit2);

    z1 = _mm_mul_ps(_mm_add_ps(x,mone),gmx_mm_inv_ps(_mm_sub_ps(x,mone)));
    z2 = _mm_mul_ps(mone,gmx_mm_inv_ps(x));

    y = _mm_and_ps(mask1,quarterpi);
    y = _mm_blendv_ps(y,halfpi,mask2);

    x = _mm_blendv_ps(x,z1,mask1);
    x = _mm_blendv_ps(x,z2,mask2);

    x2 = _mm_mul_ps(x,x);
    x4 = _mm_mul_ps(x2,x2);

    sum1 = _mm_mul_ps(CC9,x4);
    sum2 = _mm_mul_ps(CC7,x4);
    sum1 = _mm_add_ps(sum1,CC5);
    sum2 = _mm_add_ps(sum2,CC3);
    sum1 = _mm_mul_ps(sum1,x4);
    sum2 = _mm_mul_ps(sum2,x2);

    sum1 = _mm_add_ps(sum1,sum2);
    sum1 = _mm_sub_ps(sum1,mone);
    sum1 = _mm_mul_ps(sum1,x);
    y    = _mm_add_ps(y,sum1);

    y = _mm_xor_ps(y,sign);

    return y;
}
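
/* Note (added for clarity): both atan routines reduce the argument with
 *   atan(x) = pi/2 - atan(1/x)          for x > tan(3*pi/8) ~ 2.414
 *   atan(x) = pi/4 + atan((x-1)/(x+1))  for x > tan(pi/8)   ~ 0.414
 * so the polynomial only ever sees arguments in [0,tan(pi/8)]. A
 * hypothetical helper returning the result in degrees (180/pi):
 */
static gmx_inline __m128
gmx_mm_atan_deg_example(__m128 x)
{
    const __m128 rad2deg = _mm_set1_ps((float)(180.0/M_PI));

    return _mm_mul_ps(rad2deg,gmx_mm_atan_ps(x));
}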

static gmx_inline __m256
gmx_mm256_atan2_ps(__m256 y, __m256 x)
{
    const __m256 pi = _mm256_set1_ps( (float) M_PI);
    const __m256 minuspi = _mm256_set1_ps( (float) -M_PI);
    const __m256 halfpi = _mm256_set1_ps( (float) M_PI/2.0f);
    const __m256 minushalfpi = _mm256_set1_ps( (float) -M_PI/2.0f);

    __m256 z,z1,z3,z4;
    __m256 w;
    __m256 maskx_lt,maskx_eq;
    __m256 masky_lt,masky_eq;
    __m256 mask1,mask2,mask3,mask4,maskall;

    maskx_lt = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_LT_OQ);
    masky_lt = _mm256_cmp_ps(y,_mm256_setzero_ps(),_CMP_LT_OQ);
    maskx_eq = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_EQ_OQ);
    masky_eq = _mm256_cmp_ps(y,_mm256_setzero_ps(),_CMP_EQ_OQ);

    z = _mm256_mul_ps(y,gmx_mm256_inv_ps(x));
    z = gmx_mm256_atan_ps(z);

    mask1 = _mm256_and_ps(maskx_eq,masky_lt);
    mask2 = _mm256_andnot_ps(maskx_lt,masky_eq);
    mask3 = _mm256_andnot_ps( _mm256_or_ps(masky_lt,masky_eq) , maskx_eq);
    mask4 = _mm256_and_ps(maskx_lt,masky_eq);
    maskall = _mm256_or_ps( _mm256_or_ps(mask1,mask2), _mm256_or_ps(mask3,mask4) );

    z  = _mm256_andnot_ps(maskall,z);
    z1 = _mm256_and_ps(mask1,minushalfpi);
    z3 = _mm256_and_ps(mask3,halfpi);
    z4 = _mm256_and_ps(mask4,pi);

    z = _mm256_or_ps( _mm256_or_ps(z,z1), _mm256_or_ps(z3,z4) );

    w = _mm256_blendv_ps(pi,minuspi,masky_lt);
    w = _mm256_and_ps(w,maskx_lt);

    w = _mm256_andnot_ps(maskall,w);

    z = _mm256_add_ps(z,w);

    return z;
}

static gmx_inline __m128
gmx_mm_atan2_ps(__m128 y, __m128 x)
{
    const __m128 pi = _mm_set1_ps(M_PI);
    const __m128 minuspi = _mm_set1_ps(-M_PI);
    const __m128 halfpi = _mm_set1_ps(M_PI/2.0);
    const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);

    __m128 z,z1,z3,z4;
    __m128 w;
    __m128 maskx_lt,maskx_eq;
    __m128 masky_lt,masky_eq;
    __m128 mask1,mask2,mask3,mask4,maskall;

    maskx_lt = _mm_cmplt_ps(x,_mm_setzero_ps());
    masky_lt = _mm_cmplt_ps(y,_mm_setzero_ps());
    maskx_eq = _mm_cmpeq_ps(x,_mm_setzero_ps());
    masky_eq = _mm_cmpeq_ps(y,_mm_setzero_ps());

    z = _mm_mul_ps(y,gmx_mm_inv_ps(x));
    z = gmx_mm_atan_ps(z);

    mask1 = _mm_and_ps(maskx_eq,masky_lt);
    mask2 = _mm_andnot_ps(maskx_lt,masky_eq);
    mask3 = _mm_andnot_ps( _mm_or_ps(masky_lt,masky_eq) , maskx_eq);
    mask4 = _mm_and_ps(masky_eq,maskx_lt);

    maskall = _mm_or_ps( _mm_or_ps(mask1,mask2), _mm_or_ps(mask3,mask4) );

    z  = _mm_andnot_ps(maskall,z);
    z1 = _mm_and_ps(mask1,minushalfpi);
    z3 = _mm_and_ps(mask3,halfpi);
    z4 = _mm_and_ps(mask4,pi);

    z = _mm_or_ps( _mm_or_ps(z,z1), _mm_or_ps(z3,z4) );

    mask1 = _mm_andnot_ps(masky_lt,maskx_lt);
    mask2 = _mm_and_ps(maskx_lt,masky_lt);

    w = _mm_or_ps( _mm_and_ps(mask1,pi), _mm_and_ps(mask2,minuspi) );
    w = _mm_andnot_ps(maskall,w);

    z = _mm_add_ps(z,w);

    return z;
}
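
/* Example (added; illustrative only): unlike atan, atan2 resolves the full
 * quadrant from the signs of both arguments. The argument order is (y,x),
 * as for atan2() in libm, so the angle of 2D vectors (vx,vy) is simply:
 */
static gmx_inline __m256
gmx_mm256_vector_angle_example(__m256 vx, __m256 vy)
{
    return gmx_mm256_atan2_ps(vy,vx);
}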

#endif /* _gmx_math_x86_avx_256_single_h_ */