/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This file is part of GROMACS.
 *
 * Written by the Gromacs development team under coordination of
 * David van der Spoel, Berk Hess, and Erik Lindahl.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org
 *
 * And Hey:
 * Gnomes, ROck Monsters And Chili Sauce
 */
#ifndef _gmx_math_x86_sse2_single_h_
#define _gmx_math_x86_sse2_single_h_

#include <math.h>

#include "gmx_x86_sse2.h"

#ifndef M_PI
#    define M_PI 3.14159265358979323846264338327950288
#endif
/************************
 *                      *
 * Simple math routines *
 *                      *
 ************************/
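/* 1.0/sqrt(x).
 *
 * The hardware rsqrt estimate (~12 bits) is refined below with one
 * Newton-Raphson iteration, 0.5*lu*(3 - x*lu*lu), which brings the
 * result close to full single precision.
 */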
static gmx_inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half  = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
    const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);

    __m128 lu = _mm_rsqrt_ps(x);

    return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
}
/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
static gmx_inline __m128
gmx_mm_sqrt_ps(__m128 x)
{
    __m128 mask;
    __m128 res;

    mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
    res  = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));

    res  = _mm_mul_ps(x, res);

    return res;
}
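/* 1.0/x.
 *
 * The hardware rcp estimate is refined below with one Newton-Raphson
 * iteration, lu*(2 - x*lu).
 */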
static gmx_inline __m128
gmx_mm_inv_ps(__m128 x)
{
    const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);

    __m128 lu = _mm_rcp_ps(x);

    return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
}
static gmx_inline __m128
gmx_mm_abs_ps(__m128 x)
{
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );

    return _mm_and_ps(x, signmask);
}
static gmx_inline __m128
gmx_mm_log_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
    const __m128  half       = _mm_set1_ps(0.5f);
    const __m128  one        = _mm_set1_ps(1.0f);
    const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
    const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
    const __m128  corr2      = _mm_set1_ps(0.693359375f);

    const __m128  CA_1       = _mm_set1_ps(0.070376836292f);
    const __m128  CB_0       = _mm_set1_ps(1.6714950086782716f);
    const __m128  CB_1       = _mm_set1_ps(-2.452088066061482f);
    const __m128  CC_0       = _mm_set1_ps(1.5220770854701728f);
    const __m128  CC_1       = _mm_set1_ps(-1.3422238433233642f);
    const __m128  CD_0       = _mm_set1_ps(1.386218787509749f);
    const __m128  CD_1       = _mm_set1_ps(0.35075468953796346f);
    const __m128  CE_0       = _mm_set1_ps(1.3429983063133937f);
    const __m128  CE_1       = _mm_set1_ps(1.807420826584643f);

    __m128  fexp, x2;
    __m128i iexp;
    __m128  mask;
    __m128  y;
    __m128  pA, pB, pC, pD, pE, tB, tC, tD, tE;
    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
    fexp  = _mm_and_ps(x, expmask);
    iexp  = gmx_mm_castps_si128(fexp);
    iexp  = _mm_srli_epi32(iexp, 23);
    iexp  = _mm_sub_epi32(iexp, expbase_m1);

    x     = _mm_andnot_ps(expmask, x);
    x     = _mm_or_ps(x, one);
    x     = _mm_mul_ps(x, half);

    mask  = _mm_cmplt_ps(x, invsq2);

    x     = _mm_add_ps(x, _mm_and_ps(mask, x));
    x     = _mm_sub_ps(x, one);
    iexp  = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */

    x2    = _mm_mul_ps(x, x);

    pA    = _mm_mul_ps(CA_1, x);
    pB    = _mm_mul_ps(CB_1, x);
    pC    = _mm_mul_ps(CC_1, x);
    pD    = _mm_mul_ps(CD_1, x);
    pE    = _mm_mul_ps(CE_1, x);
    tB    = _mm_add_ps(CB_0, x2);
    tC    = _mm_add_ps(CC_0, x2);
    tD    = _mm_add_ps(CD_0, x2);
    tE    = _mm_add_ps(CE_0, x2);
    pB    = _mm_add_ps(pB, tB);
    pC    = _mm_add_ps(pC, tC);
    pD    = _mm_add_ps(pD, tD);
    pE    = _mm_add_ps(pE, tE);

    pA    = _mm_mul_ps(pA, pB);
    pC    = _mm_mul_ps(pC, pD);
    pE    = _mm_mul_ps(pE, x2);
    pA    = _mm_mul_ps(pA, pC);
    y     = _mm_mul_ps(pA, pE);

    fexp  = _mm_cvtepi32_ps(iexp);
    y     = _mm_add_ps(y, _mm_mul_ps(fexp, corr1));

    y     = _mm_sub_ps(y, _mm_mul_ps(half, x2));
    x2    = _mm_add_ps(x, y);
    x2    = _mm_add_ps(x2, _mm_mul_ps(fexp, corr2));

    return x2;
}
/* 2^x, i.e. the base-2 exponential.
 *
 * The argument is split as x = n + w, where n is an integer (obtained by rounding
 * to nearest) and |w| <= 0.5. The 2^n factor is assembled directly in the IEEE754
 * exponent field, while the 2^w term is calculated from a (6,0)-th order (no
 * denominator) minimax polynomial on the interval [-0.5,0.5]. The coefficients
 * were derived in Mathematica using the command:
 *
 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 *
 * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 * number instead. That would take a few extra cycles and not really help, since something is
 * wrong if you are using single precision to work with numbers that cannot really be represented
 * in single precision.
 *
 * The accuracy is at least 23 bits.
 */
static gmx_inline __m128
gmx_mm_exp2_ps(__m128 x)
{
    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
    const __m128  arglimit = _mm_set1_ps(126.0f);

    const __m128i expbase  = _mm_set1_epi32(127);
    const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
    const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
    const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
    const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
    const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
    const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
    const __m128  CA0      = _mm_set1_ps(1.0f);

    __m128  valuemask;
    __m128i iexppart;
    __m128  fexppart;
    __m128  intpart;
    __m128  x2;
    __m128  p0, p1;
    iexppart  = _mm_cvtps_epi32(x);
    intpart   = _mm_cvtepi32_ps(iexppart);
    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
    valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(x));
    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));

    x         = _mm_sub_ps(x, intpart);
    x2        = _mm_mul_ps(x, x);

    p0        = _mm_mul_ps(CA6, x2);
    p1        = _mm_mul_ps(CA5, x2);
    p0        = _mm_add_ps(p0, CA4);
    p1        = _mm_add_ps(p1, CA3);
    p0        = _mm_mul_ps(p0, x2);
    p1        = _mm_mul_ps(p1, x2);
    p0        = _mm_add_ps(p0, CA2);
    p1        = _mm_add_ps(p1, CA1);
    p0        = _mm_mul_ps(p0, x2);
    p1        = _mm_mul_ps(p1, x);
    p0        = _mm_add_ps(p0, CA0);
    p0        = _mm_add_ps(p0, p1);
    x         = _mm_mul_ps(p0, fexppart);

    return x;
}
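/* A minimal usage sketch (the helper name and test values are illustrative
 * assumptions, not part of the original API): evaluate gmx_mm_exp2_ps() on
 * four packed arguments. Each result should agree with exp2f() to at least
 * 23 bits, and out-of-range arguments (|x| > 126) come back as zero.
 */
static gmx_inline void
gmx_mm_exp2_ps_example(void)
{
    float in[4] = { -3.5f, 0.25f, 10.0f, 200.0f }; /* last one exceeds the limit */
    float out[4];

    _mm_storeu_ps(out, gmx_mm_exp2_ps(_mm_loadu_ps(in)));
    /* out = { 2^-3.5, 2^0.25, 1024.0f, 0.0f } */
}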
/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
 * but there will then be a small rounding error since we lose some precision due to the
 * multiplication. This will then be magnified a lot by the exponential.
 *
 * Instead, we calculate the fractional part directly as a minimax approximation of
 * Exp(z) on [-0.5,0.5]. We use extended precision arithmetic to calculate the fraction
 * remaining after 2^y, which avoids the precision loss.
 * The final result is correct to within 1 LSB over the entire argument range.
 */
static gmx_inline __m128
gmx_mm_exp_ps(__m128 x)
{
    const __m128  argscale     = _mm_set1_ps(1.44269504088896341f);
    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m128  arglimit     = _mm_set1_ps(126.0f);
    const __m128i expbase      = _mm_set1_epi32(127);

    const __m128  invargscale0 = _mm_set1_ps(0.693359375f);
    const __m128  invargscale1 = _mm_set1_ps(-2.12194440e-4f);

    const __m128  CC5          = _mm_set1_ps(1.9875691500e-4f);
    const __m128  CC4          = _mm_set1_ps(1.3981999507e-3f);
    const __m128  CC3          = _mm_set1_ps(8.3334519073e-3f);
    const __m128  CC2          = _mm_set1_ps(4.1665795894e-2f);
    const __m128  CC1          = _mm_set1_ps(1.6666665459e-1f);
    const __m128  CC0          = _mm_set1_ps(5.0000001201e-1f);
    const __m128  one          = _mm_set1_ps(1.0f);

    __m128  y, x2;
    __m128  p0, p1;
    __m128  valuemask;
    __m128i iexppart;
    __m128  fexppart;
    __m128  intpart;
    y         = _mm_mul_ps(x, argscale);

    iexppart  = _mm_cvtps_epi32(y);
    intpart   = _mm_cvtepi32_ps(iexppart);

    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
    valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(y));
    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));

    /* Extended precision arithmetic */
    x         = _mm_sub_ps(x, _mm_mul_ps(invargscale0, intpart));
    x         = _mm_sub_ps(x, _mm_mul_ps(invargscale1, intpart));

    x2        = _mm_mul_ps(x, x);

    p1        = _mm_mul_ps(CC5, x2);
    p0        = _mm_mul_ps(CC4, x2);
    p1        = _mm_add_ps(p1, CC3);
    p0        = _mm_add_ps(p0, CC2);
    p1        = _mm_mul_ps(p1, x2);
    p0        = _mm_mul_ps(p0, x2);
    p1        = _mm_add_ps(p1, CC1);
    p0        = _mm_add_ps(p0, CC0);
    p1        = _mm_mul_ps(p1, x);
    p0        = _mm_add_ps(p0, p1);
    p0        = _mm_mul_ps(p0, x2);
    x         = _mm_add_ps(x, one);
    x         = _mm_add_ps(x, p0);
    x         = _mm_mul_ps(x, fexppart);

    return x;
}
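/* A minimal usage sketch (hypothetical helper, not part of the original API):
 * exp() of four packed arguments. Internally the argument is reduced as
 * exp(x) = 2^n * exp(x - n*ln(2)), so results should match expf() to within
 * 1 LSB over the supported range.
 */
static gmx_inline void
gmx_mm_exp_ps_example(const float in[4], float out[4])
{
    _mm_storeu_ps(out, gmx_mm_exp_ps(_mm_loadu_ps(in)));
}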
/* FULL precision. Only errors in LSB */
static gmx_inline __m128
gmx_mm_erf_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
    const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
    const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
    const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
    const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
    const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
    const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
    const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
    const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
    const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
    const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
    const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
    const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
    const __m128 CC9  = _mm_set1_ps(0.21376355144663348f);
    const __m128 CC8  = _mm_set1_ps(-0.3473187200259257f);
    const __m128 CC7  = _mm_set1_ps(0.016690861551248114f);
    const __m128 CC6  = _mm_set1_ps(0.7560973182491192f);
    const __m128 CC5  = _mm_set1_ps(-1.2137903600145787f);
    const __m128 CC4  = _mm_set1_ps(0.8411872321232948f);
    const __m128 CC3  = _mm_set1_ps(-0.08670413896296343f);
    const __m128 CC2  = _mm_set1_ps(-0.27124782687240334f);
    const __m128 CC1  = _mm_set1_ps(-0.0007502488047806069f);
    const __m128 CC0  = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
    const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
    const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);

    const __m128 sieve   = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128 one     = _mm_set1_ps(1.0f);
    const __m128 two     = _mm_set1_ps(2.0f);

    __m128 x2, x4, y;
    __m128 z, q, t, t2, w, w2;
    __m128 pA0, pA1, pB0, pB1, pC0, pC1;
    __m128 expmx2, corr;
    __m128 res_erf, res_erfc, res;
    __m128 mask;
    /* Calculate erf() */
    x2 = _mm_mul_ps(x, x);
    x4 = _mm_mul_ps(x2, x2);

    pA0 = _mm_mul_ps(CA6, x4);
    pA1 = _mm_mul_ps(CA5, x4);
    pA0 = _mm_add_ps(pA0, CA4);
    pA1 = _mm_add_ps(pA1, CA3);
    pA0 = _mm_mul_ps(pA0, x4);
    pA1 = _mm_mul_ps(pA1, x4);
    pA0 = _mm_add_ps(pA0, CA2);
    pA1 = _mm_add_ps(pA1, CA1);
    pA0 = _mm_mul_ps(pA0, x4);
    pA1 = _mm_mul_ps(pA1, x2);
    pA0 = _mm_add_ps(pA0, pA1);
    pA0 = _mm_add_ps(pA0, CA0);

    res_erf = _mm_mul_ps(x, pA0);
    /* Calculate erfc */
    y  = gmx_mm_abs_ps(x);
    t  = gmx_mm_inv_ps(y);
    w  = _mm_sub_ps(t, one);
    t2 = _mm_mul_ps(t, t);
    w2 = _mm_mul_ps(w, w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */
    z      = _mm_and_ps(y, sieve);
    q      = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );

    corr   = _mm_mul_ps(CD4, q);
    corr   = _mm_add_ps(corr, CD3);
    corr   = _mm_mul_ps(corr, q);
    corr   = _mm_add_ps(corr, CD2);
    corr   = _mm_mul_ps(corr, q);
    corr   = _mm_add_ps(corr, one);
    corr   = _mm_mul_ps(corr, q);
    corr   = _mm_add_ps(corr, one);

    expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
    expmx2 = _mm_mul_ps(expmx2, corr);
    pB1 = _mm_mul_ps(CB9, w2);
    pB0 = _mm_mul_ps(CB8, w2);
    pB1 = _mm_add_ps(pB1, CB7);
    pB0 = _mm_add_ps(pB0, CB6);
    pB1 = _mm_mul_ps(pB1, w2);
    pB0 = _mm_mul_ps(pB0, w2);
    pB1 = _mm_add_ps(pB1, CB5);
    pB0 = _mm_add_ps(pB0, CB4);
    pB1 = _mm_mul_ps(pB1, w2);
    pB0 = _mm_mul_ps(pB0, w2);
    pB1 = _mm_add_ps(pB1, CB3);
    pB0 = _mm_add_ps(pB0, CB2);
    pB1 = _mm_mul_ps(pB1, w2);
    pB0 = _mm_mul_ps(pB0, w2);
    pB1 = _mm_add_ps(pB1, CB1);
    pB1 = _mm_mul_ps(pB1, w);
    pB0 = _mm_add_ps(pB0, pB1);
    pB0 = _mm_add_ps(pB0, CB0);

    pC0 = _mm_mul_ps(CC10, t2);
    pC1 = _mm_mul_ps(CC9, t2);
    pC0 = _mm_add_ps(pC0, CC8);
    pC1 = _mm_add_ps(pC1, CC7);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t2);
    pC0 = _mm_add_ps(pC0, CC6);
    pC1 = _mm_add_ps(pC1, CC5);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t2);
    pC0 = _mm_add_ps(pC0, CC4);
    pC1 = _mm_add_ps(pC1, CC3);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t2);
    pC0 = _mm_add_ps(pC0, CC2);
    pC1 = _mm_add_ps(pC1, CC1);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t);
    pC0 = _mm_add_ps(pC0, pC1);
    pC0 = _mm_add_ps(pC0, CC0);
    pC0 = _mm_mul_ps(pC0, t);
    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmplt_ps(two, y);
    res_erfc = _mm_or_ps(_mm_andnot_ps(mask, pB0), _mm_and_ps(mask, pC0));
    res_erfc = _mm_mul_ps(res_erfc, expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
    res_erfc = _mm_or_ps(_mm_andnot_ps(mask, res_erfc),
                         _mm_and_ps(mask, _mm_sub_ps(two, res_erfc)));

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
    res  = _mm_or_ps(_mm_andnot_ps(mask, _mm_sub_ps(one, res_erfc)), _mm_and_ps(mask, res_erf));

    return res;
}
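/* A scalar sketch of the sieve split used above (illustrative only; the helper
 * name and the use of expf() are assumptions, not part of this file). Masking
 * the mantissa with 0xfffff000 gives z holding the top bits of y, so
 * q = (z-y)*(z+y) = z^2 - y^2 is tiny and exp(-y^2) = exp(-z^2)*exp(q) can be
 * assembled without losing the low bits of y^2.
 */
static gmx_inline float
gmx_mm_erf_sieve_example(float y)
{
    union { float f; unsigned int i; } conv;
    float z, q;

    conv.f  = y;
    conv.i &= 0xfffff000u;          /* z: y with the low 12 mantissa bits cleared */
    z       = conv.f;
    q       = (z - y) * (z + y);    /* exactly z^2 - y^2, small by construction   */

    return expf(-z * z) * expf(q);  /* ~= expf(-y*y), but to full precision       */
}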
/* FULL precision. Only errors in LSB */
static gmx_inline __m128
gmx_mm_erfc_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
    const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
    const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
    const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
    const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
    const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
    const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
    const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
    const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
    const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
    const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
    const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
    const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
    const __m128 CC9  = _mm_set1_ps(0.21376355144663348f);
    const __m128 CC8  = _mm_set1_ps(-0.3473187200259257f);
    const __m128 CC7  = _mm_set1_ps(0.016690861551248114f);
    const __m128 CC6  = _mm_set1_ps(0.7560973182491192f);
    const __m128 CC5  = _mm_set1_ps(-1.2137903600145787f);
    const __m128 CC4  = _mm_set1_ps(0.8411872321232948f);
    const __m128 CC3  = _mm_set1_ps(-0.08670413896296343f);
    const __m128 CC2  = _mm_set1_ps(-0.27124782687240334f);
    const __m128 CC1  = _mm_set1_ps(-0.0007502488047806069f);
    const __m128 CC0  = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
    const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
    const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);

    const __m128 sieve   = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128 one     = _mm_set1_ps(1.0f);
    const __m128 two     = _mm_set1_ps(2.0f);

    __m128 x2, x4, y;
    __m128 z, q, t, t2, w, w2;
    __m128 pA0, pA1, pB0, pB1, pC0, pC1;
    __m128 expmx2, corr;
    __m128 res_erf, res_erfc, res;
    __m128 mask;
    /* Calculate erf() */
    x2 = _mm_mul_ps(x, x);
    x4 = _mm_mul_ps(x2, x2);

    pA0 = _mm_mul_ps(CA6, x4);
    pA1 = _mm_mul_ps(CA5, x4);
    pA0 = _mm_add_ps(pA0, CA4);
    pA1 = _mm_add_ps(pA1, CA3);
    pA0 = _mm_mul_ps(pA0, x4);
    pA1 = _mm_mul_ps(pA1, x4);
    pA0 = _mm_add_ps(pA0, CA2);
    pA1 = _mm_add_ps(pA1, CA1);
    pA0 = _mm_mul_ps(pA0, x4);
    pA1 = _mm_mul_ps(pA1, x2);
    pA0 = _mm_add_ps(pA0, pA1);
    pA0 = _mm_add_ps(pA0, CA0);

    res_erf = _mm_mul_ps(x, pA0);
    /* Calculate erfc */
    y  = gmx_mm_abs_ps(x);
    t  = gmx_mm_inv_ps(y);
    w  = _mm_sub_ps(t, one);
    t2 = _mm_mul_ps(t, t);
    w2 = _mm_mul_ps(w, w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */
    z      = _mm_and_ps(y, sieve);
    q      = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );

    corr   = _mm_mul_ps(CD4, q);
    corr   = _mm_add_ps(corr, CD3);
    corr   = _mm_mul_ps(corr, q);
    corr   = _mm_add_ps(corr, CD2);
    corr   = _mm_mul_ps(corr, q);
    corr   = _mm_add_ps(corr, one);
    corr   = _mm_mul_ps(corr, q);
    corr   = _mm_add_ps(corr, one);

    expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
    expmx2 = _mm_mul_ps(expmx2, corr);
    pB1 = _mm_mul_ps(CB9, w2);
    pB0 = _mm_mul_ps(CB8, w2);
    pB1 = _mm_add_ps(pB1, CB7);
    pB0 = _mm_add_ps(pB0, CB6);
    pB1 = _mm_mul_ps(pB1, w2);
    pB0 = _mm_mul_ps(pB0, w2);
    pB1 = _mm_add_ps(pB1, CB5);
    pB0 = _mm_add_ps(pB0, CB4);
    pB1 = _mm_mul_ps(pB1, w2);
    pB0 = _mm_mul_ps(pB0, w2);
    pB1 = _mm_add_ps(pB1, CB3);
    pB0 = _mm_add_ps(pB0, CB2);
    pB1 = _mm_mul_ps(pB1, w2);
    pB0 = _mm_mul_ps(pB0, w2);
    pB1 = _mm_add_ps(pB1, CB1);
    pB1 = _mm_mul_ps(pB1, w);
    pB0 = _mm_add_ps(pB0, pB1);
    pB0 = _mm_add_ps(pB0, CB0);

    pC0 = _mm_mul_ps(CC10, t2);
    pC1 = _mm_mul_ps(CC9, t2);
    pC0 = _mm_add_ps(pC0, CC8);
    pC1 = _mm_add_ps(pC1, CC7);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t2);
    pC0 = _mm_add_ps(pC0, CC6);
    pC1 = _mm_add_ps(pC1, CC5);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t2);
    pC0 = _mm_add_ps(pC0, CC4);
    pC1 = _mm_add_ps(pC1, CC3);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t2);
    pC0 = _mm_add_ps(pC0, CC2);
    pC1 = _mm_add_ps(pC1, CC1);
    pC0 = _mm_mul_ps(pC0, t2);
    pC1 = _mm_mul_ps(pC1, t);
    pC0 = _mm_add_ps(pC0, pC1);
    pC0 = _mm_add_ps(pC0, CC0);
    pC0 = _mm_mul_ps(pC0, t);
    /* SELECT pB0 or pC0 for erfc() */
    mask     = _mm_cmplt_ps(two, y);
    res_erfc = _mm_or_ps(_mm_andnot_ps(mask, pB0), _mm_and_ps(mask, pC0));
    res_erfc = _mm_mul_ps(res_erfc, expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
    res_erfc = _mm_or_ps(_mm_andnot_ps(mask, res_erfc), _mm_and_ps(mask, _mm_sub_ps(two, res_erfc)));

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
    res  = _mm_or_ps(_mm_andnot_ps(mask, res_erfc), _mm_and_ps(mask, _mm_sub_ps(one, res_erf)));

    return res;
}
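/* Note on the duplication above: erfc(x) gets its own full-precision routine
 * rather than being computed as 1-gmx_mm_erf_ps(x), because for large |x| the
 * subtraction 1-erf(x) cancels catastrophically (erf(x) rounds to 1 while
 * erfc(x) is tiny).
 */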
/* Calculate the force correction due to PME analytically.
 *
 * This routine is meant to enable analytical evaluation of the
 * direct-space PME electrostatic force to avoid tables.
 *
 * The direct-space potential should be Erfc(beta*r)/r, but there
 * are some problems evaluating that:
 *
 * First, the error function is difficult (read: expensive) to
 * approximate accurately for intermediate to large arguments, and
 * this happens already in ranges of beta*r that occur in simulations.
 * Second, we now try to avoid calculating potentials in Gromacs but
 * use forces directly.
 *
 * We can simplify things slightly by noting that the PME part is really
 * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
 *
 * V = 1/r - Erf(beta*r)/r
 *
 * The first term we already have from the inverse square root, so
 * that we can leave out of this routine.
 *
 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
 * the argument beta*r will be in the range 0.15 to ~4. Use your
 * favorite plotting program to realize how well-behaved Erf(z)/z is
 * in this range!
 *
 * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
 * However, it turns out it is more efficient to approximate f(z)/z and
 * then only use even powers. This is another minor optimization, since
 * we actually WANT f(z)/z, because it is going to be multiplied by
 * the vector between the two atoms to get the vectorial force. The
 * fastest flops are the ones we can avoid calculating!
 *
 * So, here's how it should be used (see also the usage sketch after
 * this routine):
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *       2*exp(-z^2)     erf(z)
 *       ------------ - --------
 *       sqrt(Pi)*z^2      z^3
 *
 * 5. Multiply the entire expression by beta^3. This will get you
 *
 *       beta^3*2*exp(-z^2)     beta^3*erf(z)
 *       ------------------  -  -------------
 *          sqrt(Pi)*z^2             z^3
 *
 *    or, switching back to r (z=r*beta):
 *
 *       2*beta*exp(-r^2*beta^2)     erf(r*beta)
 *       -----------------------  -  -----------
 *            sqrt(Pi)*r^2               r^3
 *
 *    With a bit of math exercise you should be able to confirm that
 *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
 *
 * 6. Add the result to 1/r^3, multiply by the product of the charges,
 *    and you have your force (divided by r). A final multiplication
 *    with the vector connecting the two particles and you have your
 *    vectorial force to add to the particles.
 */
static gmx_inline __m128
gmx_mm_pmecorrF_ps(__m128 z2)
{
    const __m128 FN6 = _mm_set1_ps(-1.7357322914161492954e-8f);
    const __m128 FN5 = _mm_set1_ps(1.4703624142580877519e-6f);
    const __m128 FN4 = _mm_set1_ps(-0.000053401640219807709149f);
    const __m128 FN3 = _mm_set1_ps(0.0010054721316683106153f);
    const __m128 FN2 = _mm_set1_ps(-0.019278317264888380590f);
    const __m128 FN1 = _mm_set1_ps(0.069670166153766424023f);
    const __m128 FN0 = _mm_set1_ps(-0.75225204789749321333f);

    const __m128 FD4 = _mm_set1_ps(0.0011193462567257629232f);
    const __m128 FD3 = _mm_set1_ps(0.014866955030185295499f);
    const __m128 FD2 = _mm_set1_ps(0.11583842382862377919f);
    const __m128 FD1 = _mm_set1_ps(0.50736591960530292870f);
    const __m128 FD0 = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyFN0, polyFN1, polyFD0, polyFD1;
    z4      = _mm_mul_ps(z2, z2);

    polyFD0 = _mm_mul_ps(FD4, z4);
    polyFD1 = _mm_mul_ps(FD3, z4);
    polyFD0 = _mm_add_ps(polyFD0, FD2);
    polyFD1 = _mm_add_ps(polyFD1, FD1);
    polyFD0 = _mm_mul_ps(polyFD0, z4);
    polyFD1 = _mm_mul_ps(polyFD1, z2);
    polyFD0 = _mm_add_ps(polyFD0, FD0);
    polyFD0 = _mm_add_ps(polyFD0, polyFD1);

    polyFD0 = gmx_mm_inv_ps(polyFD0);

    polyFN0 = _mm_mul_ps(FN6, z4);
    polyFN1 = _mm_mul_ps(FN5, z4);
    polyFN0 = _mm_add_ps(polyFN0, FN4);
    polyFN1 = _mm_add_ps(polyFN1, FN3);
    polyFN0 = _mm_mul_ps(polyFN0, z4);
    polyFN1 = _mm_mul_ps(polyFN1, z4);
    polyFN0 = _mm_add_ps(polyFN0, FN2);
    polyFN1 = _mm_add_ps(polyFN1, FN1);
    polyFN0 = _mm_mul_ps(polyFN0, z4);
    polyFN1 = _mm_mul_ps(polyFN1, z2);
    polyFN0 = _mm_add_ps(polyFN0, FN0);
    polyFN0 = _mm_add_ps(polyFN0, polyFN1);
    return _mm_mul_ps(polyFN0, polyFD0);
}
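/* A minimal usage sketch following steps 1-6 above (the helper name and the
 * variable names rsq/qq/beta are illustrative assumptions, not part of the
 * original kernels). Returns the scalar force F/r for one SIMD width of pairs.
 */
static gmx_inline __m128
gmx_mm_pme_force_example(__m128 rsq, __m128 qq, __m128 beta)
{
    __m128 beta2 = _mm_mul_ps(beta, beta);
    __m128 beta3 = _mm_mul_ps(beta, beta2);
    __m128 rinv  = gmx_mm_invsqrt_ps(rsq);                    /* 1/r        */
    __m128 rinv3 = _mm_mul_ps(rinv, _mm_mul_ps(rinv, rinv)); /* 1/r^3      */
    __m128 z2    = _mm_mul_ps(beta2, rsq);                    /* (beta*r)^2 */
    __m128 corr  = gmx_mm_pmecorrF_ps(z2);                    /* step 4     */

    /* Steps 5-6: F/r = qq * ( 1/r^3 + beta^3 * correction ) */
    return _mm_mul_ps(qq, _mm_add_ps(rinv3, _mm_mul_ps(beta3, corr)));
}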
/* Calculate the potential correction due to PME analytically.
 *
 * See gmx_mm_pmecorrF_ps() for details about the approximation.
 *
 * This routine calculates Erf(z)/z, although you should provide z^2
 * as the input argument.
 *
 * Here's how it should be used (see also the usage sketch after this
 * routine):
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *       erf(z)
 *       ------
 *          z
 *
 * 5. Multiply the entire expression by beta and switch back to r (z=r*beta):
 *
 *       erf(r*beta)
 *       -----------
 *            r
 *
 * 6. Subtract the result from 1/r, multiply by the product of the charges,
 *    and you have your potential.
 */
static gmx_inline __m128
gmx_mm_pmecorrV_ps(__m128 z2)
{
    const __m128 VN6 = _mm_set1_ps(1.9296833005951166339e-8f);
    const __m128 VN5 = _mm_set1_ps(-1.4213390571557850962e-6f);
    const __m128 VN4 = _mm_set1_ps(0.000041603292906656984871f);
    const __m128 VN3 = _mm_set1_ps(-0.00013134036773265025626f);
    const __m128 VN2 = _mm_set1_ps(0.038657983986041781264f);
    const __m128 VN1 = _mm_set1_ps(0.11285044772717598220f);
    const __m128 VN0 = _mm_set1_ps(1.1283802385263030286f);

    const __m128 VD3 = _mm_set1_ps(0.0066752224023576045451f);
    const __m128 VD2 = _mm_set1_ps(0.078647795836373922256f);
    const __m128 VD1 = _mm_set1_ps(0.43336185284710920150f);
    const __m128 VD0 = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyVN0, polyVN1, polyVD0, polyVD1;
    z4      = _mm_mul_ps(z2, z2);

    polyVD1 = _mm_mul_ps(VD3, z4);
    polyVD0 = _mm_mul_ps(VD2, z4);
    polyVD1 = _mm_add_ps(polyVD1, VD1);
    polyVD0 = _mm_add_ps(polyVD0, VD0);
    polyVD1 = _mm_mul_ps(polyVD1, z2);
    polyVD0 = _mm_add_ps(polyVD0, polyVD1);

    polyVD0 = gmx_mm_inv_ps(polyVD0);

    polyVN0 = _mm_mul_ps(VN6, z4);
    polyVN1 = _mm_mul_ps(VN5, z4);
    polyVN0 = _mm_add_ps(polyVN0, VN4);
    polyVN1 = _mm_add_ps(polyVN1, VN3);
    polyVN0 = _mm_mul_ps(polyVN0, z4);
    polyVN1 = _mm_mul_ps(polyVN1, z4);
    polyVN0 = _mm_add_ps(polyVN0, VN2);
    polyVN1 = _mm_add_ps(polyVN1, VN1);
    polyVN0 = _mm_mul_ps(polyVN0, z4);
    polyVN1 = _mm_mul_ps(polyVN1, z2);
    polyVN0 = _mm_add_ps(polyVN0, VN0);
    polyVN0 = _mm_add_ps(polyVN0, polyVN1);
    return _mm_mul_ps(polyVN0, polyVD0);
}
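/* A minimal usage sketch following the steps above (helper and variable names
 * are illustrative assumptions). Returns the per-pair potential
 * qq*erfc(beta*r)/r.
 */
static gmx_inline __m128
gmx_mm_pme_potential_example(__m128 rsq, __m128 qq, __m128 beta)
{
    __m128 rinv = gmx_mm_invsqrt_ps(rsq);                    /* 1/r            */
    __m128 z2   = _mm_mul_ps(_mm_mul_ps(beta, beta), rsq);   /* (beta*r)^2     */
    __m128 corr = _mm_mul_ps(beta, gmx_mm_pmecorrV_ps(z2));  /* erf(beta*r)/r  */

    /* Step 6: V = qq * ( 1/r - erf(beta*r)/r ) = qq*erfc(beta*r)/r */
    return _mm_mul_ps(qq, _mm_sub_ps(rinv, corr));
}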
static gmx_inline void
gmx_mm_sincos_ps(__m128 x,
                 __m128 *sinval,
                 __m128 *cosval)
{
    const __m128 two_over_pi = _mm_set1_ps(2.0/M_PI);
    const __m128 half        = _mm_set1_ps(0.5);
    const __m128 one         = _mm_set1_ps(1.0);

    const __m128i izero      = _mm_set1_epi32(0);
    const __m128i ione       = _mm_set1_epi32(1);
    const __m128i itwo       = _mm_set1_epi32(2);
    const __m128i ithree     = _mm_set1_epi32(3);
    const __m128  signbit    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );

    const __m128 CA1 = _mm_set1_ps(1.5703125f);
    const __m128 CA2 = _mm_set1_ps(4.837512969970703125e-4f);
    const __m128 CA3 = _mm_set1_ps(7.54978995489188216e-8f);

    const __m128 CC0 = _mm_set1_ps(-0.0013602249f);
    const __m128 CC1 = _mm_set1_ps(0.0416566950f);
    const __m128 CC2 = _mm_set1_ps(-0.4999990225f);
    const __m128 CS0 = _mm_set1_ps(-0.0001950727f);
    const __m128 CS1 = _mm_set1_ps(0.0083320758f);
    const __m128 CS2 = _mm_set1_ps(-0.1666665247f);

    __m128  y, y2;
    __m128  z;
    __m128i iz;
    __m128i offset_sin, offset_cos;
    __m128  tmp1, tmp2;
    __m128  mask_sin, mask_cos;
    __m128  tmp_sin, tmp_cos;
    y = _mm_mul_ps(x, two_over_pi);
    y = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));

    iz = _mm_cvttps_epi32(y);
    z  = _mm_cvtepi32_ps(iz);

    offset_sin = _mm_and_si128(iz, ithree);
    offset_cos = _mm_add_epi32(iz, ione);

    /* Extended precision arithmetic to achieve full precision */
    y    = _mm_mul_ps(z, CA1);
    tmp1 = _mm_mul_ps(z, CA2);
    tmp2 = _mm_mul_ps(z, CA3);
    y    = _mm_sub_ps(x, y);
    y    = _mm_sub_ps(y, tmp1);
    y    = _mm_sub_ps(y, tmp2);

    y2 = _mm_mul_ps(y, y);

    tmp1 = _mm_mul_ps(CC0, y2);
    tmp1 = _mm_add_ps(tmp1, CC1);
    tmp2 = _mm_mul_ps(CS0, y2);
    tmp2 = _mm_add_ps(tmp2, CS1);
    tmp1 = _mm_mul_ps(tmp1, y2);
    tmp1 = _mm_add_ps(tmp1, CC2);
    tmp2 = _mm_mul_ps(tmp2, y2);
    tmp2 = _mm_add_ps(tmp2, CS2);

    tmp1 = _mm_mul_ps(tmp1, y2);
    tmp1 = _mm_add_ps(tmp1, one);

    tmp2 = _mm_mul_ps(tmp2, _mm_mul_ps(y, y2));
    tmp2 = _mm_add_ps(tmp2, y);

    mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
    mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));

    tmp_sin = _mm_or_ps( _mm_andnot_ps(mask_sin, tmp1), _mm_and_ps(mask_sin, tmp2) );
    tmp_cos = _mm_or_ps( _mm_andnot_ps(mask_cos, tmp1), _mm_and_ps(mask_cos, tmp2) );

    mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
    mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));

    tmp1 = _mm_xor_ps(signbit, tmp_sin);
    tmp2 = _mm_xor_ps(signbit, tmp_cos);
    *sinval = _mm_or_ps( _mm_andnot_ps(mask_sin, tmp1), _mm_and_ps(mask_sin, tmp_sin) );
    *cosval = _mm_or_ps( _mm_andnot_ps(mask_cos, tmp2), _mm_and_ps(mask_cos, tmp_cos) );
}
/* sin(x).
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static gmx_inline __m128
gmx_mm_sin_ps(__m128 x)
{
    __m128 s, c;

    gmx_mm_sincos_ps(x, &s, &c);

    return s;
}
/* cos(x).
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static gmx_inline __m128
gmx_mm_cos_ps(__m128 x)
{
    __m128 s, c;

    gmx_mm_sincos_ps(x, &s, &c);

    return c;
}
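/* A minimal usage sketch (hypothetical helper): when both the sine and cosine
 * of the same angles are needed, make one sincos() call instead of calling
 * gmx_mm_sin_ps() and gmx_mm_cos_ps() separately, per the warnings above.
 */
static gmx_inline void
gmx_mm_sincos_usage_example(const float angles[4], float s[4], float c[4])
{
    __m128 vsin, vcos;

    gmx_mm_sincos_ps(_mm_loadu_ps(angles), &vsin, &vcos);
    _mm_storeu_ps(s, vsin);
    _mm_storeu_ps(c, vcos);
}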
static gmx_inline __m128
gmx_mm_tan_ps(__m128 x)
{
    __m128 sinval, cosval;
    __m128 tanval;

    gmx_mm_sincos_ps(x, &sinval, &cosval);

    tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval));

    return tanval;
}
static gmx_inline __m128
gmx_mm_asin_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limitlow = _mm_set1_ps(1e-4f);
    const __m128 half     = _mm_set1_ps(0.5f);
    const __m128 one      = _mm_set1_ps(1.0f);
    const __m128 halfpi   = _mm_set1_ps(M_PI/2.0f);

    const __m128 CC5 = _mm_set1_ps(4.2163199048E-2f);
    const __m128 CC4 = _mm_set1_ps(2.4181311049E-2f);
    const __m128 CC3 = _mm_set1_ps(4.5470025998E-2f);
    const __m128 CC2 = _mm_set1_ps(7.4953002686E-2f);
    const __m128 CC1 = _mm_set1_ps(1.6666752422E-1f);

    __m128 sign;
    __m128 mask;
    __m128 xabs;
    __m128 z, z1, z2, q, q1, q2;
    __m128 pA, pB;
    sign = _mm_andnot_ps(signmask, x);
    xabs = _mm_and_ps(x, signmask);

    mask = _mm_cmpgt_ps(xabs, half);

    z1 = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
    q1 = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
    q1 = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one), q1);
    q2 = xabs;
    z2 = _mm_mul_ps(q2, q2);
    z = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
    q = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );

    z2 = _mm_mul_ps(z, z);

    pA = _mm_mul_ps(CC5, z2);
    pB = _mm_mul_ps(CC4, z2);

    pA = _mm_add_ps(pA, CC3);
    pB = _mm_add_ps(pB, CC2);

    pA = _mm_mul_ps(pA, z2);
    pB = _mm_mul_ps(pB, z2);

    pA = _mm_add_ps(pA, CC1);
    pA = _mm_mul_ps(pA, z);

    z = _mm_add_ps(pA, pB);
    z = _mm_mul_ps(z, q);
    z = _mm_add_ps(z, q);

    q2 = _mm_sub_ps(halfpi, z);
    q2 = _mm_sub_ps(q2, z);

    z = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );

    mask = _mm_cmpgt_ps(xabs, limitlow);
    z    = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );
    z = _mm_xor_ps(z, sign);

    return z;
}
static gmx_inline __m128
gmx_mm_acos_ps(__m128 x)
{
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 one_ps    = _mm_set1_ps(1.0f);
    const __m128 half_ps   = _mm_set1_ps(0.5f);
    const __m128 pi_ps     = _mm_set1_ps(M_PI);
    const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);

    __m128 mask1;
    __m128 mask2;
    __m128 xabs;
    __m128 z, z1, z2, z3;
    xabs  = _mm_and_ps(x, signmask);
    mask1 = _mm_cmpgt_ps(xabs, half_ps);
    mask2 = _mm_cmpgt_ps(x, _mm_setzero_ps());

    z = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
    z = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
    z = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one_ps), z);

    z = _mm_or_ps( _mm_and_ps(mask1, z), _mm_andnot_ps(mask1, x) );
    z = gmx_mm_asin_ps(z);

    z2 = _mm_add_ps(z, z);
    z1 = _mm_sub_ps(pi_ps, z2);
    z3 = _mm_sub_ps(halfpi_ps, z);

    z = _mm_or_ps( _mm_and_ps(mask2, z2), _mm_andnot_ps(mask2, z1) );
    z = _mm_or_ps( _mm_and_ps(mask1, z), _mm_andnot_ps(mask1, z3) );

    return z;
}
static gmx_inline __m128
gmx_mm_atan_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
    const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
    const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
    const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
    const __m128 mone      = _mm_set1_ps(-1.0f);
    const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
    const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
    const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1f);
    const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);

    __m128 sign;
    __m128 mask1, mask2;
    __m128 y, z1, z2;
    __m128 x2, x4;
    __m128 sum1, sum2;
    sign = _mm_andnot_ps(signmask, x);
    x    = _mm_and_ps(x, signmask);

    mask1 = _mm_cmpgt_ps(x, limit1);
    mask2 = _mm_cmpgt_ps(x, limit2);

    z1 = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
    z2 = _mm_mul_ps(mone, gmx_mm_inv_ps(x));

    y = _mm_and_ps(mask1, quarterpi);
    y = _mm_or_ps( _mm_and_ps(mask2, halfpi), _mm_andnot_ps(mask2, y) );

    x = _mm_or_ps( _mm_and_ps(mask1, z1), _mm_andnot_ps(mask1, x) );
    x = _mm_or_ps( _mm_and_ps(mask2, z2), _mm_andnot_ps(mask2, x) );

    x2 = _mm_mul_ps(x, x);
    x4 = _mm_mul_ps(x2, x2);

    sum1 = _mm_mul_ps(CC9, x4);
    sum2 = _mm_mul_ps(CC7, x4);
    sum1 = _mm_add_ps(sum1, CC5);
    sum2 = _mm_add_ps(sum2, CC3);
    sum1 = _mm_mul_ps(sum1, x4);
    sum2 = _mm_mul_ps(sum2, x2);

    sum1 = _mm_add_ps(sum1, sum2);
    sum1 = _mm_sub_ps(sum1, mone);
    sum1 = _mm_mul_ps(sum1, x);
    y    = _mm_add_ps(y, sum1);
    y = _mm_xor_ps(y, sign);

    return y;
}
static gmx_inline __m128
gmx_mm_atan2_ps(__m128 y, __m128 x)
{
    const __m128 pi          = _mm_set1_ps(M_PI);
    const __m128 minuspi     = _mm_set1_ps(-M_PI);
    const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
    const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);

    __m128 z, z1, z3, z4;
    __m128 w;
    __m128 maskx_lt, maskx_eq;
    __m128 masky_lt, masky_eq;
    __m128 mask1, mask2, mask3, mask4, maskall;
    maskx_lt = _mm_cmplt_ps(x, _mm_setzero_ps());
    masky_lt = _mm_cmplt_ps(y, _mm_setzero_ps());
    maskx_eq = _mm_cmpeq_ps(x, _mm_setzero_ps());
    masky_eq = _mm_cmpeq_ps(y, _mm_setzero_ps());

    z = _mm_mul_ps(y, gmx_mm_inv_ps(x));
    z = gmx_mm_atan_ps(z);

    mask1 = _mm_and_ps(maskx_eq, masky_lt);
    mask2 = _mm_andnot_ps(maskx_lt, masky_eq);
    mask3 = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
    mask4 = _mm_and_ps(masky_eq, maskx_lt);

    maskall = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );

    z  = _mm_andnot_ps(maskall, z);
    z1 = _mm_and_ps(mask1, minushalfpi);
    z3 = _mm_and_ps(mask3, halfpi);
    z4 = _mm_and_ps(mask4, pi);

    z = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );

    mask1 = _mm_andnot_ps(masky_lt, maskx_lt);
    mask2 = _mm_and_ps(maskx_lt, masky_lt);

    w = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
    w = _mm_andnot_ps(maskall, w);
    z = _mm_add_ps(z, w);

    return z;
}
#endif /* _gmx_math_x86_sse2_single_h_ */