1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  * 
4  * This file is part of GROMACS.
5  * Copyright (c) 2012-  
6  *
7  * Written by the Gromacs development team under coordination of
8  * David van der Spoel, Berk Hess, and Erik Lindahl.
9  *
10  * This library is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public License
12  * as published by the Free Software Foundation; either version 2
13  * of the License, or (at your option) any later version.
14  *
15  * To help us fund GROMACS development, we humbly ask that you cite
16  * the research papers on the package. Check out http://www.gromacs.org
17  * 
18  * And Hey:
19  * Gnomes, ROck Monsters And Chili Sauce
20  */
21 #ifndef _gmx_math_x86_avx_128_fma_single_h_
22 #define _gmx_math_x86_avx_128_fma_single_h_
23
24 #include <immintrin.h> /* AVX */
25 #ifdef HAVE_X86INTRIN_H
26 #include <x86intrin.h> /* FMA */
27 #endif
28
29 #include <math.h>
30
31 #include "gmx_x86_avx_128_fma.h"
32
33
34 #ifndef M_PI
35 #  define M_PI 3.14159265358979323846264338327950288
36 #endif
37
38
39
40
41 /************************
42  *                      *
43  * Simple math routines *
44  *                      *
45  ************************/
46
47 /* 1.0/sqrt(x) */
48 static gmx_inline __m128
49 gmx_mm_invsqrt_ps(__m128 x)
50 {
51     const __m128 half  = _mm_set_ps(0.5,0.5,0.5,0.5);
52     const __m128 three = _mm_set_ps(3.0,3.0,3.0,3.0);
53
54     __m128 lu = _mm_rsqrt_ps(x);
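    /* One Newton-Raphson iteration, 0.5*lu*(3 - x*lu*lu), roughly doubles the accuracy of the hardware rsqrt estimate */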
55
56     return _mm_mul_ps(_mm_mul_ps(half,lu),_mm_nmacc_ps(_mm_mul_ps(lu,lu),x,three));
57 }
58
59 /* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
60 static gmx_inline __m128
61 gmx_mm_sqrt_ps(__m128 x)
62 {
63     __m128 mask;
64     __m128 res;
65
66     mask = _mm_cmp_ps(x,_mm_setzero_ps(),_CMP_EQ_OQ);
67     res  = _mm_andnot_ps(mask,gmx_mm_invsqrt_ps(x));
68
69     res  = _mm_mul_ps(x,res);
70
71     return res;
72 }
73
74 /* 1.0/x */
75 static gmx_inline __m128
76 gmx_mm_inv_ps(__m128 x)
77 {
78     const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);
79
80     __m128 lu = _mm_rcp_ps(x);
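    /* One Newton-Raphson iteration, lu*(2 - x*lu), refines the hardware rcp estimate */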
81
82     return _mm_mul_ps(lu,_mm_nmacc_ps(lu,x,two));
83 }
84
85 static gmx_inline __m128
86 gmx_mm_abs_ps(__m128 x)
87 {
88     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
89
90     return _mm_and_ps(x,signmask);
91 }
92
93 static __m128
94 gmx_mm_log_ps(__m128 x)
95 {
96     /* Same algorithm as the Cephes library */
97     const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
98     const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
99     const __m128  half       = _mm_set1_ps(0.5f);
100     const __m128  one        = _mm_set1_ps(1.0f);
101     const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
102     const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
103     const __m128  corr2      = _mm_set1_ps(0.693359375f);
104
105     const __m128 CA_1        = _mm_set1_ps(0.070376836292f);
106     const __m128 CB_0        = _mm_set1_ps(1.6714950086782716f);
107     const __m128 CB_1        = _mm_set1_ps(-2.452088066061482f);
108     const __m128 CC_0        = _mm_set1_ps(1.5220770854701728f);
109     const __m128 CC_1        = _mm_set1_ps(-1.3422238433233642f);
110     const __m128 CD_0        = _mm_set1_ps(1.386218787509749f);
111     const __m128 CD_1        = _mm_set1_ps(0.35075468953796346f);
112     const __m128 CE_0        = _mm_set1_ps(1.3429983063133937f);
113     const __m128 CE_1        = _mm_set1_ps(1.807420826584643f);
114
115     __m128  fexp,fexp1;
116     __m128i iexp;
117     __m128  mask;
118     __m128  x1,x2;
119     __m128  y;
120     __m128  pA,pB,pC,pD,pE,tB,tC,tD,tE;
121
122     /* Separate x into exponent and mantissa, with the mantissa in the range [0.5,1) (not IEEE754 standard!) */
123     fexp  = _mm_and_ps(x,expmask);
124     iexp  = gmx_mm_castps_si128(fexp);
125     iexp  = _mm_srli_epi32(iexp,23);
126     iexp  = _mm_sub_epi32(iexp,expbase_m1);
127
128     x     = _mm_andnot_ps(expmask,x);
129     x     = _mm_or_ps(x,one);
130     x     = _mm_mul_ps(x,half);
131
132     mask  = _mm_cmp_ps(x,invsq2,_CMP_LT_OQ);
133
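    /* If the mantissa is below 1/sqrt(2), double it and decrement the exponent (adding the all-ones mask acts as -1) */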
134     x     = _mm_add_ps(x,_mm_and_ps(mask,x));
135     x     = _mm_sub_ps(x,one);
136     iexp  = _mm_add_epi32(iexp,gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
137
138     x2    = _mm_mul_ps(x,x);
139
140     pA    = _mm_mul_ps(CA_1,x);
141
142     pB    = _mm_add_ps(x,CB_1);
143     pC    = _mm_add_ps(x,CC_1);
144     pD    = _mm_add_ps(x,CD_1);
145     pE    = _mm_add_ps(x,CE_1);
146
147     pB    = _mm_macc_ps(x,pB,CB_0);
148     pC    = _mm_macc_ps(x,pC,CC_0);
149     pD    = _mm_macc_ps(x,pD,CD_0);
150     pE    = _mm_macc_ps(x,pE,CE_0);
151
152     pA    = _mm_mul_ps(pA,pB);
153     pC    = _mm_mul_ps(pC,pD);
154     pE    = _mm_mul_ps(pE,x2);
155     pA    = _mm_mul_ps(pA,pC);
156     y     = _mm_mul_ps(pA,pE);
157
158     fexp  = _mm_cvtepi32_ps(iexp);
159     y     = _mm_macc_ps(fexp,corr1,y);
160     y     = _mm_nmacc_ps(half,x2,y);
161
162     x2    = _mm_add_ps(x,y);
163     x2    = _mm_macc_ps(fexp,corr2,x2);
164
165     return x2;
166 }
167
168
169 /*
170  * 2^x function.
171  *
172  * The 2^w term is calculated from a (6,0)-th order (no denominator) minimax polynomial on the interval
173  * [-0.5,0.5]. The coefficients were derived in Mathematica using the command:
174  *
175  * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
176  *
177  * The smallest and largest exponents we can represent in IEEE single-precision binary format
178  * are -126 for small numbers and +127 for large ones. To avoid wrap-around problems, we set the
179  * result to zero if the argument falls outside this range. For small numbers this is just fine, but
180  * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
181  * number instead. That would take a few extra cycles and not really help, since something is
182  * wrong if you are using single precision to work with numbers that cannot really be represented
183  * in single precision.
184  *
185  * The accuracy is at least 23 bits.
186  */
187 static __m128
188 gmx_mm_exp2_ps(__m128 x)
189 {
190     /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
191     const __m128  arglimit = _mm_set1_ps(126.0f);
192
193     const __m128i expbase  = _mm_set1_epi32(127);
194     const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
195     const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
196     const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
197     const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
198     const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
199     const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
200     const __m128  CA0      = _mm_set1_ps(1.0f);
201
202     __m128  valuemask;
203     __m128i iexppart;
204     __m128  fexppart;
205     __m128  intpart;
206     __m128  x2;
207     __m128  p0,p1;
208
209     iexppart  = _mm_cvtps_epi32(x);
210     intpart   = _mm_round_ps(x,_MM_FROUND_TO_NEAREST_INT);
211     iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
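    /* Build 2^intpart directly in the IEEE exponent field: (intpart+127) << 23, reinterpreted as float below */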
212     valuemask = _mm_cmp_ps(arglimit,gmx_mm_abs_ps(x),_CMP_GE_OQ);
213     fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));
214
215     x         = _mm_sub_ps(x,intpart);
216     x2        = _mm_mul_ps(x,x);
217
218     p0        = _mm_macc_ps(CA6,x2,CA4);
219     p1        = _mm_macc_ps(CA5,x2,CA3);
220     p0        = _mm_macc_ps(p0,x2,CA2);
221     p1        = _mm_macc_ps(p1,x2,CA1);
222     p0        = _mm_macc_ps(p0,x2,CA0);
223     p0        = _mm_macc_ps(p1,x,p0);
224     x         = _mm_mul_ps(p0,fexppart);
225
226     return  x;
227 }
228
229
230 /* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
231  * but there will then be a small rounding error since we lose some precision due to the
232  * multiplication. This will then be magnified a lot by the exponential.
233  *
234  * Instead, we calculate the fractional part directly as a minimax approximation of
235  * Exp(z) on [-0.5,0.5]. We use extended precision arithmetic to calculate the fraction
236  * remaining after 2^y, which avoids the precision loss.
237  * The final result is correct to within 1 LSB over the entire argument range.
238  */
239 static __m128
240 gmx_mm_exp_ps(__m128 x)
241 {
242     const __m128  argscale      = _mm_set1_ps(1.44269504088896341f);
243     /* Argument limit: disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
244     const __m128  arglimit      = _mm_set1_ps(126.0f);
245     const __m128i expbase       = _mm_set1_epi32(127);
246
247     const __m128  invargscale0  = _mm_set1_ps(0.693359375f);
248     const __m128  invargscale1  = _mm_set1_ps(-2.12194440e-4f);
249
250     const __m128  CC5           = _mm_set1_ps(1.9875691500e-4f);
251     const __m128  CC4           = _mm_set1_ps(1.3981999507e-3f);
252     const __m128  CC3           = _mm_set1_ps(8.3334519073e-3f);
253     const __m128  CC2           = _mm_set1_ps(4.1665795894e-2f);
254     const __m128  CC1           = _mm_set1_ps(1.6666665459e-1f);
255     const __m128  CC0           = _mm_set1_ps(5.0000001201e-1f);
256     const __m128  one           = _mm_set1_ps(1.0f);
257
258     __m128  y,x2;
259     __m128  p0,p1;
260     __m128  valuemask;
261     __m128i iexppart;
262     __m128  fexppart;
263     __m128  intpart;
264
265     y = _mm_mul_ps(x,argscale);
266
267     iexppart  = _mm_cvtps_epi32(y);
268     intpart   = _mm_round_ps(y,_MM_FROUND_TO_NEAREST_INT);
269
270     iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
271     valuemask = _mm_cmp_ps(arglimit,gmx_mm_abs_ps(y),_CMP_GE_OQ);
272     fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));
273
274     /* Extended precision arithmetic */
275     x         = _mm_nmacc_ps(invargscale0,intpart,x);
276     x         = _mm_nmacc_ps(invargscale1,intpart,x);
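    /* x is now the remainder x - intpart*ln(2), with ln(2) split into two constants to retain precision */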
277
278     x2        = _mm_mul_ps(x,x);
279
280     p1        = _mm_macc_ps(CC5,x2,CC3);
281     p0        = _mm_macc_ps(CC4,x2,CC2);
282     p1        = _mm_macc_ps(p1,x2,CC1);
283     p0        = _mm_macc_ps(p0,x2,CC0);
284     p0        = _mm_macc_ps(p1,x,p0);
285     p0        = _mm_macc_ps(p0,x2,one);
286
287     x         = _mm_add_ps(x,p0);
288
289     x         = _mm_mul_ps(x,fexppart);
290
291     return x;
292 }
293
294 /* FULL precision. Only errors in LSB */
295 static __m128
296 gmx_mm_erf_ps(__m128 x)
297 {
298     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
299     const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
300     const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
301     const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
302     const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
303     const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
304     const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
305     const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
306     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
307     const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
308     const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
309     const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
310     const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
311     const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
312     const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
313     const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
314     const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
315     const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
316     const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
317     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
318     const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
319     const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
320     const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
321     const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
322     const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
323     const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
324     const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
325     const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
326     const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
327     const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
328     const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
329
330     /* Coefficients for expansion of exp(x) in [0,0.1] */
331     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
332     const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
333     const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
334     const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
335
336     const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
337     const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
338     const __m128  one      = _mm_set1_ps(1.0f);
339     const __m128  two      = _mm_set1_ps(2.0f);
340
341     __m128 x2,x4,y;
342     __m128 z,q,t,t2,w,w2;
343     __m128 pA0,pA1,pB0,pB1,pC0,pC1;
344     __m128 expmx2,corr;
345     __m128 res_erf,res_erfc,res;
346     __m128 mask;
347
348     /* Calculate erf() */
349     x2     = _mm_mul_ps(x,x);
350     x4     = _mm_mul_ps(x2,x2);
351
352     pA0  = _mm_macc_ps(CA6,x4,CA4);
353     pA1  = _mm_macc_ps(CA5,x4,CA3);
354     pA0  = _mm_macc_ps(pA0,x4,CA2);
355     pA1  = _mm_macc_ps(pA1,x4,CA1);
356     pA0  = _mm_mul_ps(pA0,x4);
357     pA0  = _mm_macc_ps(pA1,x2,pA0);
358     /* Constant term must come last for precision reasons */
359     pA0  = _mm_add_ps(pA0,CA0);
360
361     res_erf = _mm_mul_ps(x,pA0);
362
363     /* Calculate erfc */
364
365     y       = gmx_mm_abs_ps(x);
366     t       = gmx_mm_inv_ps(y);
367     w       = _mm_sub_ps(t,one);
368     t2      = _mm_mul_ps(t,t);
369     w2      = _mm_mul_ps(w,w);
370     /*
371      * We cannot simply calculate exp(-x2) directly in single precision, since
372      * that will lose a couple of bits of precision due to the multiplication.
373      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
374      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
375      *
376      * The only drawback with this is that it requires TWO separate exponential
377      * evaluations, which would be horrible performance-wise. However, the argument
378      * for the second exp() call is always small, so there we simply use a
379      * low-order minimax expansion on [0,0.1].
380      */
381
382     z       = _mm_and_ps(y,sieve);
383     q       = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );
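    /* q = (z-y)*(z+y) = z^2 - y^2, so exp(-y^2) = exp(-z^2)*exp(q); q is small enough for the short expansion below */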
384
385     corr    = _mm_macc_ps(CD4,q,CD3);
386     corr    = _mm_macc_ps(corr,q,CD2);
387     corr    = _mm_macc_ps(corr,q,one);
388     corr    = _mm_macc_ps(corr,q,one);
389
390     expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
391     expmx2  = _mm_mul_ps(expmx2,corr);
392
393     pB1  = _mm_macc_ps(CB9,w2,CB7);
394     pB0  = _mm_macc_ps(CB8,w2,CB6);
395     pB1  = _mm_macc_ps(pB1,w2,CB5);
396     pB0  = _mm_macc_ps(pB0,w2,CB4);
397     pB1  = _mm_macc_ps(pB1,w2,CB3);
398     pB0  = _mm_macc_ps(pB0,w2,CB2);
399     pB1  = _mm_macc_ps(pB1,w2,CB1);
400     pB0  = _mm_macc_ps(pB0,w2,CB0);
401     pB0  = _mm_macc_ps(pB1,w,pB0);
402
403     pC0  = _mm_macc_ps(CC10,t2,CC8);
404     pC1  = _mm_macc_ps(CC9,t2,CC7);
405     pC0  = _mm_macc_ps(pC0,t2,CC6);
406     pC1  = _mm_macc_ps(pC1,t2,CC5);
407     pC0  = _mm_macc_ps(pC0,t2,CC4);
408     pC1  = _mm_macc_ps(pC1,t2,CC3);
409     pC0  = _mm_macc_ps(pC0,t2,CC2);
410     pC1  = _mm_macc_ps(pC1,t2,CC1);
411
412     pC0  = _mm_macc_ps(pC0,t2,CC0);
413     pC0  = _mm_macc_ps(pC1,t,pC0);
414     pC0  = _mm_mul_ps(pC0,t);
415
416     /* SELECT pB0 or pC0 for erfc() */
417     mask = _mm_cmp_ps(two,y,_CMP_LT_OQ);
418     res_erfc = _mm_blendv_ps(pB0,pC0,mask);
419     res_erfc = _mm_mul_ps(res_erfc,expmx2);
420
421     /* erfc(x<0) = 2-erfc(|x|) */
422     mask = _mm_cmp_ps(x,_mm_setzero_ps(),_CMP_LT_OQ);
423     res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);
424
425     /* Select erf() or erfc() */
426     mask = _mm_cmp_ps(y,_mm_set1_ps(0.75f),_CMP_LT_OQ);
427     res  = _mm_blendv_ps(_mm_sub_ps(one,res_erfc),res_erf,mask);
428
429     return res;
430 }
431
432
433 /* FULL precision. Only errors in LSB */
434 static __m128
435 gmx_mm_erfc_ps(__m128 x)
436 {
437     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
438     const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
439     const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
440     const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
441     const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
442     const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
443     const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
444     const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
445     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
446     const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
447     const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
448     const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
449     const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
450     const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
451     const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
452     const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
453     const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
454     const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
455     const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
456     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
457     const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
458     const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
459     const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
460     const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
461     const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
462     const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
463     const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
464     const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
465     const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
466     const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
467     const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
468
469     /* Coefficients for expansion of exp(x) in [0,0.1] */
470     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
471     const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
472     const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
473     const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
474
475     const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
476     const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
477     const __m128  one      = _mm_set1_ps(1.0f);
478     const __m128  two      = _mm_set1_ps(2.0f);
479
480     __m128 x2,x4,y;
481     __m128 z,q,t,t2,w,w2;
482     __m128 pA0,pA1,pB0,pB1,pC0,pC1;
483     __m128 expmx2,corr;
484     __m128 res_erf,res_erfc,res;
485     __m128 mask;
486
487     /* Calculate erf() */
488     x2     = _mm_mul_ps(x,x);
489     x4     = _mm_mul_ps(x2,x2);
490
491     pA0  = _mm_macc_ps(CA6,x4,CA4);
492     pA1  = _mm_macc_ps(CA5,x4,CA3);
493     pA0  = _mm_macc_ps(pA0,x4,CA2);
494     pA1  = _mm_macc_ps(pA1,x4,CA1);
495     pA1  = _mm_mul_ps(pA1,x2);
496     pA0  = _mm_macc_ps(pA0,x4,pA1);
497     /* Constant term must come last for precision reasons */
498     pA0  = _mm_add_ps(pA0,CA0);
499
500     res_erf = _mm_mul_ps(x,pA0);
501
502     /* Calculate erfc */
503     y       = gmx_mm_abs_ps(x);
504     t       = gmx_mm_inv_ps(y);
505     w       = _mm_sub_ps(t,one);
506     t2      = _mm_mul_ps(t,t);
507     w2      = _mm_mul_ps(w,w);
508     /*
509      * We cannot simply calculate exp(-x2) directly in single precision, since
510      * that will lose a couple of bits of precision due to the multiplication.
511      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
512      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
513      *
514      * The only drawback with this is that it requires TWO separate exponential
515      * evaluations, which would be horrible performance-wise. However, the argument
516      * for the second exp() call is always small, so there we simply use a
517      * low-order minimax expansion on [0,0.1].
518      */
519
520     z       = _mm_and_ps(y,sieve);
521     q       = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );
522
523     corr    = _mm_macc_ps(CD4,q,CD3);
524     corr    = _mm_macc_ps(corr,q,CD2);
525     corr    = _mm_macc_ps(corr,q,one);
526     corr    = _mm_macc_ps(corr,q,one);
527
528     expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
529     expmx2  = _mm_mul_ps(expmx2,corr);
530
531     pB1  = _mm_macc_ps(CB9,w2,CB7);
532     pB0  = _mm_macc_ps(CB8,w2,CB6);
533     pB1  = _mm_macc_ps(pB1,w2,CB5);
534     pB0  = _mm_macc_ps(pB0,w2,CB4);
535     pB1  = _mm_macc_ps(pB1,w2,CB3);
536     pB0  = _mm_macc_ps(pB0,w2,CB2);
537     pB1  = _mm_macc_ps(pB1,w2,CB1);
538     pB0  = _mm_macc_ps(pB0,w2,CB0);
539     pB0  = _mm_macc_ps(pB1,w,pB0);
540
541     pC0  = _mm_macc_ps(CC10,t2,CC8);
542     pC1  = _mm_macc_ps(CC9,t2,CC7);
543     pC0  = _mm_macc_ps(pC0,t2,CC6);
544     pC1  = _mm_macc_ps(pC1,t2,CC5);
545     pC0  = _mm_macc_ps(pC0,t2,CC4);
546     pC1  = _mm_macc_ps(pC1,t2,CC3);
547     pC0  = _mm_macc_ps(pC0,t2,CC2);
548     pC1  = _mm_macc_ps(pC1,t2,CC1);
549
550     pC0  = _mm_macc_ps(pC0,t2,CC0);
551     pC0  = _mm_macc_ps(pC1,t,pC0);
552     pC0  = _mm_mul_ps(pC0,t);
553
554     /* SELECT pB0 or pC0 for erfc() */
555     mask = _mm_cmp_ps(two,y,_CMP_LT_OQ);
556     res_erfc = _mm_blendv_ps(pB0,pC0,mask);
557     res_erfc = _mm_mul_ps(res_erfc,expmx2);
558
559     /* erfc(x<0) = 2-erfc(|x|) */
560     mask = _mm_cmp_ps(x,_mm_setzero_ps(),_CMP_LT_OQ);
561     res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);
562
563     /* Select erf() or erfc() */
564     mask = _mm_cmp_ps(y,_mm_set1_ps(0.75f),_CMP_LT_OQ);
565     res  = _mm_blendv_ps(res_erfc,_mm_sub_ps(one,res_erf),mask);
566
567     return res;
568 }
569
570
571 /* Calculate the force correction due to PME analytically.
572  *
573  * This routine is meant to enable analytical evaluation of the
574  * direct-space PME electrostatic force to avoid tables. 
575  *
576  * The direct-space potential should be Erfc(beta*r)/r, but there
577  * are some problems evaluating that: 
578  *
579  * First, the error function is difficult (read: expensive) to 
580  * approximate accurately for intermediate to large arguments, and
581  * this happens already in ranges of beta*r that occur in simulations.
582  * Second, in Gromacs we now try to avoid calculating potentials and
583  * instead use forces directly.
584  *
585  * We can simplify things slightly by noting that the PME part is really
586  * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
587  *
588  * V= 1/r - Erf(beta*r)/r
589  *
590  * The first term we already have from the inverse square root, so
591  * we can leave it out of this routine.
592  *
593  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
594  * the argument beta*r will be in the range 0.15 to ~4. Use your 
595  * favorite plotting program to realize how well-behaved Erf(z)/z is
596  * in this range! 
597  *
598  * We approximate f(z)=erf(z)/z with a rational minimax polynomial. 
599  * However, it turns out it is more efficient to approximate f(z)/z and
600  * then only use even powers. This is another minor optimization, since
601  * we actually WANT f(z)/z, because it is going to be multiplied by
602  * the vector between the two atoms to get the vectorial force. The 
603  * fastest flops are the ones we can avoid calculating!
604  * 
605  * So, here's how it should be used:
606  *
607  * 1. Calculate r^2.
608  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
609  * 3. Evaluate this routine with z^2 as the argument.
610  * 4. The return value is the expression:
611  *
612  *  
613  *       2*exp(-z^2)     erf(z)
614  *       ------------ - --------
615  *       sqrt(Pi)*z^2      z^3
616  * 
617  * 5. Multiply the entire expression by beta^3. This will get you
618  *
619  *       beta^3*2*exp(-z^2)     beta^3*erf(z)
620  *       ------------------  - ---------------
621  *          sqrt(Pi)*z^2            z^3
622  * 
623  *    or, switching back to r (z=r*beta):
624  *
625  *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
626  *       ----------------------- - -----------
627  *            sqrt(Pi)*r^2            r^3
628  *
629  *
630  *    With a bit of math exercise you should be able to confirm that
631  *    this is exactly D[Erf[beta*r]/r,r] divided by r one more time.
632  *
633  * 6. Add the result to 1/r^3, multiply by the product of the charges,
634  *    and you have your force (divided by r). A final multiplication
635  *    with the vector connecting the two particles and you have your
636  *    vectorial force to add to the particles.
637  *
638  */
639 static __m128
640 gmx_mm_pmecorrF_ps(__m128 z2)
641 {
642     const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
643     const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
644     const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
645     const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
646     const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
647     const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
648     const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);
649     
650     const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
651     const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
652     const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
653     const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
654     const __m128  FD0      = _mm_set1_ps(1.0f);
655     
656     __m128 z4;
657     __m128 polyFN0,polyFN1,polyFD0,polyFD1;
658     
659     z4             = _mm_mul_ps(z2,z2);
660     
661     polyFD0        = _mm_macc_ps(FD4,z4,FD2);
662     polyFD1        = _mm_macc_ps(FD3,z4,FD1);
663     polyFD0        = _mm_macc_ps(polyFD0,z4,FD0);
664     polyFD0        = _mm_macc_ps(polyFD1,z2,polyFD0);
665     
666     polyFD0        = gmx_mm_inv_ps(polyFD0);
667     
668     polyFN0        = _mm_macc_ps(FN6,z4,FN4);
669     polyFN1        = _mm_macc_ps(FN5,z4,FN3);
670     polyFN0        = _mm_macc_ps(polyFN0,z4,FN2);
671     polyFN1        = _mm_macc_ps(polyFN1,z4,FN1);
672     polyFN0        = _mm_macc_ps(polyFN0,z4,FN0);
673     polyFN0        = _mm_macc_ps(polyFN1,z2,polyFN0);
674     
675     return   _mm_mul_ps(polyFN0,polyFD0);
676 }
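/*
 * Illustrative usage sketch (NOT part of the original header) of the recipe in
 * the comment above: given r^2, the Ewald parameter beta, and the charge
 * product qq (all hypothetical names), compute the scalar force f such that
 * the force vector on a particle is f times the vector between the pair.
 */
static gmx_inline __m128
gmx_mm_pme_force_scalar_sketch(__m128 r2, __m128 beta, __m128 qq)
{
    __m128 beta2 = _mm_mul_ps(beta,beta);
    __m128 beta3 = _mm_mul_ps(beta2,beta);
    __m128 z2    = _mm_mul_ps(beta2,r2);             /* steps 1-2: z^2 = beta^2*r^2    */
    __m128 corr  = gmx_mm_pmecorrF_ps(z2);           /* steps 3-4                      */
    __m128 rinv  = gmx_mm_invsqrt_ps(r2);
    __m128 rinv3 = _mm_mul_ps(_mm_mul_ps(rinv,rinv),rinv);
    __m128 fscal = _mm_macc_ps(corr,beta3,rinv3);    /* steps 5-6: beta^3*corr + 1/r^3 */
    return _mm_mul_ps(qq,fscal);                     /* multiply by q_i*q_j            */
}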
677
678
679
680
681 /* Calculate the potential correction due to PME analytically.
682  *
683  * See gmx_mm_pmecorrF_ps() for details about the approximation.
684  *
685  * This routine calculates Erf(z)/z, although you should provide z^2
686  * as the input argument.
687  *
688  * Here's how it should be used:
689  *
690  * 1. Calculate r^2.
691  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
692  * 3. Evaluate this routine with z^2 as the argument.
693  * 4. The return value is the expression:
694  *
695  *  
696  *        erf(z)
697  *       --------
698  *          z
699  * 
700  * 5. Multiply the entire expression by beta; switching back to r (z=r*beta), this gives:
701  *
702  *       erf(r*beta)
703  *       -----------
704  *           r 
705  *
706  * 6. Add the result to 1/r, multiply by the product of the charges,
707  *    and you have your potential.
708  */
709 static __m128
710 gmx_mm_pmecorrV_ps(__m128 z2)
711 {
712     const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
713     const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
714     const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
715     const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
716     const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
717     const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
718     const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);
719     
720     const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
721     const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
722     const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
723     const __m128  VD0      = _mm_set1_ps(1.0f);
724     
725     __m128 z4;
726     __m128 polyVN0,polyVN1,polyVD0,polyVD1;
727     
728     z4             = _mm_mul_ps(z2,z2);
729     
730     polyVD1        = _mm_macc_ps(VD3,z4,VD1);
731     polyVD0        = _mm_macc_ps(VD2,z4,VD0);
732     polyVD0        = _mm_macc_ps(polyVD1,z2,polyVD0);
733     
734     polyVD0        = gmx_mm_inv_ps(polyVD0);
735     
736     polyVN0        = _mm_macc_ps(VN6,z4,VN4);
737     polyVN1        = _mm_macc_ps(VN5,z4,VN3);    
738     polyVN0        = _mm_macc_ps(polyVN0,z4,VN2);
739     polyVN1        = _mm_macc_ps(polyVN1,z4,VN1);
740     polyVN0        = _mm_macc_ps(polyVN0,z4,VN0);
741     polyVN0        = _mm_macc_ps(polyVN1,z2,polyVN0);
742     
743     return   _mm_mul_ps(polyVN0,polyVD0);
744 }
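/*
 * Illustrative usage sketch (NOT part of the original header) of steps 1-5
 * above: returns erf(beta*r)/r for the four interactions described by r^2
 * and beta (hypothetical names). Combine it with 1/r and the charge product
 * as described in step 6.
 */
static gmx_inline __m128
gmx_mm_pme_potential_sketch(__m128 r2, __m128 beta)
{
    __m128 z2 = _mm_mul_ps(_mm_mul_ps(beta,beta),r2);  /* z^2 = beta^2*r^2              */
    return _mm_mul_ps(beta,gmx_mm_pmecorrV_ps(z2));    /* beta*erf(z)/z = erf(beta*r)/r */
}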
745
746
747
748 static int
749 gmx_mm_sincos_ps(__m128 x,
750                  __m128 *sinval,
751                  __m128 *cosval)
752 {
753     const __m128 two_over_pi = _mm_set1_ps(2.0/M_PI);
754     const __m128 half        = _mm_set1_ps(0.5);
755     const __m128 one         = _mm_set1_ps(1.0);
756
757     const __m128i izero      = _mm_set1_epi32(0);
758     const __m128i ione       = _mm_set1_epi32(1);
759     const __m128i itwo       = _mm_set1_epi32(2);
760     const __m128i ithree     = _mm_set1_epi32(3);
761     const __m128  signbit    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
762
763     const __m128 CA1         = _mm_set1_ps(1.5703125f);
764     const __m128 CA2         = _mm_set1_ps(4.837512969970703125e-4f);
765     const __m128 CA3         = _mm_set1_ps(7.54978995489188216e-8f);
766
767     const __m128 CC0         = _mm_set1_ps(-0.0013602249f);
768     const __m128 CC1         = _mm_set1_ps(0.0416566950f);
769     const __m128 CC2         = _mm_set1_ps(-0.4999990225f);
770     const __m128 CS0         = _mm_set1_ps(-0.0001950727f);
771     const __m128 CS1         = _mm_set1_ps(0.0083320758f);
772     const __m128 CS2         = _mm_set1_ps(-0.1666665247f);
773
774     __m128  y,y2;
775     __m128  z;
776     __m128i iz;
777     __m128i offset_sin,offset_cos;
778     __m128  tmp1,tmp2;
779     __m128  mask_sin,mask_cos;
780     __m128  tmp_sin,tmp_cos;
781
782     y          = _mm_mul_ps(x,two_over_pi);
783     y          = _mm_add_ps(y,_mm_or_ps(_mm_and_ps(y,signbit),half));
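    /* Add +-0.5 (sign copied from y) and truncate: rounds x*2/pi to the nearest integer quadrant, ties away from zero */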
784
785     iz         = _mm_cvttps_epi32(y);
786     z          = _mm_round_ps(y,_MM_FROUND_TO_ZERO);
787
788     offset_sin = _mm_and_si128(iz,ithree);
789     offset_cos = _mm_add_epi32(iz,ione);
790
791     /* Extended precision arithmetic to achieve full precision */
792     y               = _mm_nmacc_ps(z,CA1,x);
793     y               = _mm_nmacc_ps(z,CA2,y);
794     y               = _mm_nmacc_ps(z,CA3,y);
795
796     y2              = _mm_mul_ps(y,y);
797
798     tmp1            = _mm_macc_ps(CC0,y2,CC1);
799     tmp2            = _mm_macc_ps(CS0,y2,CS1);
800     tmp1            = _mm_macc_ps(tmp1,y2,CC2);
801     tmp2            = _mm_macc_ps(tmp2,y2,CS2);
802
803     tmp1            = _mm_macc_ps(tmp1,y2,one);
804
805     tmp2            = _mm_macc_ps(tmp2,_mm_mul_ps(y,y2),y);
806
807     mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,ione), izero));
808     mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,ione), izero));
809
810     tmp_sin         = _mm_blendv_ps(tmp1,tmp2,mask_sin);
811     tmp_cos         = _mm_blendv_ps(tmp1,tmp2,mask_cos);
812
813     mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,itwo), izero));
814     mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,itwo), izero));
815
816     tmp1            = _mm_xor_ps(signbit,tmp_sin);
817     tmp2            = _mm_xor_ps(signbit,tmp_cos);
818
819     *sinval         = _mm_blendv_ps(tmp1,tmp_sin,mask_sin);
820     *cosval         = _mm_blendv_ps(tmp2,tmp_cos,mask_cos);
821
822     return 0;
823 }
824
825 /*
826  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
827  * will then call the sincos() routine and waste a factor 2 in performance!
828  */
829 static __m128
830 gmx_mm_sin_ps(__m128 x)
831 {
832     __m128 s,c;
833     gmx_mm_sincos_ps(x,&s,&c);
834     return s;
835 }
836
837 /*
838  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
839  * will then call the sincos() routine and waste a factor 2 in performance!
840  */
841 static __m128
842 gmx_mm_cos_ps(__m128 x)
843 {
844     __m128 s,c;
845     gmx_mm_sincos_ps(x,&s,&c);
846     return c;
847 }
848
849
850 static __m128
851 gmx_mm_tan_ps(__m128 x)
852 {
853     __m128 sinval,cosval;
854     __m128 tanval;
855
856     gmx_mm_sincos_ps(x,&sinval,&cosval);
857
858     tanval = _mm_mul_ps(sinval,gmx_mm_inv_ps(cosval));
859
860     return tanval;
861 }
862
863
864 static __m128
865 gmx_mm_asin_ps(__m128 x)
866 {
867     /* Same algorithm as the Cephes library */
868     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
869     const __m128 limitlow  = _mm_set1_ps(1e-4f);
870     const __m128 half      = _mm_set1_ps(0.5f);
871     const __m128 one       = _mm_set1_ps(1.0f);
872     const __m128 halfpi    = _mm_set1_ps(M_PI/2.0f);
873
874     const __m128 CC5        = _mm_set1_ps(4.2163199048E-2f);
875     const __m128 CC4        = _mm_set1_ps(2.4181311049E-2f);
876     const __m128 CC3        = _mm_set1_ps(4.5470025998E-2f);
877     const __m128 CC2        = _mm_set1_ps(7.4953002686E-2f);
878     const __m128 CC1        = _mm_set1_ps(1.6666752422E-1f);
879
880     __m128 sign;
881     __m128 mask;
882     __m128 xabs;
883     __m128 z,z1,z2,q,q1,q2;
884     __m128 pA,pB;
885
886     sign  = _mm_andnot_ps(signmask,x);
887     xabs  = _mm_and_ps(x,signmask);
888
889     mask  = _mm_cmp_ps(xabs,half,_CMP_GT_OQ);
890
891     z1    = _mm_mul_ps(half, _mm_sub_ps(one,xabs));
892     q1    = _mm_mul_ps(z1,gmx_mm_invsqrt_ps(z1));
893     q1    = _mm_andnot_ps(_mm_cmp_ps(xabs,one,_CMP_EQ_OQ),q1);
894
895     q2    = xabs;
896     z2    = _mm_mul_ps(q2,q2);
897
898     z     = _mm_or_ps( _mm_and_ps(mask,z1) , _mm_andnot_ps(mask,z2) );
899     q     = _mm_or_ps( _mm_and_ps(mask,q1) , _mm_andnot_ps(mask,q2) );
900
901     z2    = _mm_mul_ps(z,z);
902
903     pA    = _mm_macc_ps(CC5,z2,CC3);
904     pB    = _mm_macc_ps(CC4,z2,CC2);
905
906     pA    = _mm_macc_ps(pA,z2,CC1);
907     pA    = _mm_mul_ps(pA,z);
908
909     z     = _mm_macc_ps(pB,z2,pA);
910
911     z     = _mm_macc_ps(z,q,q);
912
913     q2    = _mm_sub_ps(halfpi,z);
914     q2    = _mm_sub_ps(q2,z);
915
916     z     = _mm_or_ps( _mm_and_ps(mask,q2) , _mm_andnot_ps(mask,z) );
917
918     mask  = _mm_cmp_ps(xabs,limitlow,_CMP_GT_OQ);
919     z     = _mm_or_ps( _mm_and_ps(mask,z) , _mm_andnot_ps(mask,xabs) );
920
921     z = _mm_xor_ps(z,sign);
922
923     return z;
924 }
925
926
927 static __m128
928 gmx_mm_acos_ps(__m128 x)
929 {
930     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
931     const __m128 one_ps    = _mm_set1_ps(1.0f);
932     const __m128 half_ps   = _mm_set1_ps(0.5f);
933     const __m128 pi_ps     = _mm_set1_ps(M_PI);
934     const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
935
936     __m128 mask1;
937     __m128 mask2;
938     __m128 xabs;
939     __m128 z,z1,z2,z3;
940
941     xabs  = _mm_and_ps(x,signmask);
942     mask1 = _mm_cmp_ps(xabs,half_ps,_CMP_GT_OQ);
943     mask2 = _mm_cmp_ps(x,_mm_setzero_ps(),_CMP_GT_OQ);
944
945     z     = _mm_mul_ps(half_ps,_mm_sub_ps(one_ps,xabs));
946     z     = _mm_mul_ps(z,gmx_mm_invsqrt_ps(z));
947     z     = _mm_andnot_ps(_mm_cmp_ps(xabs,one_ps,_CMP_EQ_OQ),z);
948
949     z     = _mm_blendv_ps(x,z,mask1);
950     z     = gmx_mm_asin_ps(z);
951
952     z2    = _mm_add_ps(z,z);
953     z1    = _mm_sub_ps(pi_ps,z2);
954     z3    = _mm_sub_ps(halfpi_ps,z);
955
956     z     = _mm_blendv_ps(z1,z2,mask2);
957     z     = _mm_blendv_ps(z3,z,mask1);
958
959     return z;
960 }
961
962
963 static __m128
964 gmx_mm_atan_ps(__m128 x)
965 {
966     /* Same algorithm as the Cephes library */
967     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
968     const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
969     const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
970     const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
971     const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
972     const __m128 mone      = _mm_set1_ps(-1.0f);
973     const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
974     const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
975     const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1f);
976     const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);
977
978     __m128 sign;
979     __m128 mask1,mask2;
980     __m128 y,z1,z2;
981     __m128 x2,x4;
982     __m128 sum1,sum2;
983
984     sign  = _mm_andnot_ps(signmask,x);
985     x     = _mm_and_ps(x,signmask);
986
987     mask1 = _mm_cmp_ps(x,limit1,_CMP_GT_OQ);
988     mask2 = _mm_cmp_ps(x,limit2,_CMP_GT_OQ);
989
990     z1    = _mm_mul_ps(_mm_add_ps(x,mone),gmx_mm_inv_ps(_mm_sub_ps(x,mone)));
991     z2    = _mm_mul_ps(mone,gmx_mm_inv_ps(x));
992
993     y     = _mm_and_ps(mask1,quarterpi);
994     y     = _mm_blendv_ps(y,halfpi,mask2);
995
996     x     = _mm_blendv_ps(x,z1,mask1);
997     x     = _mm_blendv_ps(x,z2,mask2);
998
999     x2    = _mm_mul_ps(x,x);
1000     x4    = _mm_mul_ps(x2,x2);
1001
1002     sum1  = _mm_macc_ps(CC9,x4,CC5);
1003     sum2  = _mm_macc_ps(CC7,x4,CC3);
1004     sum1  = _mm_mul_ps(sum1,x4);
1005     sum1  = _mm_macc_ps(sum2,x2,sum1);
1006
1007     sum1  = _mm_sub_ps(sum1,mone);
1008     y     = _mm_macc_ps(sum1,x,y);
1009
1010     y     = _mm_xor_ps(y,sign);
1011
1012     return y;
1013 }
1014
1015
1016 static __m128
1017 gmx_mm_atan2_ps(__m128 y, __m128 x)
1018 {
1019     const __m128 pi          = _mm_set1_ps(M_PI);
1020     const __m128 minuspi     = _mm_set1_ps(-M_PI);
1021     const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
1022     const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
1023
1024     __m128 z,z1,z3,z4;
1025     __m128 w;
1026     __m128 maskx_lt,maskx_eq;
1027     __m128 masky_lt,masky_eq;
1028     __m128 mask1,mask2,mask3,mask4,maskall;
1029
1030     maskx_lt  = _mm_cmp_ps(x,_mm_setzero_ps(),_CMP_LT_OQ);
1031     masky_lt  = _mm_cmp_ps(y,_mm_setzero_ps(),_CMP_LT_OQ);
1032     maskx_eq  = _mm_cmp_ps(x,_mm_setzero_ps(),_CMP_EQ_OQ);
1033     masky_eq  = _mm_cmp_ps(y,_mm_setzero_ps(),_CMP_EQ_OQ);
1034
1035     z         = _mm_mul_ps(y,gmx_mm_inv_ps(x));
1036     z         = gmx_mm_atan_ps(z);
1037
1038     mask1     = _mm_and_ps(maskx_eq,masky_lt);
1039     mask2     = _mm_andnot_ps(maskx_lt,masky_eq);
1040     mask3     = _mm_andnot_ps( _mm_or_ps(masky_lt,masky_eq) , maskx_eq);
1041     mask4     = _mm_and_ps(masky_eq,maskx_lt);
1042
1043     maskall   = _mm_or_ps( _mm_or_ps(mask1,mask2), _mm_or_ps(mask3,mask4) );
1044
1045     z         = _mm_andnot_ps(maskall,z);
1046     z1        = _mm_and_ps(mask1,minushalfpi);
1047     z3        = _mm_and_ps(mask3,halfpi);
1048     z4        = _mm_and_ps(mask4,pi);
1049
1050     z         = _mm_or_ps( _mm_or_ps(z,z1), _mm_or_ps(z3,z4) );
1051
1052     mask1     = _mm_andnot_ps(masky_lt,maskx_lt);
1053     mask2     = _mm_and_ps(maskx_lt,masky_lt);
1054
1055     w         = _mm_or_ps( _mm_and_ps(mask1,pi), _mm_and_ps(mask2,minuspi) );
1056     w         = _mm_andnot_ps(maskall,w);
1057
1058     z         = _mm_add_ps(z,w);
1059
1060     return z;
1061 }
1062
1063
1064
1065 #endif /* _gmx_math_x86_avx_128_fma_single_h_ */