Merge "Simple patch for openmm_wrapper.cpp" into release-4-6
[alexxy/gromacs.git] / include / gmx_math_x86_sse4_1_single.h
/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *
 * This file is part of GROMACS.
 * Copyright (c) 2012-
 *
 * Written by the Gromacs development team under coordination of
 * David van der Spoel, Berk Hess, and Erik Lindahl.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org
 *
 * And Hey:
 * Gnomes, ROck Monsters And Chili Sauce
 */
#ifndef _gmx_math_x86_sse4_1_single_h_
#define _gmx_math_x86_sse4_1_single_h_

#include <stdio.h>
#include <math.h>

#include "gmx_x86_sse4.h"



#ifndef M_PI
#  define M_PI 3.14159265358979323846264338327950288
#endif




/************************
 *                      *
 * Simple math routines *
 *                      *
 ************************/

/* 1.0/sqrt(x) */
static gmx_inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half  = _mm_set_ps(0.5,0.5,0.5,0.5);
    const __m128 three = _mm_set_ps(3.0,3.0,3.0,3.0);

    __m128 lu = _mm_rsqrt_ps(x);

    return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
}
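/* gmx_mm_invsqrt_ps() refines the roughly 12-bit accurate hardware rsqrt
 * estimate with one Newton-Raphson iteration, lu' = 0.5*lu*(3 - x*lu*lu),
 * which approximately doubles the number of correct bits.
 */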

/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
static gmx_inline __m128
gmx_mm_sqrt_ps(__m128 x)
{
    __m128 mask;
    __m128 res;

    mask = _mm_cmpeq_ps(x,_mm_setzero_ps());
    res  = _mm_andnot_ps(mask,gmx_mm_invsqrt_ps(x));

    res  = _mm_mul_ps(x,res);

    return res;
}
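/* The mask in gmx_mm_sqrt_ps() zeroes elements where x==0: invsqrt(0) would be
 * Inf, and 0*Inf would otherwise give NaN instead of the expected sqrt(0)=0.
 */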

/* 1.0/x */
static gmx_inline __m128
gmx_mm_inv_ps(__m128 x)
{
    const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);

    __m128 lu = _mm_rcp_ps(x);

    return _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,x)));
}
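/* As for invsqrt, gmx_mm_inv_ps() refines the low-precision hardware rcp
 * estimate with one Newton-Raphson iteration, lu' = lu*(2 - x*lu).
 */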

static gmx_inline __m128
gmx_mm_abs_ps(__m128 x)
{
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );

    return _mm_and_ps(x,signmask);
}



static __m128
gmx_mm_log_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
    const __m128  half       = _mm_set1_ps(0.5f);
    const __m128  one        = _mm_set1_ps(1.0f);
    const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
    const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
    const __m128  corr2      = _mm_set1_ps(0.693359375f);

    const __m128 CA_1        = _mm_set1_ps(0.070376836292f);
    const __m128 CB_0        = _mm_set1_ps(1.6714950086782716f);
    const __m128 CB_1        = _mm_set1_ps(-2.452088066061482f);
    const __m128 CC_0        = _mm_set1_ps(1.5220770854701728f);
    const __m128 CC_1        = _mm_set1_ps(-1.3422238433233642f);
    const __m128 CD_0        = _mm_set1_ps(1.386218787509749f);
    const __m128 CD_1        = _mm_set1_ps(0.35075468953796346f);
    const __m128 CE_0        = _mm_set1_ps(1.3429983063133937f);
    const __m128 CE_1        = _mm_set1_ps(1.807420826584643f);

    __m128  fexp,fexp1;
    __m128i iexp;
    __m128  mask;
    __m128  x1,x2;
    __m128  y;
    __m128  pA,pB,pC,pD,pE,tB,tC,tD,tE;

    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
    fexp  = _mm_and_ps(x,expmask);
    iexp  = gmx_mm_castps_si128(fexp);
    iexp  = _mm_srli_epi32(iexp,23);
    iexp  = _mm_sub_epi32(iexp,expbase_m1);

    x     = _mm_andnot_ps(expmask,x);
    x     = _mm_or_ps(x,one);
    x     = _mm_mul_ps(x,half);

    mask  = _mm_cmplt_ps(x,invsq2);

    x     = _mm_add_ps(x,_mm_and_ps(mask,x));
    x     = _mm_sub_ps(x,one);
    iexp  = _mm_add_epi32(iexp,gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */

    x2    = _mm_mul_ps(x,x);

    pA    = _mm_mul_ps(CA_1,x);
    pB    = _mm_mul_ps(CB_1,x);
    pC    = _mm_mul_ps(CC_1,x);
    pD    = _mm_mul_ps(CD_1,x);
    pE    = _mm_mul_ps(CE_1,x);
    tB    = _mm_add_ps(CB_0,x2);
    tC    = _mm_add_ps(CC_0,x2);
    tD    = _mm_add_ps(CD_0,x2);
    tE    = _mm_add_ps(CE_0,x2);
    pB    = _mm_add_ps(pB,tB);
    pC    = _mm_add_ps(pC,tC);
    pD    = _mm_add_ps(pD,tD);
    pE    = _mm_add_ps(pE,tE);

    pA    = _mm_mul_ps(pA,pB);
    pC    = _mm_mul_ps(pC,pD);
    pE    = _mm_mul_ps(pE,x2);
    pA    = _mm_mul_ps(pA,pC);
    y     = _mm_mul_ps(pA,pE);

    fexp  = _mm_cvtepi32_ps(iexp);
    y     = _mm_add_ps(y,_mm_mul_ps(fexp,corr1));

    y     = _mm_sub_ps(y, _mm_mul_ps(half,x2));
    x2    = _mm_add_ps(x,y);

    x2    = _mm_add_ps(x2,_mm_mul_ps(fexp,corr2));

    return x2;
}
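/* Note on gmx_mm_log_ps(): the exponent contribution e*ln(2) is added in two
 * pieces, corr2 + corr1 = 0.693359375 - 2.12194440e-4 = ln(2), so that the
 * large piece is exact in single precision and the small remainder restores
 * the bits lost to rounding.
 */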


/*
 * 2^x function.
 *
 * The 2^w term is calculated from a (6,0)-order (no denominator) minimax polynomial on the
 * interval [-0.5,0.5]. The coefficients were derived in Mathematica using the command:
 *
 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 *
 * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 * number instead. That would take a few extra cycles and not really help, since something is
 * wrong if you are using single precision to work with numbers that cannot really be represented
 * in single precision.
 *
 * The accuracy is at least 23 bits.
 */
static __m128
gmx_mm_exp2_ps(__m128 x)
{
    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
    const __m128  arglimit = _mm_set1_ps(126.0f);

    const __m128i expbase  = _mm_set1_epi32(127);
    const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
    const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
    const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
    const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
    const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
    const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
    const __m128  CA0      = _mm_set1_ps(1.0f);

    __m128  valuemask;
    __m128i iexppart;
    __m128  fexppart;
    __m128  intpart;
    __m128  x2;
    __m128  p0,p1;

    iexppart  = _mm_cvtps_epi32(x);
    intpart   = _mm_round_ps(x,_MM_FROUND_TO_NEAREST_INT);
    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
    valuemask = _mm_cmpge_ps(arglimit,gmx_mm_abs_ps(x));
    fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));

    x         = _mm_sub_ps(x,intpart);
    x2        = _mm_mul_ps(x,x);

    p0        = _mm_mul_ps(CA6,x2);
    p1        = _mm_mul_ps(CA5,x2);
    p0        = _mm_add_ps(p0,CA4);
    p1        = _mm_add_ps(p1,CA3);
    p0        = _mm_mul_ps(p0,x2);
    p1        = _mm_mul_ps(p1,x2);
    p0        = _mm_add_ps(p0,CA2);
    p1        = _mm_add_ps(p1,CA1);
    p0        = _mm_mul_ps(p0,x2);
    p1        = _mm_mul_ps(p1,x);
    p0        = _mm_add_ps(p0,CA0);
    p0        = _mm_add_ps(p0,p1);
    x         = _mm_mul_ps(p0,fexppart);

    return  x;
}
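/* Note on gmx_mm_exp2_ps(): the argument is split as x = n + w, with n the
 * nearest integer and w in [-0.5,0.5]. 2^n is assembled directly in the IEEE
 * exponent field ((n+127)<<23), the polynomial above evaluates 2^w, and the
 * two factors are multiplied. Arguments with |x| > 126 yield a zero result.
 */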


/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
 * but there will then be a small rounding error since we lose some precision due to the
 * multiplication. This will then be magnified a lot by the exponential.
 *
 * Instead, we calculate the fractional part directly as a minimax approximation of
 * Exp(z) on [-0.5,0.5]. We use extended precision arithmetic to calculate the fraction
 * remaining after 2^y, which avoids the precision loss.
 * The final result is correct to within 1 LSB over the entire argument range.
 */
static __m128
gmx_mm_exp_ps(__m128 x)
{
    const __m128  argscale      = _mm_set1_ps(1.44269504088896341f);
    /* Bound: disallow arguments that would lead to an IEEE fp exponent reaching +-127. */
    const __m128  arglimit      = _mm_set1_ps(126.0f);
    const __m128i expbase       = _mm_set1_epi32(127);

    const __m128  invargscale0  = _mm_set1_ps(0.693359375f);
    const __m128  invargscale1  = _mm_set1_ps(-2.12194440e-4f);

    const __m128  CC5           = _mm_set1_ps(1.9875691500e-4f);
    const __m128  CC4           = _mm_set1_ps(1.3981999507e-3f);
    const __m128  CC3           = _mm_set1_ps(8.3334519073e-3f);
    const __m128  CC2           = _mm_set1_ps(4.1665795894e-2f);
    const __m128  CC1           = _mm_set1_ps(1.6666665459e-1f);
    const __m128  CC0           = _mm_set1_ps(5.0000001201e-1f);
    const __m128  one           = _mm_set1_ps(1.0f);

    __m128  y,x2;
    __m128  p0,p1;
    __m128  valuemask;
    __m128i iexppart;
    __m128  fexppart;
    __m128  intpart;

    y = _mm_mul_ps(x,argscale);

    iexppart  = _mm_cvtps_epi32(y);
    intpart   = _mm_round_ps(y,_MM_FROUND_TO_NEAREST_INT);

    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
    valuemask = _mm_cmpge_ps(arglimit,gmx_mm_abs_ps(y));
    fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));

    /* Extended precision arithmetic */
    x         = _mm_sub_ps(x,_mm_mul_ps(invargscale0,intpart));
    x         = _mm_sub_ps(x,_mm_mul_ps(invargscale1,intpart));

    x2        = _mm_mul_ps(x,x);

    p1        = _mm_mul_ps(CC5,x2);
    p0        = _mm_mul_ps(CC4,x2);
    p1        = _mm_add_ps(p1,CC3);
    p0        = _mm_add_ps(p0,CC2);
    p1        = _mm_mul_ps(p1,x2);
    p0        = _mm_mul_ps(p0,x2);
    p1        = _mm_add_ps(p1,CC1);
    p0        = _mm_add_ps(p0,CC0);
    p1        = _mm_mul_ps(p1,x);
    p0        = _mm_add_ps(p0,p1);
    p0        = _mm_mul_ps(p0,x2);
    x         = _mm_add_ps(x,one);
    x         = _mm_add_ps(x,p0);

    x         = _mm_mul_ps(x,fexppart);

    return x;
}
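/* Minimal usage sketch (illustrative only, not part of the library API; the
 * function and array names below are made up): evaluate exp() for four packed
 * single-precision values loaded from memory.
 */
static gmx_inline void
gmx_mm_exp_ps_usage_example(const float in[4], float out[4])
{
    __m128 x = _mm_loadu_ps(in);           /* load four unaligned floats */
    _mm_storeu_ps(out, gmx_mm_exp_ps(x));  /* out[i] = exp(in[i])        */
}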

/* FULL precision. Only errors in LSB */
static __m128
gmx_mm_erf_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);

    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128  one      = _mm_set1_ps(1.0f);
    const __m128  two      = _mm_set1_ps(2.0f);

    __m128 x2,x4,y;
    __m128 z,q,t,t2,w,w2;
    __m128 pA0,pA1,pB0,pB1,pC0,pC1;
    __m128 expmx2,corr;
    __m128 res_erf,res_erfc,res;
    __m128 mask;

    /* Calculate erf() */
    x2     = _mm_mul_ps(x,x);
    x4     = _mm_mul_ps(x2,x2);

    pA0  = _mm_mul_ps(CA6,x4);
    pA1  = _mm_mul_ps(CA5,x4);
    pA0  = _mm_add_ps(pA0,CA4);
    pA1  = _mm_add_ps(pA1,CA3);
    pA0  = _mm_mul_ps(pA0,x4);
    pA1  = _mm_mul_ps(pA1,x4);
    pA0  = _mm_add_ps(pA0,CA2);
    pA1  = _mm_add_ps(pA1,CA1);
    pA0  = _mm_mul_ps(pA0,x4);
    pA1  = _mm_mul_ps(pA1,x2);
    pA0  = _mm_add_ps(pA0,pA1);
    pA0  = _mm_add_ps(pA0,CA0);

    res_erf = _mm_mul_ps(x,pA0);

    /* Calculate erfc */

    y       = gmx_mm_abs_ps(x);
    t       = gmx_mm_inv_ps(y);
    w       = _mm_sub_ps(t,one);
    t2      = _mm_mul_ps(t,t);
    w2      = _mm_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z       = _mm_and_ps(y,sieve);
    q       = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );

    corr    = _mm_mul_ps(CD4,q);
    corr    = _mm_add_ps(corr,CD3);
    corr    = _mm_mul_ps(corr,q);
    corr    = _mm_add_ps(corr,CD2);
    corr    = _mm_mul_ps(corr,q);
    corr    = _mm_add_ps(corr,one);
    corr    = _mm_mul_ps(corr,q);
    corr    = _mm_add_ps(corr,one);

    expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
    expmx2  = _mm_mul_ps(expmx2,corr);

    pB1  = _mm_mul_ps(CB9,w2);
    pB0  = _mm_mul_ps(CB8,w2);
    pB1  = _mm_add_ps(pB1,CB7);
    pB0  = _mm_add_ps(pB0,CB6);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB5);
    pB0  = _mm_add_ps(pB0,CB4);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB3);
    pB0  = _mm_add_ps(pB0,CB2);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB1);
    pB1  = _mm_mul_ps(pB1,w);
    pB0  = _mm_add_ps(pB0,pB1);
    pB0  = _mm_add_ps(pB0,CB0);

    pC0  = _mm_mul_ps(CC10,t2);
    pC1  = _mm_mul_ps(CC9,t2);
    pC0  = _mm_add_ps(pC0,CC8);
    pC1  = _mm_add_ps(pC1,CC7);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC6);
    pC1  = _mm_add_ps(pC1,CC5);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC4);
    pC1  = _mm_add_ps(pC1,CC3);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC2);
    pC1  = _mm_add_ps(pC1,CC1);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t);
    pC0  = _mm_add_ps(pC0,pC1);
    pC0  = _mm_add_ps(pC0,CC0);
    pC0  = _mm_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask = _mm_cmplt_ps(two,y);
    res_erfc = _mm_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask = _mm_cmplt_ps(x,_mm_setzero_ps());
    res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y,_mm_set1_ps(0.75f));
    res  = _mm_blendv_ps(_mm_sub_ps(one,res_erfc),res_erf,mask);

    return res;
}
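/* Note on the final selection in gmx_mm_erf_ps(): for |x| < 0.75 the direct
 * erf() polynomial is returned; otherwise the result is formed as 1-erfc(|x|)
 * from whichever erfc branch matched, which is why the accuracy of the
 * exp(-x^2) evaluation above matters.
 */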


/* FULL precision. Only errors in LSB */
static __m128
gmx_mm_erfc_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);

    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128  one      = _mm_set1_ps(1.0f);
    const __m128  two      = _mm_set1_ps(2.0f);

    __m128 x2,x4,y;
    __m128 z,q,t,t2,w,w2;
    __m128 pA0,pA1,pB0,pB1,pC0,pC1;
    __m128 expmx2,corr;
    __m128 res_erf,res_erfc,res;
    __m128 mask;

    /* Calculate erf() */
    x2     = _mm_mul_ps(x,x);
    x4     = _mm_mul_ps(x2,x2);

    pA0  = _mm_mul_ps(CA6,x4);
    pA1  = _mm_mul_ps(CA5,x4);
    pA0  = _mm_add_ps(pA0,CA4);
    pA1  = _mm_add_ps(pA1,CA3);
    pA0  = _mm_mul_ps(pA0,x4);
    pA1  = _mm_mul_ps(pA1,x4);
    pA0  = _mm_add_ps(pA0,CA2);
    pA1  = _mm_add_ps(pA1,CA1);
    pA0  = _mm_mul_ps(pA0,x4);
    pA1  = _mm_mul_ps(pA1,x2);
    pA0  = _mm_add_ps(pA0,pA1);
    pA0  = _mm_add_ps(pA0,CA0);

    res_erf = _mm_mul_ps(x,pA0);

    /* Calculate erfc */
    y       = gmx_mm_abs_ps(x);
    t       = gmx_mm_inv_ps(y);
    w       = _mm_sub_ps(t,one);
    t2      = _mm_mul_ps(t,t);
    w2      = _mm_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z       = _mm_and_ps(y,sieve);
    q       = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );

    corr    = _mm_mul_ps(CD4,q);
    corr    = _mm_add_ps(corr,CD3);
    corr    = _mm_mul_ps(corr,q);
    corr    = _mm_add_ps(corr,CD2);
    corr    = _mm_mul_ps(corr,q);
    corr    = _mm_add_ps(corr,one);
    corr    = _mm_mul_ps(corr,q);
    corr    = _mm_add_ps(corr,one);

    expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
    expmx2  = _mm_mul_ps(expmx2,corr);

    pB1  = _mm_mul_ps(CB9,w2);
    pB0  = _mm_mul_ps(CB8,w2);
    pB1  = _mm_add_ps(pB1,CB7);
    pB0  = _mm_add_ps(pB0,CB6);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB5);
    pB0  = _mm_add_ps(pB0,CB4);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB3);
    pB0  = _mm_add_ps(pB0,CB2);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB1);
    pB1  = _mm_mul_ps(pB1,w);
    pB0  = _mm_add_ps(pB0,pB1);
    pB0  = _mm_add_ps(pB0,CB0);

    pC0  = _mm_mul_ps(CC10,t2);
    pC1  = _mm_mul_ps(CC9,t2);
    pC0  = _mm_add_ps(pC0,CC8);
    pC1  = _mm_add_ps(pC1,CC7);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC6);
    pC1  = _mm_add_ps(pC1,CC5);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC4);
    pC1  = _mm_add_ps(pC1,CC3);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC2);
    pC1  = _mm_add_ps(pC1,CC1);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t);
    pC0  = _mm_add_ps(pC0,pC1);
    pC0  = _mm_add_ps(pC0,CC0);
    pC0  = _mm_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask = _mm_cmplt_ps(two,y);
    res_erfc = _mm_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask = _mm_cmplt_ps(x,_mm_setzero_ps());
    res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y,_mm_set1_ps(0.75f));
    res  = _mm_blendv_ps(res_erfc,_mm_sub_ps(one,res_erf),mask);

    return res;
}


/* Calculate the force correction due to PME analytically.
 *
 * This routine is meant to enable analytical evaluation of the
 * direct-space PME electrostatic force to avoid tables.
 *
 * The direct-space potential should be Erfc(beta*r)/r, but there
 * are some problems evaluating that:
 *
 * First, the error function is difficult (read: expensive) to
 * approximate accurately for intermediate to large arguments, and
 * this happens already in ranges of beta*r that occur in simulations.
 * Second, we now try to avoid calculating potentials in Gromacs but
 * use forces directly.
 *
 * We can simplify things slightly by noting that the PME part is really
 * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
 *
 * V= 1/r - Erf(beta*r)/r
 *
 * The first term we already have from the inverse square root, so
 * that we can leave out of this routine.
 *
 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
 * the argument beta*r will be in the range 0.15 to ~4. Use your
 * favorite plotting program to realize how well-behaved Erf(z)/z is
 * in this range!
 *
 * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
 * However, it turns out it is more efficient to approximate f(z)/z and
 * then only use even powers. This is another minor optimization, since
 * we actually WANT f(z)/z, because it is going to be multiplied by
 * the vector between the two atoms to get the vectorial force. The
 * fastest flops are the ones we can avoid calculating!
 *
 * So, here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *
 *       2*exp(-z^2)     erf(z)
 *       ------------ - --------
 *       sqrt(Pi)*z^2      z^3
 *
 * 5. Multiply the entire expression by beta^3. This will get you
 *
 *       beta^3*2*exp(-z^2)     beta^3*erf(z)
 *       ------------------  - ---------------
 *          sqrt(Pi)*z^2            z^3
 *
 *    or, switching back to r (z=r*beta):
 *
 *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
 *       ----------------------- - -----------
 *            sqrt(Pi)*r^2            r^3
 *
 *
 *    With a bit of math exercise you should be able to confirm that
 *    this is exactly D[Erf[beta*r]/r,r] divided by r once more.
 *
 * 6. Add the result to 1/r^3, multiply by the product of the charges,
 *    and you have your force (divided by r). A final multiplication
 *    with the vector connecting the two particles and you have your
 *    vectorial force to add to the particles.
 *
 * An illustrative sketch of these steps follows the routine below.
 */
static gmx_inline __m128
gmx_mm_pmecorrF_ps(__m128 z2)
{
    const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
    const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
    const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
    const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
    const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
    const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
    const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);

    const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
    const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
    const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
    const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
    const __m128  FD0      = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyFN0,polyFN1,polyFD0,polyFD1;

    z4             = _mm_mul_ps(z2,z2);

    polyFD0        = _mm_mul_ps(FD4,z4);
    polyFD1        = _mm_mul_ps(FD3,z4);
    polyFD0        = _mm_add_ps(polyFD0,FD2);
    polyFD1        = _mm_add_ps(polyFD1,FD1);
    polyFD0        = _mm_mul_ps(polyFD0,z4);
    polyFD1        = _mm_mul_ps(polyFD1,z2);
    polyFD0        = _mm_add_ps(polyFD0,FD0);
    polyFD0        = _mm_add_ps(polyFD0,polyFD1);

    polyFD0        = gmx_mm_inv_ps(polyFD0);

    polyFN0        = _mm_mul_ps(FN6,z4);
    polyFN1        = _mm_mul_ps(FN5,z4);
    polyFN0        = _mm_add_ps(polyFN0,FN4);
    polyFN1        = _mm_add_ps(polyFN1,FN3);
    polyFN0        = _mm_mul_ps(polyFN0,z4);
    polyFN1        = _mm_mul_ps(polyFN1,z4);
    polyFN0        = _mm_add_ps(polyFN0,FN2);
    polyFN1        = _mm_add_ps(polyFN1,FN1);
    polyFN0        = _mm_mul_ps(polyFN0,z4);
    polyFN1        = _mm_mul_ps(polyFN1,z2);
    polyFN0        = _mm_add_ps(polyFN0,FN0);
    polyFN0        = _mm_add_ps(polyFN0,polyFN1);

    return   _mm_mul_ps(polyFN0,polyFD0);
}
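/* Illustrative sketch of steps 1-6 above (not part of the library API; rsq,
 * beta and qq are hypothetical caller-side quantities: the squared distance,
 * the Ewald/PME beta parameter and the product of the charges).
 */
static gmx_inline __m128
gmx_mm_pmecorrF_ps_usage_example(__m128 rsq, __m128 beta, __m128 qq)
{
    __m128 beta2 = _mm_mul_ps(beta,beta);
    __m128 beta3 = _mm_mul_ps(beta2,beta);
    __m128 z2    = _mm_mul_ps(beta2,rsq);                      /* steps 1-2: z^2 = beta^2*r^2 */
    __m128 rinv  = gmx_mm_invsqrt_ps(rsq);                     /* 1/r                         */
    __m128 rinv3 = _mm_mul_ps(rinv,_mm_mul_ps(rinv,rinv));     /* 1/r^3                       */
    __m128 corr  = _mm_mul_ps(beta3,gmx_mm_pmecorrF_ps(z2));   /* steps 3-5                   */

    /* step 6: scalar force divided by r; multiply by the vector connecting the
     * two particles to get the vectorial force.
     */
    return _mm_mul_ps(qq,_mm_add_ps(rinv3,corr));
}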


/* Calculate the potential correction due to PME analytically.
 *
 * See gmx_mm_pmecorrF_ps() above for details about the approximation.
 *
 * This routine calculates Erf(z)/z, although you should provide z^2
 * as the input argument.
 *
 * Here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *
 *        erf(z)
 *       --------
 *          z
 *
 * 5. Multiply the entire expression by beta; switching back to r (z=r*beta) this is
 *
 *       erf(r*beta)
 *       -----------
 *           r
 *
 * 6. Subtract the result from 1/r, multiply by the product of the charges,
 *    and you have your potential.
 *
 * An illustrative sketch of these steps follows the routine below.
 */
static gmx_inline __m128
gmx_mm_pmecorrV_ps(__m128 z2)
{
    const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
    const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
    const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
    const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
    const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
    const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
    const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);

    const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
    const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
    const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
    const __m128  VD0      = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyVN0,polyVN1,polyVD0,polyVD1;

    z4             = _mm_mul_ps(z2,z2);

    polyVD1        = _mm_mul_ps(VD3,z4);
    polyVD0        = _mm_mul_ps(VD2,z4);
    polyVD1        = _mm_add_ps(polyVD1,VD1);
    polyVD0        = _mm_add_ps(polyVD0,VD0);
    polyVD1        = _mm_mul_ps(polyVD1,z2);
    polyVD0        = _mm_add_ps(polyVD0,polyVD1);

    polyVD0        = gmx_mm_inv_ps(polyVD0);

    polyVN0        = _mm_mul_ps(VN6,z4);
    polyVN1        = _mm_mul_ps(VN5,z4);
    polyVN0        = _mm_add_ps(polyVN0,VN4);
    polyVN1        = _mm_add_ps(polyVN1,VN3);
    polyVN0        = _mm_mul_ps(polyVN0,z4);
    polyVN1        = _mm_mul_ps(polyVN1,z4);
    polyVN0        = _mm_add_ps(polyVN0,VN2);
    polyVN1        = _mm_add_ps(polyVN1,VN1);
    polyVN0        = _mm_mul_ps(polyVN0,z4);
    polyVN1        = _mm_mul_ps(polyVN1,z2);
    polyVN0        = _mm_add_ps(polyVN0,VN0);
    polyVN0        = _mm_add_ps(polyVN0,polyVN1);

    return   _mm_mul_ps(polyVN0,polyVD0);
}
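/* Illustrative sketch of steps 1-6 above (not part of the library API; rsq,
 * beta and qq are hypothetical caller-side quantities, as in the force sketch).
 */
static gmx_inline __m128
gmx_mm_pmecorrV_ps_usage_example(__m128 rsq, __m128 beta, __m128 qq)
{
    __m128 z2   = _mm_mul_ps(_mm_mul_ps(beta,beta),rsq);       /* steps 1-2: z^2 = beta^2*r^2 */
    __m128 rinv = gmx_mm_invsqrt_ps(rsq);                      /* 1/r                         */
    __m128 corr = _mm_mul_ps(beta,gmx_mm_pmecorrV_ps(z2));     /* steps 3-5: erf(beta*r)/r    */

    /* step 6: direct-space potential qq*erfc(beta*r)/r = qq*(1/r - erf(beta*r)/r) */
    return _mm_mul_ps(qq,_mm_sub_ps(rinv,corr));
}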


static int
gmx_mm_sincos_ps(__m128 x,
                 __m128 *sinval,
                 __m128 *cosval)
{
    const __m128 two_over_pi = _mm_set1_ps(2.0/M_PI);
    const __m128 half        = _mm_set1_ps(0.5);
    const __m128 one         = _mm_set1_ps(1.0);

    const __m128i izero      = _mm_set1_epi32(0);
    const __m128i ione       = _mm_set1_epi32(1);
    const __m128i itwo       = _mm_set1_epi32(2);
    const __m128i ithree     = _mm_set1_epi32(3);
    const __m128  signbit    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );

    const __m128 CA1         = _mm_set1_ps(1.5703125f);
    const __m128 CA2         = _mm_set1_ps(4.837512969970703125e-4f);
    const __m128 CA3         = _mm_set1_ps(7.54978995489188216e-8f);

    const __m128 CC0         = _mm_set1_ps(-0.0013602249f);
    const __m128 CC1         = _mm_set1_ps(0.0416566950f);
    const __m128 CC2         = _mm_set1_ps(-0.4999990225f);
    const __m128 CS0         = _mm_set1_ps(-0.0001950727f);
    const __m128 CS1         = _mm_set1_ps(0.0083320758f);
    const __m128 CS2         = _mm_set1_ps(-0.1666665247f);

    __m128  y,y2;
    __m128  z;
    __m128i iz;
    __m128i offset_sin,offset_cos;
    __m128  tmp1,tmp2;
    __m128  mask_sin,mask_cos;
    __m128  tmp_sin,tmp_cos;

    y          = _mm_mul_ps(x,two_over_pi);
    y          = _mm_add_ps(y,_mm_or_ps(_mm_and_ps(y,signbit),half));

    iz         = _mm_cvttps_epi32(y);
    z          = _mm_round_ps(y,_MM_FROUND_TO_ZERO);

    offset_sin = _mm_and_si128(iz,ithree);
    offset_cos = _mm_add_epi32(iz,ione);

    /* Extended precision arithmetic to achieve full precision */
    y               = _mm_mul_ps(z,CA1);
    tmp1            = _mm_mul_ps(z,CA2);
    tmp2            = _mm_mul_ps(z,CA3);
    y               = _mm_sub_ps(x,y);
    y               = _mm_sub_ps(y,tmp1);
    y               = _mm_sub_ps(y,tmp2);

    y2              = _mm_mul_ps(y,y);

    tmp1            = _mm_mul_ps(CC0,y2);
    tmp1            = _mm_add_ps(tmp1,CC1);
    tmp2            = _mm_mul_ps(CS0,y2);
    tmp2            = _mm_add_ps(tmp2,CS1);
    tmp1            = _mm_mul_ps(tmp1,y2);
    tmp1            = _mm_add_ps(tmp1,CC2);
    tmp2            = _mm_mul_ps(tmp2,y2);
    tmp2            = _mm_add_ps(tmp2,CS2);

    tmp1            = _mm_mul_ps(tmp1,y2);
    tmp1            = _mm_add_ps(tmp1,one);

    tmp2            = _mm_mul_ps(tmp2,_mm_mul_ps(y,y2));
    tmp2            = _mm_add_ps(tmp2,y);

    mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,ione), izero));
    mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,ione), izero));

    tmp_sin         = _mm_blendv_ps(tmp1,tmp2,mask_sin);
    tmp_cos         = _mm_blendv_ps(tmp1,tmp2,mask_cos);

    mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,itwo), izero));
    mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,itwo), izero));

    tmp1            = _mm_xor_ps(signbit,tmp_sin);
    tmp2            = _mm_xor_ps(signbit,tmp_cos);

    *sinval         = _mm_blendv_ps(tmp1,tmp_sin,mask_sin);
    *cosval         = _mm_blendv_ps(tmp2,tmp_cos,mask_cos);

    return 0;
}

/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static __m128
gmx_mm_sin_ps(__m128 x)
{
    __m128 s,c;
    gmx_mm_sincos_ps(x,&s,&c);
    return s;
}

/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static __m128
gmx_mm_cos_ps(__m128 x)
{
    __m128 s,c;
    gmx_mm_sincos_ps(x,&s,&c);
    return c;
}


static __m128
gmx_mm_tan_ps(__m128 x)
{
    __m128 sinval,cosval;
    __m128 tanval;

    gmx_mm_sincos_ps(x,&sinval,&cosval);

    tanval = _mm_mul_ps(sinval,gmx_mm_inv_ps(cosval));

    return tanval;
}


static __m128
gmx_mm_asin_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limitlow  = _mm_set1_ps(1e-4f);
    const __m128 half      = _mm_set1_ps(0.5f);
    const __m128 one       = _mm_set1_ps(1.0f);
    const __m128 halfpi    = _mm_set1_ps(M_PI/2.0f);

    const __m128 CC5        = _mm_set1_ps(4.2163199048E-2f);
    const __m128 CC4        = _mm_set1_ps(2.4181311049E-2f);
    const __m128 CC3        = _mm_set1_ps(4.5470025998E-2f);
    const __m128 CC2        = _mm_set1_ps(7.4953002686E-2f);
    const __m128 CC1        = _mm_set1_ps(1.6666752422E-1f);

    __m128 sign;
    __m128 mask;
    __m128 xabs;
    __m128 z,z1,z2,q,q1,q2;
    __m128 pA,pB;

    sign  = _mm_andnot_ps(signmask,x);
    xabs  = _mm_and_ps(x,signmask);

    mask  = _mm_cmpgt_ps(xabs,half);

    z1    = _mm_mul_ps(half, _mm_sub_ps(one,xabs));
    q1    = _mm_mul_ps(z1,gmx_mm_invsqrt_ps(z1));
    q1    = _mm_andnot_ps(_mm_cmpeq_ps(xabs,one),q1);

    q2    = xabs;
    z2    = _mm_mul_ps(q2,q2);

    z     = _mm_or_ps( _mm_and_ps(mask,z1) , _mm_andnot_ps(mask,z2) );
    q     = _mm_or_ps( _mm_and_ps(mask,q1) , _mm_andnot_ps(mask,q2) );

    z2    = _mm_mul_ps(z,z);

    pA    = _mm_mul_ps(CC5,z2);
    pB    = _mm_mul_ps(CC4,z2);

    pA    = _mm_add_ps(pA,CC3);
    pB    = _mm_add_ps(pB,CC2);

    pA    = _mm_mul_ps(pA,z2);
    pB    = _mm_mul_ps(pB,z2);

    pA    = _mm_add_ps(pA,CC1);
    pA    = _mm_mul_ps(pA,z);

    z     = _mm_add_ps(pA,pB);
    z     = _mm_mul_ps(z,q);
    z     = _mm_add_ps(z,q);

    q2    = _mm_sub_ps(halfpi,z);
    q2    = _mm_sub_ps(q2,z);

    z     = _mm_or_ps( _mm_and_ps(mask,q2) , _mm_andnot_ps(mask,z) );

    mask  = _mm_cmpgt_ps(xabs,limitlow);
    z     = _mm_or_ps( _mm_and_ps(mask,z) , _mm_andnot_ps(mask,xabs) );

    z = _mm_xor_ps(z,sign);

    return z;
}


static __m128
gmx_mm_acos_ps(__m128 x)
{
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 one_ps    = _mm_set1_ps(1.0f);
    const __m128 half_ps   = _mm_set1_ps(0.5f);
    const __m128 pi_ps     = _mm_set1_ps(M_PI);
    const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);

    __m128 mask1;
    __m128 mask2;
    __m128 xabs;
    __m128 z,z1,z2,z3;

    xabs  = _mm_and_ps(x,signmask);
    mask1 = _mm_cmpgt_ps(xabs,half_ps);
    mask2 = _mm_cmpgt_ps(x,_mm_setzero_ps());

    z     = _mm_mul_ps(half_ps,_mm_sub_ps(one_ps,xabs));
    z     = _mm_mul_ps(z,gmx_mm_invsqrt_ps(z));
    z     = _mm_andnot_ps(_mm_cmpeq_ps(xabs,one_ps),z);

    z     = _mm_blendv_ps(x,z,mask1);
    z     = gmx_mm_asin_ps(z);

    z2    = _mm_add_ps(z,z);
    z1    = _mm_sub_ps(pi_ps,z2);
    z3    = _mm_sub_ps(halfpi_ps,z);

    z     = _mm_blendv_ps(z1,z2,mask2);
    z     = _mm_blendv_ps(z3,z,mask1);

    return z;
}


static __m128
gmx_mm_atan_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
    const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
    const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
    const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
    const __m128 mone      = _mm_set1_ps(-1.0f);
    const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
    const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
    const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1f);
    const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);

    __m128 sign;
    __m128 mask1,mask2;
    __m128 y,z1,z2;
    __m128 x2,x4;
    __m128 sum1,sum2;

    sign  = _mm_andnot_ps(signmask,x);
    x     = _mm_and_ps(x,signmask);

    mask1 = _mm_cmpgt_ps(x,limit1);
    mask2 = _mm_cmpgt_ps(x,limit2);

    z1    = _mm_mul_ps(_mm_add_ps(x,mone),gmx_mm_inv_ps(_mm_sub_ps(x,mone)));
    z2    = _mm_mul_ps(mone,gmx_mm_inv_ps(x));

    y     = _mm_and_ps(mask1,quarterpi);
    y     = _mm_blendv_ps(y,halfpi,mask2);

    x     = _mm_blendv_ps(x,z1,mask1);
    x     = _mm_blendv_ps(x,z2,mask2);

    x2    = _mm_mul_ps(x,x);
    x4    = _mm_mul_ps(x2,x2);

    sum1  = _mm_mul_ps(CC9,x4);
    sum2  = _mm_mul_ps(CC7,x4);
    sum1  = _mm_add_ps(sum1,CC5);
    sum2  = _mm_add_ps(sum2,CC3);
    sum1  = _mm_mul_ps(sum1,x4);
    sum2  = _mm_mul_ps(sum2,x2);

    sum1  = _mm_add_ps(sum1,sum2);
    sum1  = _mm_sub_ps(sum1,mone);
    sum1  = _mm_mul_ps(sum1,x);
    y     = _mm_add_ps(y,sum1);

    y     = _mm_xor_ps(y,sign);

    return y;
}


static __m128
gmx_mm_atan2_ps(__m128 y, __m128 x)
{
    const __m128 pi          = _mm_set1_ps(M_PI);
    const __m128 minuspi     = _mm_set1_ps(-M_PI);
    const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
    const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);

    __m128 z,z1,z3,z4;
    __m128 w;
    __m128 maskx_lt,maskx_eq;
    __m128 masky_lt,masky_eq;
    __m128 mask1,mask2,mask3,mask4,maskall;

    maskx_lt  = _mm_cmplt_ps(x,_mm_setzero_ps());
    masky_lt  = _mm_cmplt_ps(y,_mm_setzero_ps());
    maskx_eq  = _mm_cmpeq_ps(x,_mm_setzero_ps());
    masky_eq  = _mm_cmpeq_ps(y,_mm_setzero_ps());

    z         = _mm_mul_ps(y,gmx_mm_inv_ps(x));
    z         = gmx_mm_atan_ps(z);

    mask1     = _mm_and_ps(maskx_eq,masky_lt);
    mask2     = _mm_andnot_ps(maskx_lt,masky_eq);
    mask3     = _mm_andnot_ps( _mm_or_ps(masky_lt,masky_eq) , maskx_eq);
    mask4     = _mm_and_ps(masky_eq,maskx_lt);

    maskall   = _mm_or_ps( _mm_or_ps(mask1,mask2), _mm_or_ps(mask3,mask4) );

    z         = _mm_andnot_ps(maskall,z);
    z1        = _mm_and_ps(mask1,minushalfpi);
    z3        = _mm_and_ps(mask3,halfpi);
    z4        = _mm_and_ps(mask4,pi);

    z         = _mm_or_ps( _mm_or_ps(z,z1), _mm_or_ps(z3,z4) );

    mask1     = _mm_andnot_ps(masky_lt,maskx_lt);
    mask2     = _mm_and_ps(maskx_lt,masky_lt);

    w         = _mm_or_ps( _mm_and_ps(mask1,pi), _mm_and_ps(mask2,minuspi) );
    w         = _mm_andnot_ps(maskall,w);

    z         = _mm_add_ps(z,w);

    return z;
}



#endif /* _gmx_math_x86_sse4_1_single_h_ */