include/gmx_math_x86_avx_128_fma_single.h

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  * This file is part of GROMACS.
   5  * Copyright (c) 2012-
   6  *
   7  * Written by the Gromacs development team under coordination of
   8  * David van der Spoel, Berk Hess, and Erik Lindahl.
   9  *
  10  * This library is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2
  13  * of the License, or (at your option) any later version.
  14  *
  15  * To help us fund GROMACS development, we humbly ask that you cite
  16  * the research papers on the package. Check out http://www.gromacs.org
  17  *
  18  * And Hey:
  19  * Gnomes, ROck Monsters And Chili Sauce
  20  */
  21 #ifndef _gmx_math_x86_avx_128_fma_single_h_
  22 #define _gmx_math_x86_avx_128_fma_single_h_
  23
  24 #include <immintrin.h> /* AVX */
  25 #ifdef HAVE_X86INTRIN_H
  26 #include <x86intrin.h> /* FMA */
  27 #endif
  28 #ifdef HAVE_INTRIN_H
  29 #include <intrin.h> /* FMA MSVC */
  30 #endif
  31
  32 #include <math.h>
  33
  34 #include "gmx_x86_avx_128_fma.h"
  35
  36
  37 #ifndef M_PI
  38 #  define M_PI 3.14159265358979323846264338327950288
  39 #endif
  40
  41
  42
  43
  44 /************************
  45  *                      *
  46  * Simple math routines *
  47  *                      *
  48  ************************/
  49
  50 /* 1.0/sqrt(x) */
  51 static gmx_inline __m128
  52 gmx_mm_invsqrt_ps(__m128 x)
  53 {
  54     const __m128 half  = _mm_set1_ps(0.5);
  55     const __m128 one   = _mm_set1_ps(1.0);
  56
  57     __m128 lu = _mm_rsqrt_ps(x);
  58
  59     return _mm_macc_ps(_mm_nmacc_ps(x,_mm_mul_ps(lu,lu),one),_mm_mul_ps(lu,half),lu);
  60 }
  61
  62 /* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
  63 static gmx_inline __m128
  64 gmx_mm_sqrt_ps(__m128 x)
  65 {
  66     __m128 mask;
  67     __m128 res;
  68
  69     mask = _mm_cmp_ps(x,_mm_setzero_ps(),_CMP_EQ_OQ);
  70     res  = _mm_andnot_ps(mask,gmx_mm_invsqrt_ps(x));
  71
  72     res  = _mm_mul_ps(x,res);
  73
  74     return res;
  75 }
  76
  77 /* 1.0/x */
  78 static gmx_inline __m128
  79 gmx_mm_inv_ps(__m128 x)
  80 {
  81     const __m128 two = _mm_set1_ps(2.0);
  82
  83     __m128 lu = _mm_rcp_ps(x);
  84
  85     return _mm_mul_ps(lu,_mm_nmacc_ps(lu,x,two));
  86 }
  87
  88 static gmx_inline __m128
  89 gmx_mm_abs_ps(__m128 x)
  90 {
  91     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
  92
  93     return _mm_and_ps(x,signmask);
  94 }
  95
  96 static __m128
  97 gmx_mm_log_ps(__m128 x)
  98 {
  99     /* Same algorithm as cephes library */
 100     const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
 101     const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
 102     const __m128  half       = _mm_set1_ps(0.5f);
 103     const __m128  one        = _mm_set1_ps(1.0f);
 104     const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
 105     const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
 106     const __m128  corr2      = _mm_set1_ps(0.693359375f);
 107
 108     const __m128 CA_1        = _mm_set1_ps(0.070376836292f);
 109     const __m128 CB_0        = _mm_set1_ps(1.6714950086782716f);
 110     const __m128 CB_1        = _mm_set1_ps(-2.452088066061482f);
 111     const __m128 CC_0        = _mm_set1_ps(1.5220770854701728f);
 112     const __m128 CC_1        = _mm_set1_ps(-1.3422238433233642f);
 113     const __m128 CD_0        = _mm_set1_ps(1.386218787509749f);
 114     const __m128 CD_1        = _mm_set1_ps(0.35075468953796346f);
 115     const __m128 CE_0        = _mm_set1_ps(1.3429983063133937f);
 116     const __m128 CE_1        = _mm_set1_ps(1.807420826584643f);
 117
 118     __m128  fexp,fexp1;
 119     __m128i iexp;
 120     __m128  mask;
 121     __m128  x1,x2;
 122     __m128  y;
 123     __m128  pA,pB,pC,pD,pE,tB,tC,tD,tE;
 124
 125     /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
 126     fexp  = _mm_and_ps(x,expmask);
 127     iexp  = gmx_mm_castps_si128(fexp);
 128     iexp  = _mm_srli_epi32(iexp,23);
 129     iexp  = _mm_sub_epi32(iexp,expbase_m1);
 130
 131     x     = _mm_andnot_ps(expmask,x);
 132     x     = _mm_or_ps(x,one);
 133     x     = _mm_mul_ps(x,half);
 134
 135     mask  = _mm_cmp_ps(x,invsq2,_CMP_LT_OQ);
 136
 137     x     = _mm_add_ps(x,_mm_and_ps(mask,x));
 138     x     = _mm_sub_ps(x,one);
 139     iexp  = _mm_add_epi32(iexp,gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
 140
 141     x2    = _mm_mul_ps(x,x);
 142
 143     pA    = _mm_mul_ps(CA_1,x);
 144
 145     pB    = _mm_add_ps(x,CB_1);
 146     pC    = _mm_add_ps(x,CC_1);
 147     pD    = _mm_add_ps(x,CD_1);
 148     pE    = _mm_add_ps(x,CE_1);
 149
 150     pB    = _mm_macc_ps(x,pB,CB_0);
 151     pC    = _mm_macc_ps(x,pC,CC_0);
 152     pD    = _mm_macc_ps(x,pD,CD_0);
 153     pE    = _mm_macc_ps(x,pE,CE_0);
 154
 155     pA    = _mm_mul_ps(pA,pB);
 156     pC    = _mm_mul_ps(pC,pD);
 157     pE    = _mm_mul_ps(pE,x2);
 158     pA    = _mm_mul_ps(pA,pC);
 159     y     = _mm_mul_ps(pA,pE);
 160
 161     fexp  = _mm_cvtepi32_ps(iexp);
 162     y     = _mm_macc_ps(fexp,corr1,y);
 163     y     = _mm_nmacc_ps(half,x2,y);
 164
 165     x2    = _mm_add_ps(x,y);
 166     x2    = _mm_macc_ps(fexp,corr2,x2);
 167
 168     return x2;
 169 }
 170
 171
 172 /*
 173  * 2^x function.
 174  *
 175  * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
 176  * [-0.5,0.5]. The coefficiencts of this was derived in Mathematica using the command:
 177  *
 178  * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 179  *
 180  * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 181  * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 182  * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 183  * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 184  * number instead. That would take a few extra cycles and not really help, since something is
 185  * wrong if you are using single precision to work with numbers that cannot really be represented
 186  * in single precision.
 187  *
 188  * The accuracy is at least 23 bits.
 189  */
 190 static __m128
 191 gmx_mm_exp2_ps(__m128 x)
 192 {
 193     /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
 194     const __m128  arglimit = _mm_set1_ps(126.0f);
 195
 196     const __m128i expbase  = _mm_set1_epi32(127);
 197     const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
 198     const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
 199     const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
 200     const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
 201     const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
 202     const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
 203     const __m128  CA0      = _mm_set1_ps(1.0f);
 204
 205     __m128  valuemask;
 206     __m128i iexppart;
 207     __m128  fexppart;
 208     __m128  intpart;
 209     __m128  x2;
 210     __m128  p0,p1;
 211
 212     iexppart  = _mm_cvtps_epi32(x);
 213     intpart   = _mm_round_ps(x,_MM_FROUND_TO_NEAREST_INT);
 214     iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
 215     valuemask = _mm_cmp_ps(arglimit,gmx_mm_abs_ps(x),_CMP_GE_OQ);
 216     fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));
 217
 218     x         = _mm_sub_ps(x,intpart);
 219     x2        = _mm_mul_ps(x,x);
 220
 221     p0        = _mm_macc_ps(CA6,x2,CA4);
 222     p1        = _mm_macc_ps(CA5,x2,CA3);
 223     p0        = _mm_macc_ps(p0,x2,CA2);
 224     p1        = _mm_macc_ps(p1,x2,CA1);
 225     p0        = _mm_macc_ps(p0,x2,CA0);
 226     p0        = _mm_macc_ps(p1,x,p0);
 227     x         = _mm_mul_ps(p0,fexppart);
 228
 229     return  x;
 230 }
 231
 232
 233 /* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
 234  * but there will then be a small rounding error since we lose some precision due to the
 235  * multiplication. This will then be magnified a lot by the exponential.
 236  *
 237  * Instead, we calculate the fractional part directly as a minimax approximation of
 238  * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
 239  * remaining after 2^y, which avoids the precision-loss.
 240  * The final result is correct to within 1 LSB over the entire argument range.
 241  */
 242 static __m128
 243 gmx_mm_exp_ps(__m128 x)
 244 {
 245     const __m128  argscale      = _mm_set1_ps(1.44269504088896341f);
 246     /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
 247     const __m128  arglimit      = _mm_set1_ps(126.0f);
 248     const __m128i expbase       = _mm_set1_epi32(127);
 249
 250     const __m128  invargscale0  = _mm_set1_ps(0.693359375f);
 251     const __m128  invargscale1  = _mm_set1_ps(-2.12194440e-4f);
 252
 253     const __m128  CC5           = _mm_set1_ps(1.9875691500e-4f);
 254     const __m128  CC4           = _mm_set1_ps(1.3981999507e-3f);
 255     const __m128  CC3           = _mm_set1_ps(8.3334519073e-3f);
 256     const __m128  CC2           = _mm_set1_ps(4.1665795894e-2f);
 257     const __m128  CC1           = _mm_set1_ps(1.6666665459e-1f);
 258     const __m128  CC0           = _mm_set1_ps(5.0000001201e-1f);
 259     const __m128  one           = _mm_set1_ps(1.0f);
 260
 261     __m128  y,x2;
 262     __m128  p0,p1;
 263     __m128  valuemask;
 264     __m128i iexppart;
 265     __m128  fexppart;
 266     __m128  intpart;
 267
 268     y = _mm_mul_ps(x,argscale);
 269
 270     iexppart  = _mm_cvtps_epi32(y);
 271     intpart   = _mm_round_ps(y,_MM_FROUND_TO_NEAREST_INT);
 272
 273     iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
 274     valuemask = _mm_cmp_ps(arglimit,gmx_mm_abs_ps(y),_CMP_GE_OQ);
 275     fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));
 276
 277     /* Extended precision arithmetics */
 278     x         = _mm_nmacc_ps(invargscale0,intpart,x);
 279     x         = _mm_nmacc_ps(invargscale1,intpart,x);
 280
 281     x2        = _mm_mul_ps(x,x);
 282
 283     p1        = _mm_macc_ps(CC5,x2,CC3);
 284     p0        = _mm_macc_ps(CC4,x2,CC2);
 285     p1        = _mm_macc_ps(p1,x2,CC1);
 286     p0        = _mm_macc_ps(p0,x2,CC0);
 287     p0        = _mm_macc_ps(p1,x,p0);
 288     p0        = _mm_macc_ps(p0,x2,one);
 289
 290     x         = _mm_add_ps(x,p0);
 291
 292     x         = _mm_mul_ps(x,fexppart);
 293
 294     return x;
 295 }
 296
 297 /* FULL precision. Only errors in LSB */
 298 static __m128
 299 gmx_mm_erf_ps(__m128 x)
 300 {
 301     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 302     const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
 303     const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
 304     const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
 305     const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
 306     const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
 307     const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
 308     const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
 309     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 310     const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
 311     const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
 312     const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
 313     const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
 314     const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
 315     const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
 316     const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
 317     const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
 318     const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
 319     const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
 320     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 321     const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
 322     const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
 323     const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
 324     const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
 325     const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
 326     const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
 327     const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
 328     const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
 329     const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
 330     const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
 331     const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
 332
 333     /* Coefficients for expansion of exp(x) in [0,0.1] */
 334     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
 335     const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
 336     const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
 337     const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
 338
 339     const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
 340     const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 341     const __m128  one      = _mm_set1_ps(1.0f);
 342     const __m128  two      = _mm_set1_ps(2.0f);
 343
 344     __m128 x2,x4,y;
 345     __m128 z,q,t,t2,w,w2;
 346     __m128 pA0,pA1,pB0,pB1,pC0,pC1;
 347     __m128 expmx2,corr;
 348     __m128 res_erf,res_erfc,res;
 349     __m128 mask;
 350
 351     /* Calculate erf() */
 352     x2     = _mm_mul_ps(x,x);
 353     x4     = _mm_mul_ps(x2,x2);
 354
 355     pA0  = _mm_macc_ps(CA6,x4,CA4);
 356     pA1  = _mm_macc_ps(CA5,x4,CA3);
 357     pA0  = _mm_macc_ps(pA0,x4,CA2);
 358     pA1  = _mm_macc_ps(pA1,x4,CA1);
 359     pA0  = _mm_mul_ps(pA0,x4);
 360     pA0  = _mm_macc_ps(pA1,x2,pA0);
 361     /* Constant term must come last for precision reasons */
 362     pA0  = _mm_add_ps(pA0,CA0);
 363
 364     res_erf = _mm_mul_ps(x,pA0);
 365
 366     /* Calculate erfc */
 367
 368     y       = gmx_mm_abs_ps(x);
 369     t       = gmx_mm_inv_ps(y);
 370     w       = _mm_sub_ps(t,one);
 371     t2      = _mm_mul_ps(t,t);
 372     w2      = _mm_mul_ps(w,w);
 373     /*
 374      * We cannot simply calculate exp(-x2) directly in single precision, since
 375      * that will lose a couple of bits of precision due to the multiplication.
 376      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
 377      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
 378      *
 379      * The only drawback with this is that it requires TWO separate exponential
 380      * evaluations, which would be horrible performance-wise. However, the argument
 381      * for the second exp() call is always small, so there we simply use a
 382      * low-order minimax expansion on [0,0.1].
 383      */
 384
 385     z       = _mm_and_ps(y,sieve);
 386     q       = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );
 387
 388     corr    = _mm_macc_ps(CD4,q,CD3);
 389     corr    = _mm_macc_ps(corr,q,CD2);
 390     corr    = _mm_macc_ps(corr,q,one);
 391     corr    = _mm_macc_ps(corr,q,one);
 392
 393     expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
 394     expmx2  = _mm_mul_ps(expmx2,corr);
 395
 396     pB1  = _mm_macc_ps(CB9,w2,CB7);
 397     pB0  = _mm_macc_ps(CB8,w2,CB6);
 398     pB1  = _mm_macc_ps(pB1,w2,CB5);
 399     pB0  = _mm_macc_ps(pB0,w2,CB4);
 400     pB1  = _mm_macc_ps(pB1,w2,CB3);
 401     pB0  = _mm_macc_ps(pB0,w2,CB2);
 402     pB1  = _mm_macc_ps(pB1,w2,CB1);
 403     pB0  = _mm_macc_ps(pB0,w2,CB0);
 404     pB0  = _mm_macc_ps(pB1,w,pB0);
 405
 406     pC0  = _mm_macc_ps(CC10,t2,CC8);
 407     pC1  = _mm_macc_ps(CC9,t2,CC7);
 408     pC0  = _mm_macc_ps(pC0,t2,CC6);
 409     pC1  = _mm_macc_ps(pC1,t2,CC5);
 410     pC0  = _mm_macc_ps(pC0,t2,CC4);
 411     pC1  = _mm_macc_ps(pC1,t2,CC3);
 412     pC0  = _mm_macc_ps(pC0,t2,CC2);
 413     pC1  = _mm_macc_ps(pC1,t2,CC1);
 414
 415     pC0  = _mm_macc_ps(pC0,t2,CC0);
 416     pC0  = _mm_macc_ps(pC1,t,pC0);
 417     pC0  = _mm_mul_ps(pC0,t);
 418
 419     /* SELECT pB0 or pC0 for erfc() */
 420     mask = _mm_cmp_ps(two,y,_CMP_LT_OQ);
 421     res_erfc = _mm_blendv_ps(pB0,pC0,mask);
 422     res_erfc = _mm_mul_ps(res_erfc,expmx2);
 423
 424     /* erfc(x<0) = 2-erfc(|x|) */
 425     mask = _mm_cmp_ps(x,_mm_setzero_ps(),_CMP_LT_OQ);
 426     res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);
 427
 428     /* Select erf() or erfc() */
 429     mask = _mm_cmp_ps(y,_mm_set1_ps(0.75f),_CMP_LT_OQ);
 430     res  = _mm_blendv_ps(_mm_sub_ps(one,res_erfc),res_erf,mask);
 431
 432     return res;
 433 }
 434
 435
 436 /* FULL precision. Only errors in LSB */
 437 static __m128
 438 gmx_mm_erfc_ps(__m128 x)
 439 {
 440     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 441     const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
 442     const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
 443     const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
 444     const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
 445     const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
 446     const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
 447     const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
 448     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 449     const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
 450     const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
 451     const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
 452     const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
 453     const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
 454     const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
 455     const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
 456     const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
 457     const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
 458     const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
 459     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 460     const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
 461     const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
 462     const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
 463     const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
 464     const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
 465     const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
 466     const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
 467     const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
 468     const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
 469     const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
 470     const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
 471
 472     /* Coefficients for expansion of exp(x) in [0,0.1] */
 473     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
 474     const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
 475     const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
 476     const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
 477
 478     const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
 479     const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 480     const __m128  one      = _mm_set1_ps(1.0f);
 481     const __m128  two      = _mm_set1_ps(2.0f);
 482
 483     __m128 x2,x4,y;
 484     __m128 z,q,t,t2,w,w2;
 485     __m128 pA0,pA1,pB0,pB1,pC0,pC1;
 486     __m128 expmx2,corr;
 487     __m128 res_erf,res_erfc,res;
 488     __m128 mask;
 489
 490     /* Calculate erf() */
 491     x2     = _mm_mul_ps(x,x);
 492     x4     = _mm_mul_ps(x2,x2);
 493
 494     pA0  = _mm_macc_ps(CA6,x4,CA4);
 495     pA1  = _mm_macc_ps(CA5,x4,CA3);
 496     pA0  = _mm_macc_ps(pA0,x4,CA2);
 497     pA1  = _mm_macc_ps(pA1,x4,CA1);
 498     pA1  = _mm_mul_ps(pA1,x2);
 499     pA0  = _mm_macc_ps(pA0,x4,pA1);
 500     /* Constant term must come last for precision reasons */
 501     pA0  = _mm_add_ps(pA0,CA0);
 502
 503     res_erf = _mm_mul_ps(x,pA0);
 504
 505     /* Calculate erfc */
 506     y       = gmx_mm_abs_ps(x);
 507     t       = gmx_mm_inv_ps(y);
 508     w       = _mm_sub_ps(t,one);
 509     t2      = _mm_mul_ps(t,t);
 510     w2      = _mm_mul_ps(w,w);
 511     /*
 512      * We cannot simply calculate exp(-x2) directly in single precision, since
 513      * that will lose a couple of bits of precision due to the multiplication.
 514      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
 515      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
 516      *
 517      * The only drawback with this is that it requires TWO separate exponential
 518      * evaluations, which would be horrible performance-wise. However, the argument
 519      * for the second exp() call is always small, so there we simply use a
 520      * low-order minimax expansion on [0,0.1].
 521      */
 522
 523     z       = _mm_and_ps(y,sieve);
 524     q       = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );
 525
 526     corr    = _mm_macc_ps(CD4,q,CD3);
 527     corr    = _mm_macc_ps(corr,q,CD2);
 528     corr    = _mm_macc_ps(corr,q,one);
 529     corr    = _mm_macc_ps(corr,q,one);
 530
 531     expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
 532     expmx2  = _mm_mul_ps(expmx2,corr);
 533
 534     pB1  = _mm_macc_ps(CB9,w2,CB7);
 535     pB0  = _mm_macc_ps(CB8,w2,CB6);
 536     pB1  = _mm_macc_ps(pB1,w2,CB5);
 537     pB0  = _mm_macc_ps(pB0,w2,CB4);
 538     pB1  = _mm_macc_ps(pB1,w2,CB3);
 539     pB0  = _mm_macc_ps(pB0,w2,CB2);
 540     pB1  = _mm_macc_ps(pB1,w2,CB1);
 541     pB0  = _mm_macc_ps(pB0,w2,CB0);
 542     pB0  = _mm_macc_ps(pB1,w,pB0);
 543
 544     pC0  = _mm_macc_ps(CC10,t2,CC8);
 545     pC1  = _mm_macc_ps(CC9,t2,CC7);
 546     pC0  = _mm_macc_ps(pC0,t2,CC6);
 547     pC1  = _mm_macc_ps(pC1,t2,CC5);
 548     pC0  = _mm_macc_ps(pC0,t2,CC4);
 549     pC1  = _mm_macc_ps(pC1,t2,CC3);
 550     pC0  = _mm_macc_ps(pC0,t2,CC2);
 551     pC1  = _mm_macc_ps(pC1,t2,CC1);
 552
 553     pC0  = _mm_macc_ps(pC0,t2,CC0);
 554     pC0  = _mm_macc_ps(pC1,t,pC0);
 555     pC0  = _mm_mul_ps(pC0,t);
 556
 557     /* SELECT pB0 or pC0 for erfc() */
 558     mask = _mm_cmp_ps(two,y,_CMP_LT_OQ);
 559     res_erfc = _mm_blendv_ps(pB0,pC0,mask);
 560     res_erfc = _mm_mul_ps(res_erfc,expmx2);
 561
 562     /* erfc(x<0) = 2-erfc(|x|) */
 563     mask = _mm_cmp_ps(x,_mm_setzero_ps(),_CMP_LT_OQ);
 564     res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);
 565
 566     /* Select erf() or erfc() */
 567     mask = _mm_cmp_ps(y,_mm_set1_ps(0.75f),_CMP_LT_OQ);
 568     res  = _mm_blendv_ps(res_erfc,_mm_sub_ps(one,res_erf),mask);
 569
 570     return res;
 571 }
 572
 573
 574 /* Calculate the force correction due to PME analytically.
 575  *
 576  * This routine is meant to enable analytical evaluation of the
 577  * direct-space PME electrostatic force to avoid tables.
 578  *
 579  * The direct-space potential should be Erfc(beta*r)/r, but there
 580  * are some problems evaluating that:
 581  *
 582  * First, the error function is difficult (read: expensive) to
 583  * approxmiate accurately for intermediate to large arguments, and
 584  * this happens already in ranges of beta*r that occur in simulations.
 585  * Second, we now try to avoid calculating potentials in Gromacs but
 586  * use forces directly.
 587  *
 588  * We can simply things slight by noting that the PME part is really
 589  * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
 590  *
 591  * V= 1/r - Erf(beta*r)/r
 592  *
 593  * The first term we already have from the inverse square root, so
 594  * that we can leave out of this routine.
 595  *
 596  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
 597  * the argument beta*r will be in the range 0.15 to ~4. Use your
 598  * favorite plotting program to realize how well-behaved Erf(z)/z is
 599  * in this range!
 600  *
 601  * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
 602  * However, it turns out it is more efficient to approximate f(z)/z and
 603  * then only use even powers. This is another minor optimization, since
 604  * we actually WANT f(z)/z, because it is going to be multiplied by
 605  * the vector between the two atoms to get the vectorial force. The
 606  * fastest flops are the ones we can avoid calculating!
 607  *
 608  * So, here's how it should be used:
 609  *
 610  * 1. Calculate r^2.
 611  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 612  * 3. Evaluate this routine with z^2 as the argument.
 613  * 4. The return value is the expression:
 614  *
 615  *
 616  *       2*exp(-z^2)     erf(z)
 617  *       ------------ - --------
 618  *       sqrt(Pi)*z^2      z^3
 619  *
 620  * 5. Multiply the entire expression by beta^3. This will get you
 621  *
 622  *       beta^3*2*exp(-z^2)     beta^3*erf(z)
 623  *       ------------------  - ---------------
 624  *          sqrt(Pi)*z^2            z^3
 625  *
 626  *    or, switching back to r (z=r*beta):
 627  *
 628  *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
 629  *       ----------------------- - -----------
 630  *            sqrt(Pi)*r^2            r^3
 631  *
 632  *
 633  *    With a bit of math exercise you should be able to confirm that
 634  *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
 635  *
 636  * 6. Add the result to 1/r^3, multiply by the product of the charges,
 637  *    and you have your force (divided by r). A final multiplication
 638  *    with the vector connecting the two particles and you have your
 639  *    vectorial force to add to the particles.
 640  *
 641  */
 642 static __m128
 643 gmx_mm_pmecorrF_ps(__m128 z2)
 644 {
 645     const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
 646     const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
 647     const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
 648     const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
 649     const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
 650     const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
 651     const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);
 652
 653     const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
 654     const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
 655     const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
 656     const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
 657     const __m128  FD0      = _mm_set1_ps(1.0f);
 658
 659     __m128 z4;
 660     __m128 polyFN0,polyFN1,polyFD0,polyFD1;
 661
 662     z4             = _mm_mul_ps(z2,z2);
 663
 664     polyFD0        = _mm_macc_ps(FD4,z4,FD2);
 665     polyFD1        = _mm_macc_ps(FD3,z4,FD1);
 666     polyFD0        = _mm_macc_ps(polyFD0,z4,FD0);
 667     polyFD0        = _mm_macc_ps(polyFD1,z2,polyFD0);
 668
 669     polyFD0        = gmx_mm_inv_ps(polyFD0);
 670
 671     polyFN0        = _mm_macc_ps(FN6,z4,FN4);
 672     polyFN1        = _mm_macc_ps(FN5,z4,FN3);
 673     polyFN0        = _mm_macc_ps(polyFN0,z4,FN2);
 674     polyFN1        = _mm_macc_ps(polyFN1,z4,FN1);
 675     polyFN0        = _mm_macc_ps(polyFN0,z4,FN0);
 676     polyFN0        = _mm_macc_ps(polyFN1,z2,polyFN0);
 677
 678     return   _mm_mul_ps(polyFN0,polyFD0);
 679 }
 680
 681
 682
 683
 684 /* Calculate the potential correction due to PME analytically.
 685  *
 686  * See gmx_mm256_pmecorrF_ps() for details about the approximation.
 687  *
 688  * This routine calculates Erf(z)/z, although you should provide z^2
 689  * as the input argument.
 690  *
 691  * Here's how it should be used:
 692  *
 693  * 1. Calculate r^2.
 694  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 695  * 3. Evaluate this routine with z^2 as the argument.
 696  * 4. The return value is the expression:
 697  *
 698  *
 699  *        erf(z)
 700  *       --------
 701  *          z
 702  *
 703  * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
 704  *
 705  *       erf(r*beta)
 706  *       -----------
 707  *           r
 708  *
 709  * 6. Add the result to 1/r, multiply by the product of the charges,
 710  *    and you have your potential.
 711  */
 712 static __m128
 713 gmx_mm_pmecorrV_ps(__m128 z2)
 714 {
 715     const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
 716     const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
 717     const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
 718     const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
 719     const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
 720     const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
 721     const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);
 722
 723     const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
 724     const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
 725     const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
 726     const __m128  VD0      = _mm_set1_ps(1.0f);
 727
 728     __m128 z4;
 729     __m128 polyVN0,polyVN1,polyVD0,polyVD1;
 730
 731     z4             = _mm_mul_ps(z2,z2);
 732
 733     polyVD1        = _mm_macc_ps(VD3,z4,VD1);
 734     polyVD0        = _mm_macc_ps(VD2,z4,VD0);
 735     polyVD0        = _mm_macc_ps(polyVD1,z2,polyVD0);
 736
 737     polyVD0        = gmx_mm_inv_ps(polyVD0);
 738
 739     polyVN0        = _mm_macc_ps(VN6,z4,VN4);
 740     polyVN1        = _mm_macc_ps(VN5,z4,VN3);
 741     polyVN0        = _mm_macc_ps(polyVN0,z4,VN2);
 742     polyVN1        = _mm_macc_ps(polyVN1,z4,VN1);
 743     polyVN0        = _mm_macc_ps(polyVN0,z4,VN0);
 744     polyVN0        = _mm_macc_ps(polyVN1,z2,polyVN0);
 745
 746     return   _mm_mul_ps(polyVN0,polyVD0);
 747 }
 748
 749
 750
 751 static int
 752 gmx_mm_sincos_ps(__m128 x,
 753                  __m128 *sinval,
 754                  __m128 *cosval)
 755 {
 756     const __m128 two_over_pi = _mm_set1_ps(2.0/M_PI);
 757     const __m128 half        = _mm_set1_ps(0.5);
 758     const __m128 one         = _mm_set1_ps(1.0);
 759
 760     const __m128i izero      = _mm_set1_epi32(0);
 761     const __m128i ione       = _mm_set1_epi32(1);
 762     const __m128i itwo       = _mm_set1_epi32(2);
 763     const __m128i ithree     = _mm_set1_epi32(3);
 764     const __m128  signbit    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 765
 766     const __m128 CA1         = _mm_set1_ps(1.5703125f);
 767     const __m128 CA2         = _mm_set1_ps(4.837512969970703125e-4f);
 768     const __m128 CA3         = _mm_set1_ps(7.54978995489188216e-8f);
 769
 770     const __m128 CC0         = _mm_set1_ps(-0.0013602249f);
 771     const __m128 CC1         = _mm_set1_ps(0.0416566950f);
 772     const __m128 CC2         = _mm_set1_ps(-0.4999990225f);
 773     const __m128 CS0         = _mm_set1_ps(-0.0001950727f);
 774     const __m128 CS1         = _mm_set1_ps(0.0083320758f);
 775     const __m128 CS2         = _mm_set1_ps(-0.1666665247f);
 776
 777     __m128  y,y2;
 778     __m128  z;
 779     __m128i iz;
 780     __m128i offset_sin,offset_cos;
 781     __m128  tmp1,tmp2;
 782     __m128  mask_sin,mask_cos;
 783     __m128  tmp_sin,tmp_cos;
 784
 785     y          = _mm_mul_ps(x,two_over_pi);
 786     y          = _mm_add_ps(y,_mm_or_ps(_mm_and_ps(y,signbit),half));
 787
 788     iz         = _mm_cvttps_epi32(y);
 789     z          = _mm_round_ps(y,_MM_FROUND_TO_ZERO);
 790
 791     offset_sin = _mm_and_si128(iz,ithree);
 792     offset_cos = _mm_add_epi32(iz,ione);
 793
 794     /* Extended precision arithmethic to achieve full precision */
 795     y               = _mm_nmacc_ps(z,CA1,x);
 796     y               = _mm_nmacc_ps(z,CA2,y);
 797     y               = _mm_nmacc_ps(z,CA3,y);
 798
 799     y2              = _mm_mul_ps(y,y);
 800
 801     tmp1            = _mm_macc_ps(CC0,y2,CC1);
 802     tmp2            = _mm_macc_ps(CS0,y2,CS1);
 803     tmp1            = _mm_macc_ps(tmp1,y2,CC2);
 804     tmp2            = _mm_macc_ps(tmp2,y2,CS2);
 805
 806     tmp1            = _mm_macc_ps(tmp1,y2,one);
 807
 808     tmp2            = _mm_macc_ps(tmp2,_mm_mul_ps(y,y2),y);
 809
 810     mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,ione), izero));
 811     mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,ione), izero));
 812
 813     tmp_sin         = _mm_blendv_ps(tmp1,tmp2,mask_sin);
 814     tmp_cos         = _mm_blendv_ps(tmp1,tmp2,mask_cos);
 815
 816     mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,itwo), izero));
 817     mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,itwo), izero));
 818
 819     tmp1            = _mm_xor_ps(signbit,tmp_sin);
 820     tmp2            = _mm_xor_ps(signbit,tmp_cos);
 821
 822     *sinval         = _mm_blendv_ps(tmp1,tmp_sin,mask_sin);
 823     *cosval         = _mm_blendv_ps(tmp2,tmp_cos,mask_cos);
 824
 825     return 0;
 826 }
 827
 828 /*
 829  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 830  * will then call the sincos() routine and waste a factor 2 in performance!
 831  */
 832 static __m128
 833 gmx_mm_sin_ps(__m128 x)
 834 {
 835     __m128 s,c;
 836     gmx_mm_sincos_ps(x,&s,&c);
 837     return s;
 838 }
 839
 840 /*
 841  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 842  * will then call the sincos() routine and waste a factor 2 in performance!
 843  */
 844 static __m128
 845 gmx_mm_cos_ps(__m128 x)
 846 {
 847     __m128 s,c;
 848     gmx_mm_sincos_ps(x,&s,&c);
 849     return c;
 850 }
 851
 852
 853 static __m128
 854 gmx_mm_tan_ps(__m128 x)
 855 {
 856     __m128 sinval,cosval;
 857     __m128 tanval;
 858
 859     gmx_mm_sincos_ps(x,&sinval,&cosval);
 860
 861     tanval = _mm_mul_ps(sinval,gmx_mm_inv_ps(cosval));
 862
 863     return tanval;
 864 }
 865
 866
 867 static __m128
 868 gmx_mm_asin_ps(__m128 x)
 869 {
 870     /* Same algorithm as cephes library */
 871     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
 872     const __m128 limitlow  = _mm_set1_ps(1e-4f);
 873     const __m128 half      = _mm_set1_ps(0.5f);
 874     const __m128 one       = _mm_set1_ps(1.0f);
 875     const __m128 halfpi    = _mm_set1_ps(M_PI/2.0f);
 876
 877     const __m128 CC5        = _mm_set1_ps(4.2163199048E-2f);
 878     const __m128 CC4        = _mm_set1_ps(2.4181311049E-2f);
 879     const __m128 CC3        = _mm_set1_ps(4.5470025998E-2f);
 880     const __m128 CC2        = _mm_set1_ps(7.4953002686E-2f);
 881     const __m128 CC1        = _mm_set1_ps(1.6666752422E-1f);
 882
 883     __m128 sign;
 884     __m128 mask;
 885     __m128 xabs;
 886     __m128 z,z1,z2,q,q1,q2;
 887     __m128 pA,pB;
 888
 889     sign  = _mm_andnot_ps(signmask,x);
 890     xabs  = _mm_and_ps(x,signmask);
 891
 892     mask  = _mm_cmp_ps(xabs,half,_CMP_GT_OQ);
 893
 894     z1    = _mm_mul_ps(half, _mm_sub_ps(one,xabs));
 895     q1    = _mm_mul_ps(z1,gmx_mm_invsqrt_ps(z1));
 896     q1    = _mm_andnot_ps(_mm_cmp_ps(xabs,one,_CMP_EQ_OQ),q1);
 897
 898     q2    = xabs;
 899     z2    = _mm_mul_ps(q2,q2);
 900
 901     z     = _mm_or_ps( _mm_and_ps(mask,z1) , _mm_andnot_ps(mask,z2) );
 902     q     = _mm_or_ps( _mm_and_ps(mask,q1) , _mm_andnot_ps(mask,q2) );
 903
 904     z2    = _mm_mul_ps(z,z);
 905
 906     pA    = _mm_macc_ps(CC5,z2,CC3);
 907     pB    = _mm_macc_ps(CC4,z2,CC2);
 908
 909     pA    = _mm_macc_ps(pA,z2,CC1);
 910     pA    = _mm_mul_ps(pA,z);
 911
 912     z     = _mm_macc_ps(pB,z2,pA);
 913
 914     z     = _mm_macc_ps(z,q,q);
 915
 916     q2    = _mm_sub_ps(halfpi,z);
 917     q2    = _mm_sub_ps(q2,z);
 918
 919     z     = _mm_or_ps( _mm_and_ps(mask,q2) , _mm_andnot_ps(mask,z) );
 920
 921     mask  = _mm_cmp_ps(xabs,limitlow,_CMP_GT_OQ);
 922     z     = _mm_or_ps( _mm_and_ps(mask,z) , _mm_andnot_ps(mask,xabs) );
 923
 924     z = _mm_xor_ps(z,sign);
 925
 926     return z;
 927 }
 928
 929
 930 static __m128
 931 gmx_mm_acos_ps(__m128 x)
 932 {
 933     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
 934     const __m128 one_ps    = _mm_set1_ps(1.0f);
 935     const __m128 half_ps   = _mm_set1_ps(0.5f);
 936     const __m128 pi_ps     = _mm_set1_ps(M_PI);
 937     const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
 938
 939     __m128 mask1;
 940     __m128 mask2;
 941     __m128 xabs;
 942     __m128 z,z1,z2,z3;
 943
 944     xabs  = _mm_and_ps(x,signmask);
 945     mask1 = _mm_cmp_ps(xabs,half_ps,_CMP_GT_OQ);
 946     mask2 = _mm_cmp_ps(x,_mm_setzero_ps(),_CMP_GT_OQ);
 947
 948     z     = _mm_mul_ps(half_ps,_mm_sub_ps(one_ps,xabs));
 949     z     = _mm_mul_ps(z,gmx_mm_invsqrt_ps(z));
 950     z     = _mm_andnot_ps(_mm_cmp_ps(xabs,one_ps,_CMP_EQ_OQ),z);
 951
 952     z     = _mm_blendv_ps(x,z,mask1);
 953     z     = gmx_mm_asin_ps(z);
 954
 955     z2    = _mm_add_ps(z,z);
 956     z1    = _mm_sub_ps(pi_ps,z2);
 957     z3    = _mm_sub_ps(halfpi_ps,z);
 958
 959     z     = _mm_blendv_ps(z1,z2,mask2);
 960     z     = _mm_blendv_ps(z3,z,mask1);
 961
 962     return z;
 963 }
 964
 965
 966 static __m128
 967 gmx_mm_atan_ps(__m128 x)
 968 {
 969     /* Same algorithm as cephes library */
 970     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
 971     const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
 972     const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
 973     const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
 974     const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
 975     const __m128 mone      = _mm_set1_ps(-1.0f);
 976     const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
 977     const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
 978     const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1);
 979     const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);
 980
 981     __m128 sign;
 982     __m128 mask1,mask2;
 983     __m128 y,z1,z2;
 984     __m128 x2,x4;
 985     __m128 sum1,sum2;
 986
 987     sign  = _mm_andnot_ps(signmask,x);
 988     x     = _mm_and_ps(x,signmask);
 989
 990     mask1 = _mm_cmp_ps(x,limit1,_CMP_GT_OQ);
 991     mask2 = _mm_cmp_ps(x,limit2,_CMP_GT_OQ);
 992
 993     z1    = _mm_mul_ps(_mm_add_ps(x,mone),gmx_mm_inv_ps(_mm_sub_ps(x,mone)));
 994     z2    = _mm_mul_ps(mone,gmx_mm_inv_ps(x));
 995
 996     y     = _mm_and_ps(mask1,quarterpi);
 997     y     = _mm_blendv_ps(y,halfpi,mask2);
 998
 999     x     = _mm_blendv_ps(x,z1,mask1);
1000     x     = _mm_blendv_ps(x,z2,mask2);
1001
1002     x2    = _mm_mul_ps(x,x);
1003     x4    = _mm_mul_ps(x2,x2);
1004
1005     sum1  = _mm_macc_ps(CC9,x4,CC5);
1006     sum2  = _mm_macc_ps(CC7,x4,CC3);
1007     sum1  = _mm_mul_ps(sum1,x4);
1008     sum1  = _mm_macc_ps(sum2,x2,sum1);
1009
1010     sum1  = _mm_sub_ps(sum1,mone);
1011     y     = _mm_macc_ps(sum1,x,y);
1012
1013     y     = _mm_xor_ps(y,sign);
1014
1015     return y;
1016 }
1017
1018
1019 static __m128
1020 gmx_mm_atan2_ps(__m128 y, __m128 x)
1021 {
1022     const __m128 pi          = _mm_set1_ps(M_PI);
1023     const __m128 minuspi     = _mm_set1_ps(-M_PI);
1024     const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
1025     const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
1026
1027     __m128 z,z1,z3,z4;
1028     __m128 w;
1029     __m128 maskx_lt,maskx_eq;
1030     __m128 masky_lt,masky_eq;
1031     __m128 mask1,mask2,mask3,mask4,maskall;
1032
1033     maskx_lt  = _mm_cmp_ps(x,_mm_setzero_ps(),_CMP_LT_OQ);
1034     masky_lt  = _mm_cmp_ps(y,_mm_setzero_ps(),_CMP_LT_OQ);
1035     maskx_eq  = _mm_cmp_ps(x,_mm_setzero_ps(),_CMP_EQ_OQ);
1036     masky_eq  = _mm_cmp_ps(y,_mm_setzero_ps(),_CMP_EQ_OQ);
1037
1038     z         = _mm_mul_ps(y,gmx_mm_inv_ps(x));
1039     z         = gmx_mm_atan_ps(z);
1040
1041     mask1     = _mm_and_ps(maskx_eq,masky_lt);
1042     mask2     = _mm_andnot_ps(maskx_lt,masky_eq);
1043     mask3     = _mm_andnot_ps( _mm_or_ps(masky_lt,masky_eq) , maskx_eq);
1044     mask4     = _mm_and_ps(masky_eq,maskx_lt);
1045
1046     maskall   = _mm_or_ps( _mm_or_ps(mask1,mask2), _mm_or_ps(mask3,mask4) );
1047
1048     z         = _mm_andnot_ps(maskall,z);
1049     z1        = _mm_and_ps(mask1,minushalfpi);
1050     z3        = _mm_and_ps(mask3,halfpi);
1051     z4        = _mm_and_ps(mask4,pi);
1052
1053     z         = _mm_or_ps( _mm_or_ps(z,z1), _mm_or_ps(z3,z4) );
1054
1055     mask1     = _mm_andnot_ps(masky_lt,maskx_lt);
1056     mask2     = _mm_and_ps(maskx_lt,masky_lt);
1057
1058     w         = _mm_or_ps( _mm_and_ps(mask1,pi), _mm_and_ps(mask2,minuspi) );
1059     w         = _mm_andnot_ps(maskall,w);
1060
1061     z         = _mm_add_ps(z,w);
1062
1063     return z;
1064 }
1065
1066
1067
1068 #endif /* _gmx_math_x86_avx_128_fma_single_h_ */