src/gromacs/simd/math_x86_avx_128_fma_single.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef GMX_SIMD_MATH_AVX_128_FMA_SINGLE_H
  36 #define GMX_SIMD_MATH_AVX_128_FMA_SINGLE_H
  37
  38 #include <immintrin.h> /* AVX */
  39 #ifdef HAVE_X86INTRIN_H
  40 #include <x86intrin.h> /* FMA */
  41 #endif
  42 #ifdef HAVE_INTRIN_H
  43 #include <intrin.h> /* FMA MSVC */
  44 #endif
  45
  46 #include <math.h>
  47
  48 #include "general_x86_avx_128_fma.h"
  49
  50
  51 #ifndef M_PI
  52 #  define M_PI 3.14159265358979323846264338327950288
  53 #endif
  54
  55
  56
  57
  58 /************************
  59  *                      *
  60  * Simple math routines *
  61  *                      *
  62  ************************/
  63
  64 /* 1.0/sqrt(x) */
  65 static gmx_inline __m128
  66 gmx_mm_invsqrt_ps(__m128 x)
  67 {
  68     const __m128 half  = _mm_set1_ps(0.5);
  69     const __m128 one   = _mm_set1_ps(1.0);
  70
  71     __m128       lu = _mm_rsqrt_ps(x);
  72
  73     return _mm_macc_ps(_mm_nmacc_ps(x, _mm_mul_ps(lu, lu), one), _mm_mul_ps(lu, half), lu);
  74 }
  75
  76 /* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
  77 static gmx_inline __m128
  78 gmx_mm_sqrt_ps(__m128 x)
  79 {
  80     __m128 mask;
  81     __m128 res;
  82
  83     mask = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_EQ_OQ);
  84     res  = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));
  85
  86     res  = _mm_mul_ps(x, res);
  87
  88     return res;
  89 }
  90
  91 /* 1.0/x */
  92 static gmx_inline __m128
  93 gmx_mm_inv_ps(__m128 x)
  94 {
  95     const __m128 two = _mm_set1_ps(2.0);
  96
  97     __m128       lu = _mm_rcp_ps(x);
  98
  99     return _mm_mul_ps(lu, _mm_nmacc_ps(lu, x, two));
 100 }
 101
 102 static gmx_inline __m128
 103 gmx_mm_abs_ps(__m128 x)
 104 {
 105     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
 106
 107     return _mm_and_ps(x, signmask);
 108 }
 109
 110 static __m128
 111 gmx_mm_log_ps(__m128 x)
 112 {
 113     /* Same algorithm as cephes library */
 114     const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
 115     const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
 116     const __m128  half       = _mm_set1_ps(0.5f);
 117     const __m128  one        = _mm_set1_ps(1.0f);
 118     const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
 119     const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
 120     const __m128  corr2      = _mm_set1_ps(0.693359375f);
 121
 122     const __m128  CA_1        = _mm_set1_ps(0.070376836292f);
 123     const __m128  CB_0        = _mm_set1_ps(1.6714950086782716f);
 124     const __m128  CB_1        = _mm_set1_ps(-2.452088066061482f);
 125     const __m128  CC_0        = _mm_set1_ps(1.5220770854701728f);
 126     const __m128  CC_1        = _mm_set1_ps(-1.3422238433233642f);
 127     const __m128  CD_0        = _mm_set1_ps(1.386218787509749f);
 128     const __m128  CD_1        = _mm_set1_ps(0.35075468953796346f);
 129     const __m128  CE_0        = _mm_set1_ps(1.3429983063133937f);
 130     const __m128  CE_1        = _mm_set1_ps(1.807420826584643f);
 131
 132     __m128        fexp, fexp1;
 133     __m128i       iexp;
 134     __m128        mask;
 135     __m128        x1, x2;
 136     __m128        y;
 137     __m128        pA, pB, pC, pD, pE, tB, tC, tD, tE;
 138
 139     /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
 140     fexp  = _mm_and_ps(x, expmask);
 141     iexp  = gmx_mm_castps_si128(fexp);
 142     iexp  = _mm_srli_epi32(iexp, 23);
 143     iexp  = _mm_sub_epi32(iexp, expbase_m1);
 144
 145     x     = _mm_andnot_ps(expmask, x);
 146     x     = _mm_or_ps(x, one);
 147     x     = _mm_mul_ps(x, half);
 148
 149     mask  = _mm_cmp_ps(x, invsq2, _CMP_LT_OQ);
 150
 151     x     = _mm_add_ps(x, _mm_and_ps(mask, x));
 152     x     = _mm_sub_ps(x, one);
 153     iexp  = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
 154
 155     x2    = _mm_mul_ps(x, x);
 156
 157     pA    = _mm_mul_ps(CA_1, x);
 158
 159     pB    = _mm_add_ps(x, CB_1);
 160     pC    = _mm_add_ps(x, CC_1);
 161     pD    = _mm_add_ps(x, CD_1);
 162     pE    = _mm_add_ps(x, CE_1);
 163
 164     pB    = _mm_macc_ps(x, pB, CB_0);
 165     pC    = _mm_macc_ps(x, pC, CC_0);
 166     pD    = _mm_macc_ps(x, pD, CD_0);
 167     pE    = _mm_macc_ps(x, pE, CE_0);
 168
 169     pA    = _mm_mul_ps(pA, pB);
 170     pC    = _mm_mul_ps(pC, pD);
 171     pE    = _mm_mul_ps(pE, x2);
 172     pA    = _mm_mul_ps(pA, pC);
 173     y     = _mm_mul_ps(pA, pE);
 174
 175     fexp  = _mm_cvtepi32_ps(iexp);
 176     y     = _mm_macc_ps(fexp, corr1, y);
 177     y     = _mm_nmacc_ps(half, x2, y);
 178
 179     x2    = _mm_add_ps(x, y);
 180     x2    = _mm_macc_ps(fexp, corr2, x2);
 181
 182     return x2;
 183 }
 184
 185
 186 /*
 187  * 2^x function.
 188  *
 189  * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
 190  * [-0.5,0.5]. The coefficiencts of this was derived in Mathematica using the command:
 191  *
 192  * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 193  *
 194  * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 195  * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 196  * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 197  * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 198  * number instead. That would take a few extra cycles and not really help, since something is
 199  * wrong if you are using single precision to work with numbers that cannot really be represented
 200  * in single precision.
 201  *
 202  * The accuracy is at least 23 bits.
 203  */
 204 static __m128
 205 gmx_mm_exp2_ps(__m128 x)
 206 {
 207     /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
 208     const __m128  arglimit = _mm_set1_ps(126.0f);
 209
 210     const __m128i expbase  = _mm_set1_epi32(127);
 211     const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
 212     const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
 213     const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
 214     const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
 215     const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
 216     const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
 217     const __m128  CA0      = _mm_set1_ps(1.0f);
 218
 219     __m128        valuemask;
 220     __m128i       iexppart;
 221     __m128        fexppart;
 222     __m128        intpart;
 223     __m128        x2;
 224     __m128        p0, p1;
 225
 226     iexppart  = _mm_cvtps_epi32(x);
 227     intpart   = _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
 228     iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
 229     valuemask = _mm_cmp_ps(arglimit, gmx_mm_abs_ps(x), _CMP_GE_OQ);
 230     fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
 231
 232     x         = _mm_sub_ps(x, intpart);
 233     x2        = _mm_mul_ps(x, x);
 234
 235     p0        = _mm_macc_ps(CA6, x2, CA4);
 236     p1        = _mm_macc_ps(CA5, x2, CA3);
 237     p0        = _mm_macc_ps(p0, x2, CA2);
 238     p1        = _mm_macc_ps(p1, x2, CA1);
 239     p0        = _mm_macc_ps(p0, x2, CA0);
 240     p0        = _mm_macc_ps(p1, x, p0);
 241     x         = _mm_mul_ps(p0, fexppart);
 242
 243     return x;
 244 }
 245
 246
 247 /* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
 248  * but there will then be a small rounding error since we lose some precision due to the
 249  * multiplication. This will then be magnified a lot by the exponential.
 250  *
 251  * Instead, we calculate the fractional part directly as a minimax approximation of
 252  * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
 253  * remaining after 2^y, which avoids the precision-loss.
 254  * The final result is correct to within 1 LSB over the entire argument range.
 255  */
 256 static __m128
 257 gmx_mm_exp_ps(__m128 x)
 258 {
 259     const __m128  argscale      = _mm_set1_ps(1.44269504088896341f);
 260     /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
 261     const __m128  arglimit      = _mm_set1_ps(126.0f);
 262     const __m128i expbase       = _mm_set1_epi32(127);
 263
 264     const __m128  invargscale0  = _mm_set1_ps(0.693359375f);
 265     const __m128  invargscale1  = _mm_set1_ps(-2.12194440e-4f);
 266
 267     const __m128  CC5           = _mm_set1_ps(1.9875691500e-4f);
 268     const __m128  CC4           = _mm_set1_ps(1.3981999507e-3f);
 269     const __m128  CC3           = _mm_set1_ps(8.3334519073e-3f);
 270     const __m128  CC2           = _mm_set1_ps(4.1665795894e-2f);
 271     const __m128  CC1           = _mm_set1_ps(1.6666665459e-1f);
 272     const __m128  CC0           = _mm_set1_ps(5.0000001201e-1f);
 273     const __m128  one           = _mm_set1_ps(1.0f);
 274
 275     __m128        y, x2;
 276     __m128        p0, p1;
 277     __m128        valuemask;
 278     __m128i       iexppart;
 279     __m128        fexppart;
 280     __m128        intpart;
 281
 282     y = _mm_mul_ps(x, argscale);
 283
 284     iexppart  = _mm_cvtps_epi32(y);
 285     intpart   = _mm_round_ps(y, _MM_FROUND_TO_NEAREST_INT);
 286
 287     iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
 288     valuemask = _mm_cmp_ps(arglimit, gmx_mm_abs_ps(y), _CMP_GE_OQ);
 289     fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
 290
 291     /* Extended precision arithmetics */
 292     x         = _mm_nmacc_ps(invargscale0, intpart, x);
 293     x         = _mm_nmacc_ps(invargscale1, intpart, x);
 294
 295     x2        = _mm_mul_ps(x, x);
 296
 297     p1        = _mm_macc_ps(CC5, x2, CC3);
 298     p0        = _mm_macc_ps(CC4, x2, CC2);
 299     p1        = _mm_macc_ps(p1, x2, CC1);
 300     p0        = _mm_macc_ps(p0, x2, CC0);
 301     p0        = _mm_macc_ps(p1, x, p0);
 302     p0        = _mm_macc_ps(p0, x2, one);
 303
 304     x         = _mm_add_ps(x, p0);
 305
 306     x         = _mm_mul_ps(x, fexppart);
 307
 308     return x;
 309 }
 310
 311 /* FULL precision. Only errors in LSB */
 312 static __m128
 313 gmx_mm_erf_ps(__m128 x)
 314 {
 315     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 316     const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
 317     const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
 318     const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
 319     const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
 320     const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
 321     const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
 322     const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
 323     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 324     const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
 325     const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
 326     const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
 327     const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
 328     const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
 329     const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
 330     const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
 331     const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
 332     const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
 333     const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
 334     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 335     const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
 336     const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
 337     const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
 338     const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
 339     const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
 340     const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
 341     const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
 342     const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
 343     const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
 344     const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
 345     const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
 346
 347     /* Coefficients for expansion of exp(x) in [0,0.1] */
 348     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
 349     const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
 350     const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
 351     const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
 352
 353     const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
 354     const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 355     const __m128  one      = _mm_set1_ps(1.0f);
 356     const __m128  two      = _mm_set1_ps(2.0f);
 357
 358     __m128        x2, x4, y;
 359     __m128        z, q, t, t2, w, w2;
 360     __m128        pA0, pA1, pB0, pB1, pC0, pC1;
 361     __m128        expmx2, corr;
 362     __m128        res_erf, res_erfc, res;
 363     __m128        mask;
 364
 365     /* Calculate erf() */
 366     x2     = _mm_mul_ps(x, x);
 367     x4     = _mm_mul_ps(x2, x2);
 368
 369     pA0  = _mm_macc_ps(CA6, x4, CA4);
 370     pA1  = _mm_macc_ps(CA5, x4, CA3);
 371     pA0  = _mm_macc_ps(pA0, x4, CA2);
 372     pA1  = _mm_macc_ps(pA1, x4, CA1);
 373     pA0  = _mm_mul_ps(pA0, x4);
 374     pA0  = _mm_macc_ps(pA1, x2, pA0);
 375     /* Constant term must come last for precision reasons */
 376     pA0  = _mm_add_ps(pA0, CA0);
 377
 378     res_erf = _mm_mul_ps(x, pA0);
 379
 380     /* Calculate erfc */
 381
 382     y       = gmx_mm_abs_ps(x);
 383     t       = gmx_mm_inv_ps(y);
 384     w       = _mm_sub_ps(t, one);
 385     t2      = _mm_mul_ps(t, t);
 386     w2      = _mm_mul_ps(w, w);
 387     /*
 388      * We cannot simply calculate exp(-x2) directly in single precision, since
 389      * that will lose a couple of bits of precision due to the multiplication.
 390      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
 391      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
 392      *
 393      * The only drawback with this is that it requires TWO separate exponential
 394      * evaluations, which would be horrible performance-wise. However, the argument
 395      * for the second exp() call is always small, so there we simply use a
 396      * low-order minimax expansion on [0,0.1].
 397      */
 398
 399     z       = _mm_and_ps(y, sieve);
 400     q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
 401
 402     corr    = _mm_macc_ps(CD4, q, CD3);
 403     corr    = _mm_macc_ps(corr, q, CD2);
 404     corr    = _mm_macc_ps(corr, q, one);
 405     corr    = _mm_macc_ps(corr, q, one);
 406
 407     expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
 408     expmx2  = _mm_mul_ps(expmx2, corr);
 409
 410     pB1  = _mm_macc_ps(CB9, w2, CB7);
 411     pB0  = _mm_macc_ps(CB8, w2, CB6);
 412     pB1  = _mm_macc_ps(pB1, w2, CB5);
 413     pB0  = _mm_macc_ps(pB0, w2, CB4);
 414     pB1  = _mm_macc_ps(pB1, w2, CB3);
 415     pB0  = _mm_macc_ps(pB0, w2, CB2);
 416     pB1  = _mm_macc_ps(pB1, w2, CB1);
 417     pB0  = _mm_macc_ps(pB0, w2, CB0);
 418     pB0  = _mm_macc_ps(pB1, w, pB0);
 419
 420     pC0  = _mm_macc_ps(CC10, t2, CC8);
 421     pC1  = _mm_macc_ps(CC9, t2, CC7);
 422     pC0  = _mm_macc_ps(pC0, t2, CC6);
 423     pC1  = _mm_macc_ps(pC1, t2, CC5);
 424     pC0  = _mm_macc_ps(pC0, t2, CC4);
 425     pC1  = _mm_macc_ps(pC1, t2, CC3);
 426     pC0  = _mm_macc_ps(pC0, t2, CC2);
 427     pC1  = _mm_macc_ps(pC1, t2, CC1);
 428
 429     pC0  = _mm_macc_ps(pC0, t2, CC0);
 430     pC0  = _mm_macc_ps(pC1, t, pC0);
 431     pC0  = _mm_mul_ps(pC0, t);
 432
 433     /* SELECT pB0 or pC0 for erfc() */
 434     mask     = _mm_cmp_ps(two, y, _CMP_LT_OQ);
 435     res_erfc = _mm_blendv_ps(pB0, pC0, mask);
 436     res_erfc = _mm_mul_ps(res_erfc, expmx2);
 437
 438     /* erfc(x<0) = 2-erfc(|x|) */
 439     mask     = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_LT_OQ);
 440     res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
 441
 442     /* Select erf() or erfc() */
 443     mask = _mm_cmp_ps(y, _mm_set1_ps(0.75f), _CMP_LT_OQ);
 444     res  = _mm_blendv_ps(_mm_sub_ps(one, res_erfc), res_erf, mask);
 445
 446     return res;
 447 }
 448
 449
 450 /* FULL precision. Only errors in LSB */
 451 static __m128
 452 gmx_mm_erfc_ps(__m128 x)
 453 {
 454     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 455     const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
 456     const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
 457     const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
 458     const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
 459     const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
 460     const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
 461     const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
 462     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 463     const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
 464     const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
 465     const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
 466     const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
 467     const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
 468     const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
 469     const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
 470     const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
 471     const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
 472     const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
 473     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 474     const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
 475     const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
 476     const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
 477     const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
 478     const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
 479     const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
 480     const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
 481     const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
 482     const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
 483     const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
 484     const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
 485
 486     /* Coefficients for expansion of exp(x) in [0,0.1] */
 487     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
 488     const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
 489     const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
 490     const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
 491
 492     const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
 493     const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 494     const __m128  one      = _mm_set1_ps(1.0f);
 495     const __m128  two      = _mm_set1_ps(2.0f);
 496
 497     __m128        x2, x4, y;
 498     __m128        z, q, t, t2, w, w2;
 499     __m128        pA0, pA1, pB0, pB1, pC0, pC1;
 500     __m128        expmx2, corr;
 501     __m128        res_erf, res_erfc, res;
 502     __m128        mask;
 503
 504     /* Calculate erf() */
 505     x2     = _mm_mul_ps(x, x);
 506     x4     = _mm_mul_ps(x2, x2);
 507
 508     pA0  = _mm_macc_ps(CA6, x4, CA4);
 509     pA1  = _mm_macc_ps(CA5, x4, CA3);
 510     pA0  = _mm_macc_ps(pA0, x4, CA2);
 511     pA1  = _mm_macc_ps(pA1, x4, CA1);
 512     pA1  = _mm_mul_ps(pA1, x2);
 513     pA0  = _mm_macc_ps(pA0, x4, pA1);
 514     /* Constant term must come last for precision reasons */
 515     pA0  = _mm_add_ps(pA0, CA0);
 516
 517     res_erf = _mm_mul_ps(x, pA0);
 518
 519     /* Calculate erfc */
 520     y       = gmx_mm_abs_ps(x);
 521     t       = gmx_mm_inv_ps(y);
 522     w       = _mm_sub_ps(t, one);
 523     t2      = _mm_mul_ps(t, t);
 524     w2      = _mm_mul_ps(w, w);
 525     /*
 526      * We cannot simply calculate exp(-x2) directly in single precision, since
 527      * that will lose a couple of bits of precision due to the multiplication.
 528      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
 529      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
 530      *
 531      * The only drawback with this is that it requires TWO separate exponential
 532      * evaluations, which would be horrible performance-wise. However, the argument
 533      * for the second exp() call is always small, so there we simply use a
 534      * low-order minimax expansion on [0,0.1].
 535      */
 536
 537     z       = _mm_and_ps(y, sieve);
 538     q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
 539
 540     corr    = _mm_macc_ps(CD4, q, CD3);
 541     corr    = _mm_macc_ps(corr, q, CD2);
 542     corr    = _mm_macc_ps(corr, q, one);
 543     corr    = _mm_macc_ps(corr, q, one);
 544
 545     expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
 546     expmx2  = _mm_mul_ps(expmx2, corr);
 547
 548     pB1  = _mm_macc_ps(CB9, w2, CB7);
 549     pB0  = _mm_macc_ps(CB8, w2, CB6);
 550     pB1  = _mm_macc_ps(pB1, w2, CB5);
 551     pB0  = _mm_macc_ps(pB0, w2, CB4);
 552     pB1  = _mm_macc_ps(pB1, w2, CB3);
 553     pB0  = _mm_macc_ps(pB0, w2, CB2);
 554     pB1  = _mm_macc_ps(pB1, w2, CB1);
 555     pB0  = _mm_macc_ps(pB0, w2, CB0);
 556     pB0  = _mm_macc_ps(pB1, w, pB0);
 557
 558     pC0  = _mm_macc_ps(CC10, t2, CC8);
 559     pC1  = _mm_macc_ps(CC9, t2, CC7);
 560     pC0  = _mm_macc_ps(pC0, t2, CC6);
 561     pC1  = _mm_macc_ps(pC1, t2, CC5);
 562     pC0  = _mm_macc_ps(pC0, t2, CC4);
 563     pC1  = _mm_macc_ps(pC1, t2, CC3);
 564     pC0  = _mm_macc_ps(pC0, t2, CC2);
 565     pC1  = _mm_macc_ps(pC1, t2, CC1);
 566
 567     pC0  = _mm_macc_ps(pC0, t2, CC0);
 568     pC0  = _mm_macc_ps(pC1, t, pC0);
 569     pC0  = _mm_mul_ps(pC0, t);
 570
 571     /* SELECT pB0 or pC0 for erfc() */
 572     mask     = _mm_cmp_ps(two, y, _CMP_LT_OQ);
 573     res_erfc = _mm_blendv_ps(pB0, pC0, mask);
 574     res_erfc = _mm_mul_ps(res_erfc, expmx2);
 575
 576     /* erfc(x<0) = 2-erfc(|x|) */
 577     mask     = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_LT_OQ);
 578     res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
 579
 580     /* Select erf() or erfc() */
 581     mask = _mm_cmp_ps(y, _mm_set1_ps(0.75f), _CMP_LT_OQ);
 582     res  = _mm_blendv_ps(res_erfc, _mm_sub_ps(one, res_erf), mask);
 583
 584     return res;
 585 }
 586
 587
 588 /* Calculate the force correction due to PME analytically.
 589  *
 590  * This routine is meant to enable analytical evaluation of the
 591  * direct-space PME electrostatic force to avoid tables.
 592  *
 593  * The direct-space potential should be Erfc(beta*r)/r, but there
 594  * are some problems evaluating that:
 595  *
 596  * First, the error function is difficult (read: expensive) to
 597  * approxmiate accurately for intermediate to large arguments, and
 598  * this happens already in ranges of beta*r that occur in simulations.
 599  * Second, we now try to avoid calculating potentials in Gromacs but
 600  * use forces directly.
 601  *
 602  * We can simply things slight by noting that the PME part is really
 603  * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
 604  *
 605  * V= 1/r - Erf(beta*r)/r
 606  *
 607  * The first term we already have from the inverse square root, so
 608  * that we can leave out of this routine.
 609  *
 610  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
 611  * the argument beta*r will be in the range 0.15 to ~4. Use your
 612  * favorite plotting program to realize how well-behaved Erf(z)/z is
 613  * in this range!
 614  *
 615  * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
 616  * However, it turns out it is more efficient to approximate f(z)/z and
 617  * then only use even powers. This is another minor optimization, since
 618  * we actually WANT f(z)/z, because it is going to be multiplied by
 619  * the vector between the two atoms to get the vectorial force. The
 620  * fastest flops are the ones we can avoid calculating!
 621  *
 622  * So, here's how it should be used:
 623  *
 624  * 1. Calculate r^2.
 625  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 626  * 3. Evaluate this routine with z^2 as the argument.
 627  * 4. The return value is the expression:
 628  *
 629  *
 630  *       2*exp(-z^2)     erf(z)
 631  *       ------------ - --------
 632  *       sqrt(Pi)*z^2      z^3
 633  *
 634  * 5. Multiply the entire expression by beta^3. This will get you
 635  *
 636  *       beta^3*2*exp(-z^2)     beta^3*erf(z)
 637  *       ------------------  - ---------------
 638  *          sqrt(Pi)*z^2            z^3
 639  *
 640  *    or, switching back to r (z=r*beta):
 641  *
 642  *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
 643  *       ----------------------- - -----------
 644  *            sqrt(Pi)*r^2            r^3
 645  *
 646  *
 647  *    With a bit of math exercise you should be able to confirm that
 648  *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
 649  *
 650  * 6. Add the result to 1/r^3, multiply by the product of the charges,
 651  *    and you have your force (divided by r). A final multiplication
 652  *    with the vector connecting the two particles and you have your
 653  *    vectorial force to add to the particles.
 654  *
 655  */
 656 static __m128
 657 gmx_mm_pmecorrF_ps(__m128 z2)
 658 {
 659     const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
 660     const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
 661     const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
 662     const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
 663     const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
 664     const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
 665     const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);
 666
 667     const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
 668     const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
 669     const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
 670     const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
 671     const __m128  FD0      = _mm_set1_ps(1.0f);
 672
 673     __m128        z4;
 674     __m128        polyFN0, polyFN1, polyFD0, polyFD1;
 675
 676     z4             = _mm_mul_ps(z2, z2);
 677
 678     polyFD0        = _mm_macc_ps(FD4, z4, FD2);
 679     polyFD1        = _mm_macc_ps(FD3, z4, FD1);
 680     polyFD0        = _mm_macc_ps(polyFD0, z4, FD0);
 681     polyFD0        = _mm_macc_ps(polyFD1, z2, polyFD0);
 682
 683     polyFD0        = gmx_mm_inv_ps(polyFD0);
 684
 685     polyFN0        = _mm_macc_ps(FN6, z4, FN4);
 686     polyFN1        = _mm_macc_ps(FN5, z4, FN3);
 687     polyFN0        = _mm_macc_ps(polyFN0, z4, FN2);
 688     polyFN1        = _mm_macc_ps(polyFN1, z4, FN1);
 689     polyFN0        = _mm_macc_ps(polyFN0, z4, FN0);
 690     polyFN0        = _mm_macc_ps(polyFN1, z2, polyFN0);
 691
 692     return _mm_mul_ps(polyFN0, polyFD0);
 693 }
 694
 695
 696
 697
 698 /* Calculate the potential correction due to PME analytically.
 699  *
 700  * See gmx_mm256_pmecorrF_ps() for details about the approximation.
 701  *
 702  * This routine calculates Erf(z)/z, although you should provide z^2
 703  * as the input argument.
 704  *
 705  * Here's how it should be used:
 706  *
 707  * 1. Calculate r^2.
 708  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 709  * 3. Evaluate this routine with z^2 as the argument.
 710  * 4. The return value is the expression:
 711  *
 712  *
 713  *        erf(z)
 714  *       --------
 715  *          z
 716  *
 717  * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
 718  *
 719  *       erf(r*beta)
 720  *       -----------
 721  *           r
 722  *
 723  * 6. Add the result to 1/r, multiply by the product of the charges,
 724  *    and you have your potential.
 725  */
 726 static __m128
 727 gmx_mm_pmecorrV_ps(__m128 z2)
 728 {
 729     const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
 730     const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
 731     const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
 732     const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
 733     const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
 734     const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
 735     const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);
 736
 737     const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
 738     const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
 739     const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
 740     const __m128  VD0      = _mm_set1_ps(1.0f);
 741
 742     __m128        z4;
 743     __m128        polyVN0, polyVN1, polyVD0, polyVD1;
 744
 745     z4             = _mm_mul_ps(z2, z2);
 746
 747     polyVD1        = _mm_macc_ps(VD3, z4, VD1);
 748     polyVD0        = _mm_macc_ps(VD2, z4, VD0);
 749     polyVD0        = _mm_macc_ps(polyVD1, z2, polyVD0);
 750
 751     polyVD0        = gmx_mm_inv_ps(polyVD0);
 752
 753     polyVN0        = _mm_macc_ps(VN6, z4, VN4);
 754     polyVN1        = _mm_macc_ps(VN5, z4, VN3);
 755     polyVN0        = _mm_macc_ps(polyVN0, z4, VN2);
 756     polyVN1        = _mm_macc_ps(polyVN1, z4, VN1);
 757     polyVN0        = _mm_macc_ps(polyVN0, z4, VN0);
 758     polyVN0        = _mm_macc_ps(polyVN1, z2, polyVN0);
 759
 760     return _mm_mul_ps(polyVN0, polyVD0);
 761 }
 762
 763
 764
 765 static int
 766 gmx_mm_sincos_ps(__m128  x,
 767                  __m128 *sinval,
 768                  __m128 *cosval)
 769 {
 770     const __m128  two_over_pi = _mm_set1_ps(2.0/M_PI);
 771     const __m128  half        = _mm_set1_ps(0.5);
 772     const __m128  one         = _mm_set1_ps(1.0);
 773
 774     const __m128i izero      = _mm_set1_epi32(0);
 775     const __m128i ione       = _mm_set1_epi32(1);
 776     const __m128i itwo       = _mm_set1_epi32(2);
 777     const __m128i ithree     = _mm_set1_epi32(3);
 778     const __m128  signbit    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 779
 780     const __m128  CA1         = _mm_set1_ps(1.5703125f);
 781     const __m128  CA2         = _mm_set1_ps(4.837512969970703125e-4f);
 782     const __m128  CA3         = _mm_set1_ps(7.54978995489188216e-8f);
 783
 784     const __m128  CC0         = _mm_set1_ps(-0.0013602249f);
 785     const __m128  CC1         = _mm_set1_ps(0.0416566950f);
 786     const __m128  CC2         = _mm_set1_ps(-0.4999990225f);
 787     const __m128  CS0         = _mm_set1_ps(-0.0001950727f);
 788     const __m128  CS1         = _mm_set1_ps(0.0083320758f);
 789     const __m128  CS2         = _mm_set1_ps(-0.1666665247f);
 790
 791     __m128        y, y2;
 792     __m128        z;
 793     __m128i       iz;
 794     __m128i       offset_sin, offset_cos;
 795     __m128        tmp1, tmp2;
 796     __m128        mask_sin, mask_cos;
 797     __m128        tmp_sin, tmp_cos;
 798
 799     y          = _mm_mul_ps(x, two_over_pi);
 800     y          = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));
 801
 802     iz         = _mm_cvttps_epi32(y);
 803     z          = _mm_round_ps(y, _MM_FROUND_TO_ZERO);
 804
 805     offset_sin = _mm_and_si128(iz, ithree);
 806     offset_cos = _mm_add_epi32(iz, ione);
 807
 808     /* Extended precision arithmethic to achieve full precision */
 809     y               = _mm_nmacc_ps(z, CA1, x);
 810     y               = _mm_nmacc_ps(z, CA2, y);
 811     y               = _mm_nmacc_ps(z, CA3, y);
 812
 813     y2              = _mm_mul_ps(y, y);
 814
 815     tmp1            = _mm_macc_ps(CC0, y2, CC1);
 816     tmp2            = _mm_macc_ps(CS0, y2, CS1);
 817     tmp1            = _mm_macc_ps(tmp1, y2, CC2);
 818     tmp2            = _mm_macc_ps(tmp2, y2, CS2);
 819
 820     tmp1            = _mm_macc_ps(tmp1, y2, one);
 821
 822     tmp2            = _mm_macc_ps(tmp2, _mm_mul_ps(y, y2), y);
 823
 824     mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
 825     mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));
 826
 827     tmp_sin         = _mm_blendv_ps(tmp1, tmp2, mask_sin);
 828     tmp_cos         = _mm_blendv_ps(tmp1, tmp2, mask_cos);
 829
 830     mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
 831     mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));
 832
 833     tmp1            = _mm_xor_ps(signbit, tmp_sin);
 834     tmp2            = _mm_xor_ps(signbit, tmp_cos);
 835
 836     *sinval         = _mm_blendv_ps(tmp1, tmp_sin, mask_sin);
 837     *cosval         = _mm_blendv_ps(tmp2, tmp_cos, mask_cos);
 838
 839     return 0;
 840 }
 841
 842 /*
 843  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 844  * will then call the sincos() routine and waste a factor 2 in performance!
 845  */
 846 static __m128
 847 gmx_mm_sin_ps(__m128 x)
 848 {
 849     __m128 s, c;
 850     gmx_mm_sincos_ps(x, &s, &c);
 851     return s;
 852 }
 853
 854 /*
 855  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 856  * will then call the sincos() routine and waste a factor 2 in performance!
 857  */
 858 static __m128
 859 gmx_mm_cos_ps(__m128 x)
 860 {
 861     __m128 s, c;
 862     gmx_mm_sincos_ps(x, &s, &c);
 863     return c;
 864 }
 865
 866
 867 static __m128
 868 gmx_mm_tan_ps(__m128 x)
 869 {
 870     __m128 sinval, cosval;
 871     __m128 tanval;
 872
 873     gmx_mm_sincos_ps(x, &sinval, &cosval);
 874
 875     tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval));
 876
 877     return tanval;
 878 }
 879
 880
 881 static __m128
 882 gmx_mm_asin_ps(__m128 x)
 883 {
 884     /* Same algorithm as cephes library */
 885     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
 886     const __m128 limitlow  = _mm_set1_ps(1e-4f);
 887     const __m128 half      = _mm_set1_ps(0.5f);
 888     const __m128 one       = _mm_set1_ps(1.0f);
 889     const __m128 halfpi    = _mm_set1_ps(M_PI/2.0f);
 890
 891     const __m128 CC5        = _mm_set1_ps(4.2163199048E-2f);
 892     const __m128 CC4        = _mm_set1_ps(2.4181311049E-2f);
 893     const __m128 CC3        = _mm_set1_ps(4.5470025998E-2f);
 894     const __m128 CC2        = _mm_set1_ps(7.4953002686E-2f);
 895     const __m128 CC1        = _mm_set1_ps(1.6666752422E-1f);
 896
 897     __m128       sign;
 898     __m128       mask;
 899     __m128       xabs;
 900     __m128       z, z1, z2, q, q1, q2;
 901     __m128       pA, pB;
 902
 903     sign  = _mm_andnot_ps(signmask, x);
 904     xabs  = _mm_and_ps(x, signmask);
 905
 906     mask  = _mm_cmp_ps(xabs, half, _CMP_GT_OQ);
 907
 908     z1    = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
 909     q1    = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
 910     q1    = _mm_andnot_ps(_mm_cmp_ps(xabs, one, _CMP_EQ_OQ), q1);
 911
 912     q2    = xabs;
 913     z2    = _mm_mul_ps(q2, q2);
 914
 915     z     = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
 916     q     = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );
 917
 918     z2    = _mm_mul_ps(z, z);
 919
 920     pA    = _mm_macc_ps(CC5, z2, CC3);
 921     pB    = _mm_macc_ps(CC4, z2, CC2);
 922
 923     pA    = _mm_macc_ps(pA, z2, CC1);
 924     pA    = _mm_mul_ps(pA, z);
 925
 926     z     = _mm_macc_ps(pB, z2, pA);
 927
 928     z     = _mm_macc_ps(z, q, q);
 929
 930     q2    = _mm_sub_ps(halfpi, z);
 931     q2    = _mm_sub_ps(q2, z);
 932
 933     z     = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );
 934
 935     mask  = _mm_cmp_ps(xabs, limitlow, _CMP_GT_OQ);
 936     z     = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );
 937
 938     z = _mm_xor_ps(z, sign);
 939
 940     return z;
 941 }
 942
 943
 944 static __m128
 945 gmx_mm_acos_ps(__m128 x)
 946 {
 947     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
 948     const __m128 one_ps    = _mm_set1_ps(1.0f);
 949     const __m128 half_ps   = _mm_set1_ps(0.5f);
 950     const __m128 pi_ps     = _mm_set1_ps(M_PI);
 951     const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
 952
 953     __m128       mask1;
 954     __m128       mask2;
 955     __m128       xabs;
 956     __m128       z, z1, z2, z3;
 957
 958     xabs  = _mm_and_ps(x, signmask);
 959     mask1 = _mm_cmp_ps(xabs, half_ps, _CMP_GT_OQ);
 960     mask2 = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_GT_OQ);
 961
 962     z     = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
 963     z     = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
 964     z     = _mm_andnot_ps(_mm_cmp_ps(xabs, one_ps, _CMP_EQ_OQ), z);
 965
 966     z     = _mm_blendv_ps(x, z, mask1);
 967     z     = gmx_mm_asin_ps(z);
 968
 969     z2    = _mm_add_ps(z, z);
 970     z1    = _mm_sub_ps(pi_ps, z2);
 971     z3    = _mm_sub_ps(halfpi_ps, z);
 972
 973     z     = _mm_blendv_ps(z1, z2, mask2);
 974     z     = _mm_blendv_ps(z3, z, mask1);
 975
 976     return z;
 977 }
 978
 979
 980 static __m128
 981 gmx_mm_atan_ps(__m128 x)
 982 {
 983     /* Same algorithm as cephes library */
 984     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
 985     const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
 986     const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
 987     const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
 988     const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
 989     const __m128 mone      = _mm_set1_ps(-1.0f);
 990     const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
 991     const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
 992     const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1);
 993     const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);
 994
 995     __m128       sign;
 996     __m128       mask1, mask2;
 997     __m128       y, z1, z2;
 998     __m128       x2, x4;
 999     __m128       sum1, sum2;
1000
1001     sign  = _mm_andnot_ps(signmask, x);
1002     x     = _mm_and_ps(x, signmask);
1003
1004     mask1 = _mm_cmp_ps(x, limit1, _CMP_GT_OQ);
1005     mask2 = _mm_cmp_ps(x, limit2, _CMP_GT_OQ);
1006
1007     z1    = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
1008     z2    = _mm_mul_ps(mone, gmx_mm_inv_ps(x));
1009
1010     y     = _mm_and_ps(mask1, quarterpi);
1011     y     = _mm_blendv_ps(y, halfpi, mask2);
1012
1013     x     = _mm_blendv_ps(x, z1, mask1);
1014     x     = _mm_blendv_ps(x, z2, mask2);
1015
1016     x2    = _mm_mul_ps(x, x);
1017     x4    = _mm_mul_ps(x2, x2);
1018
1019     sum1  = _mm_macc_ps(CC9, x4, CC5);
1020     sum2  = _mm_macc_ps(CC7, x4, CC3);
1021     sum1  = _mm_mul_ps(sum1, x4);
1022     sum1  = _mm_macc_ps(sum2, x2, sum1);
1023
1024     sum1  = _mm_sub_ps(sum1, mone);
1025     y     = _mm_macc_ps(sum1, x, y);
1026
1027     y     = _mm_xor_ps(y, sign);
1028
1029     return y;
1030 }
1031
1032
1033 static __m128
1034 gmx_mm_atan2_ps(__m128 y, __m128 x)
1035 {
1036     const __m128 pi          = _mm_set1_ps(M_PI);
1037     const __m128 minuspi     = _mm_set1_ps(-M_PI);
1038     const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
1039     const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
1040
1041     __m128       z, z1, z3, z4;
1042     __m128       w;
1043     __m128       maskx_lt, maskx_eq;
1044     __m128       masky_lt, masky_eq;
1045     __m128       mask1, mask2, mask3, mask4, maskall;
1046
1047     maskx_lt  = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_LT_OQ);
1048     masky_lt  = _mm_cmp_ps(y, _mm_setzero_ps(), _CMP_LT_OQ);
1049     maskx_eq  = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_EQ_OQ);
1050     masky_eq  = _mm_cmp_ps(y, _mm_setzero_ps(), _CMP_EQ_OQ);
1051
1052     z         = _mm_mul_ps(y, gmx_mm_inv_ps(x));
1053     z         = gmx_mm_atan_ps(z);
1054
1055     mask1     = _mm_and_ps(maskx_eq, masky_lt);
1056     mask2     = _mm_andnot_ps(maskx_lt, masky_eq);
1057     mask3     = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
1058     mask4     = _mm_and_ps(masky_eq, maskx_lt);
1059
1060     maskall   = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );
1061
1062     z         = _mm_andnot_ps(maskall, z);
1063     z1        = _mm_and_ps(mask1, minushalfpi);
1064     z3        = _mm_and_ps(mask3, halfpi);
1065     z4        = _mm_and_ps(mask4, pi);
1066
1067     z         = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );
1068
1069     mask1     = _mm_andnot_ps(masky_lt, maskx_lt);
1070     mask2     = _mm_and_ps(maskx_lt, masky_lt);
1071
1072     w         = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
1073     w         = _mm_andnot_ps(maskall, w);
1074
1075     z         = _mm_add_ps(z, w);
1076
1077     return z;
1078 }
1079
1080
1081
1082 #endif