src/gromacs/simd/math_x86_sse2_single.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef GMX_SIMD_MATH_SSE2_SINGLE_H
  36 #define GMX_SIMD_MATH_SSE2_SINGLE_H
  37
  38
  39 #include <stdio.h>
  40 #include <math.h>
  41
  42 #include "general_x86_sse2.h"
  43
  44
  45 #ifndef M_PI
  46 #  define M_PI 3.14159265358979323846264338327950288
  47 #endif
  48
  49
  50
  51 /************************
  52  *                      *
  53  * Simple math routines *
  54  *                      *
  55  ************************/
  56
  57 /* 1.0/sqrt(x) */
  58 static gmx_inline __m128
  59 gmx_mm_invsqrt_ps(__m128 x)
  60 {
  61     const __m128 half  = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
  62     const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);
  63
  64     __m128       lu = _mm_rsqrt_ps(x);
  65
  66     return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
  67 }
  68
  69 /* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
  70 static gmx_inline __m128
  71 gmx_mm_sqrt_ps(__m128 x)
  72 {
  73     __m128 mask;
  74     __m128 res;
  75
  76     mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
  77     res  = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));
  78
  79     res  = _mm_mul_ps(x, res);
  80
  81     return res;
  82 }
  83
  84 /* 1.0/x */
  85 static gmx_inline __m128
  86 gmx_mm_inv_ps(__m128 x)
  87 {
  88     const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
  89
  90     __m128       lu = _mm_rcp_ps(x);
  91
  92     return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
  93 }
  94
  95 static gmx_inline __m128
  96 gmx_mm_abs_ps(__m128 x)
  97 {
  98     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
  99
 100     return _mm_and_ps(x, signmask);
 101 }
 102
 103
 104 static __m128
 105 gmx_mm_log_ps(__m128 x)
 106 {
 107     /* Same algorithm as cephes library */
 108     const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
 109     const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
 110     const __m128  half       = _mm_set1_ps(0.5f);
 111     const __m128  one        = _mm_set1_ps(1.0f);
 112     const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
 113     const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
 114     const __m128  corr2      = _mm_set1_ps(0.693359375f);
 115
 116     const __m128  CA_1        = _mm_set1_ps(0.070376836292f);
 117     const __m128  CB_0        = _mm_set1_ps(1.6714950086782716f);
 118     const __m128  CB_1        = _mm_set1_ps(-2.452088066061482f);
 119     const __m128  CC_0        = _mm_set1_ps(1.5220770854701728f);
 120     const __m128  CC_1        = _mm_set1_ps(-1.3422238433233642f);
 121     const __m128  CD_0        = _mm_set1_ps(1.386218787509749f);
 122     const __m128  CD_1        = _mm_set1_ps(0.35075468953796346f);
 123     const __m128  CE_0        = _mm_set1_ps(1.3429983063133937f);
 124     const __m128  CE_1        = _mm_set1_ps(1.807420826584643f);
 125
 126     __m128        fexp;
 127     __m128i       iexp;
 128     __m128        mask;
 129     __m128        x2;
 130     __m128        y;
 131     __m128        pA, pB, pC, pD, pE, tB, tC, tD, tE;
 132
 133     /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
 134     fexp  = _mm_and_ps(x, expmask);
 135     iexp  = gmx_mm_castps_si128(fexp);
 136     iexp  = _mm_srli_epi32(iexp, 23);
 137     iexp  = _mm_sub_epi32(iexp, expbase_m1);
 138
 139     x     = _mm_andnot_ps(expmask, x);
 140     x     = _mm_or_ps(x, one);
 141     x     = _mm_mul_ps(x, half);
 142
 143     mask  = _mm_cmplt_ps(x, invsq2);
 144
 145     x     = _mm_add_ps(x, _mm_and_ps(mask, x));
 146     x     = _mm_sub_ps(x, one);
 147     iexp  = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
 148
 149     x2    = _mm_mul_ps(x, x);
 150
 151     pA    = _mm_mul_ps(CA_1, x);
 152     pB    = _mm_mul_ps(CB_1, x);
 153     pC    = _mm_mul_ps(CC_1, x);
 154     pD    = _mm_mul_ps(CD_1, x);
 155     pE    = _mm_mul_ps(CE_1, x);
 156     tB    = _mm_add_ps(CB_0, x2);
 157     tC    = _mm_add_ps(CC_0, x2);
 158     tD    = _mm_add_ps(CD_0, x2);
 159     tE    = _mm_add_ps(CE_0, x2);
 160     pB    = _mm_add_ps(pB, tB);
 161     pC    = _mm_add_ps(pC, tC);
 162     pD    = _mm_add_ps(pD, tD);
 163     pE    = _mm_add_ps(pE, tE);
 164
 165     pA    = _mm_mul_ps(pA, pB);
 166     pC    = _mm_mul_ps(pC, pD);
 167     pE    = _mm_mul_ps(pE, x2);
 168     pA    = _mm_mul_ps(pA, pC);
 169     y     = _mm_mul_ps(pA, pE);
 170
 171     fexp  = _mm_cvtepi32_ps(iexp);
 172     y     = _mm_add_ps(y, _mm_mul_ps(fexp, corr1));
 173
 174     y     = _mm_sub_ps(y, _mm_mul_ps(half, x2));
 175     x2    = _mm_add_ps(x, y);
 176
 177     x2    = _mm_add_ps(x2, _mm_mul_ps(fexp, corr2));
 178
 179     return x2;
 180 }
 181
 182
 183 /*
 184  * 2^x function.
 185  *
 186  * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
 187  * [-0.5,0.5]. The coefficiencts of this was derived in Mathematica using the command:
 188  *
 189  * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 190  *
 191  * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 192  * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 193  * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 194  * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 195  * number instead. That would take a few extra cycles and not really help, since something is
 196  * wrong if you are using single precision to work with numbers that cannot really be represented
 197  * in single precision.
 198  *
 199  * The accuracy is at least 23 bits.
 200  */
 201 static __m128
 202 gmx_mm_exp2_ps(__m128 x)
 203 {
 204     /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
 205     const __m128  arglimit = _mm_set1_ps(126.0f);
 206
 207     const __m128i expbase  = _mm_set1_epi32(127);
 208     const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
 209     const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
 210     const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
 211     const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
 212     const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
 213     const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
 214     const __m128  CA0      = _mm_set1_ps(1.0f);
 215
 216
 217     __m128  valuemask;
 218     __m128i iexppart;
 219     __m128  fexppart;
 220     __m128  intpart;
 221     __m128  x2;
 222     __m128  p0, p1;
 223
 224     iexppart  = _mm_cvtps_epi32(x);
 225     intpart   = _mm_cvtepi32_ps(iexppart);
 226     iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
 227     valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(x));
 228     fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
 229
 230     x         = _mm_sub_ps(x, intpart);
 231     x2        = _mm_mul_ps(x, x);
 232
 233     p0        = _mm_mul_ps(CA6, x2);
 234     p1        = _mm_mul_ps(CA5, x2);
 235     p0        = _mm_add_ps(p0, CA4);
 236     p1        = _mm_add_ps(p1, CA3);
 237     p0        = _mm_mul_ps(p0, x2);
 238     p1        = _mm_mul_ps(p1, x2);
 239     p0        = _mm_add_ps(p0, CA2);
 240     p1        = _mm_add_ps(p1, CA1);
 241     p0        = _mm_mul_ps(p0, x2);
 242     p1        = _mm_mul_ps(p1, x);
 243     p0        = _mm_add_ps(p0, CA0);
 244     p0        = _mm_add_ps(p0, p1);
 245     x         = _mm_mul_ps(p0, fexppart);
 246
 247     return x;
 248 }
 249
 250
 251 /* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
 252  * but there will then be a small rounding error since we lose some precision due to the
 253  * multiplication. This will then be magnified a lot by the exponential.
 254  *
 255  * Instead, we calculate the fractional part directly as a minimax approximation of
 256  * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
 257  * remaining after 2^y, which avoids the precision-loss.
 258  * The final result is correct to within 1 LSB over the entire argument range.
 259  */
 260 static __m128
 261 gmx_mm_exp_ps(__m128 x)
 262 {
 263     const __m128  argscale      = _mm_set1_ps(1.44269504088896341f);
 264     /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
 265     const __m128  arglimit      = _mm_set1_ps(126.0f);
 266     const __m128i expbase       = _mm_set1_epi32(127);
 267
 268     const __m128  invargscale0  = _mm_set1_ps(0.693359375f);
 269     const __m128  invargscale1  = _mm_set1_ps(-2.12194440e-4f);
 270
 271     const __m128  CC5           = _mm_set1_ps(1.9875691500e-4f);
 272     const __m128  CC4           = _mm_set1_ps(1.3981999507e-3f);
 273     const __m128  CC3           = _mm_set1_ps(8.3334519073e-3f);
 274     const __m128  CC2           = _mm_set1_ps(4.1665795894e-2f);
 275     const __m128  CC1           = _mm_set1_ps(1.6666665459e-1f);
 276     const __m128  CC0           = _mm_set1_ps(5.0000001201e-1f);
 277     const __m128  one           = _mm_set1_ps(1.0f);
 278
 279     __m128        y, x2;
 280     __m128        p0, p1;
 281     __m128        valuemask;
 282     __m128i       iexppart;
 283     __m128        fexppart;
 284     __m128        intpart;
 285
 286     y = _mm_mul_ps(x, argscale);
 287
 288     iexppart  = _mm_cvtps_epi32(y);
 289     intpart   = _mm_cvtepi32_ps(iexppart);
 290
 291     iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
 292     valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(y));
 293     fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
 294
 295     /* Extended precision arithmetics */
 296     x         = _mm_sub_ps(x, _mm_mul_ps(invargscale0, intpart));
 297     x         = _mm_sub_ps(x, _mm_mul_ps(invargscale1, intpart));
 298
 299     x2        = _mm_mul_ps(x, x);
 300
 301     p1        = _mm_mul_ps(CC5, x2);
 302     p0        = _mm_mul_ps(CC4, x2);
 303     p1        = _mm_add_ps(p1, CC3);
 304     p0        = _mm_add_ps(p0, CC2);
 305     p1        = _mm_mul_ps(p1, x2);
 306     p0        = _mm_mul_ps(p0, x2);
 307     p1        = _mm_add_ps(p1, CC1);
 308     p0        = _mm_add_ps(p0, CC0);
 309     p1        = _mm_mul_ps(p1, x);
 310     p0        = _mm_add_ps(p0, p1);
 311     p0        = _mm_mul_ps(p0, x2);
 312     x         = _mm_add_ps(x, one);
 313     x         = _mm_add_ps(x, p0);
 314
 315     x         = _mm_mul_ps(x, fexppart);
 316
 317     return x;
 318 }
 319
 320 /* FULL precision. Only errors in LSB */
 321 static __m128
 322 gmx_mm_erf_ps(__m128 x)
 323 {
 324     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 325     const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
 326     const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
 327     const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
 328     const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
 329     const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
 330     const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
 331     const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
 332     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 333     const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
 334     const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
 335     const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
 336     const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
 337     const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
 338     const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
 339     const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
 340     const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
 341     const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
 342     const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
 343     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 344     const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
 345     const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
 346     const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
 347     const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
 348     const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
 349     const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
 350     const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
 351     const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
 352     const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
 353     const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
 354     const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
 355
 356     /* Coefficients for expansion of exp(x) in [0,0.1] */
 357     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
 358     const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
 359     const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
 360     const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
 361
 362     const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
 363     const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 364     const __m128  one      = _mm_set1_ps(1.0f);
 365     const __m128  two      = _mm_set1_ps(2.0f);
 366
 367     __m128        x2, x4, y;
 368     __m128        z, q, t, t2, w, w2;
 369     __m128        pA0, pA1, pB0, pB1, pC0, pC1;
 370     __m128        expmx2, corr;
 371     __m128        res_erf, res_erfc, res;
 372     __m128        mask;
 373
 374     /* Calculate erf() */
 375     x2     = _mm_mul_ps(x, x);
 376     x4     = _mm_mul_ps(x2, x2);
 377
 378     pA0  = _mm_mul_ps(CA6, x4);
 379     pA1  = _mm_mul_ps(CA5, x4);
 380     pA0  = _mm_add_ps(pA0, CA4);
 381     pA1  = _mm_add_ps(pA1, CA3);
 382     pA0  = _mm_mul_ps(pA0, x4);
 383     pA1  = _mm_mul_ps(pA1, x4);
 384     pA0  = _mm_add_ps(pA0, CA2);
 385     pA1  = _mm_add_ps(pA1, CA1);
 386     pA0  = _mm_mul_ps(pA0, x4);
 387     pA1  = _mm_mul_ps(pA1, x2);
 388     pA0  = _mm_add_ps(pA0, pA1);
 389     pA0  = _mm_add_ps(pA0, CA0);
 390
 391     res_erf = _mm_mul_ps(x, pA0);
 392
 393     /* Calculate erfc */
 394
 395     y       = gmx_mm_abs_ps(x);
 396     t       = gmx_mm_inv_ps(y);
 397     w       = _mm_sub_ps(t, one);
 398     t2      = _mm_mul_ps(t, t);
 399     w2      = _mm_mul_ps(w, w);
 400     /*
 401      * We cannot simply calculate exp(-x2) directly in single precision, since
 402      * that will lose a couple of bits of precision due to the multiplication.
 403      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
 404      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
 405      *
 406      * The only drawback with this is that it requires TWO separate exponential
 407      * evaluations, which would be horrible performance-wise. However, the argument
 408      * for the second exp() call is always small, so there we simply use a
 409      * low-order minimax expansion on [0,0.1].
 410      */
 411
 412     z       = _mm_and_ps(y, sieve);
 413     q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
 414
 415     corr    = _mm_mul_ps(CD4, q);
 416     corr    = _mm_add_ps(corr, CD3);
 417     corr    = _mm_mul_ps(corr, q);
 418     corr    = _mm_add_ps(corr, CD2);
 419     corr    = _mm_mul_ps(corr, q);
 420     corr    = _mm_add_ps(corr, one);
 421     corr    = _mm_mul_ps(corr, q);
 422     corr    = _mm_add_ps(corr, one);
 423
 424     expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
 425     expmx2  = _mm_mul_ps(expmx2, corr);
 426
 427     pB1  = _mm_mul_ps(CB9, w2);
 428     pB0  = _mm_mul_ps(CB8, w2);
 429     pB1  = _mm_add_ps(pB1, CB7);
 430     pB0  = _mm_add_ps(pB0, CB6);
 431     pB1  = _mm_mul_ps(pB1, w2);
 432     pB0  = _mm_mul_ps(pB0, w2);
 433     pB1  = _mm_add_ps(pB1, CB5);
 434     pB0  = _mm_add_ps(pB0, CB4);
 435     pB1  = _mm_mul_ps(pB1, w2);
 436     pB0  = _mm_mul_ps(pB0, w2);
 437     pB1  = _mm_add_ps(pB1, CB3);
 438     pB0  = _mm_add_ps(pB0, CB2);
 439     pB1  = _mm_mul_ps(pB1, w2);
 440     pB0  = _mm_mul_ps(pB0, w2);
 441     pB1  = _mm_add_ps(pB1, CB1);
 442     pB1  = _mm_mul_ps(pB1, w);
 443     pB0  = _mm_add_ps(pB0, pB1);
 444     pB0  = _mm_add_ps(pB0, CB0);
 445
 446     pC0  = _mm_mul_ps(CC10, t2);
 447     pC1  = _mm_mul_ps(CC9, t2);
 448     pC0  = _mm_add_ps(pC0, CC8);
 449     pC1  = _mm_add_ps(pC1, CC7);
 450     pC0  = _mm_mul_ps(pC0, t2);
 451     pC1  = _mm_mul_ps(pC1, t2);
 452     pC0  = _mm_add_ps(pC0, CC6);
 453     pC1  = _mm_add_ps(pC1, CC5);
 454     pC0  = _mm_mul_ps(pC0, t2);
 455     pC1  = _mm_mul_ps(pC1, t2);
 456     pC0  = _mm_add_ps(pC0, CC4);
 457     pC1  = _mm_add_ps(pC1, CC3);
 458     pC0  = _mm_mul_ps(pC0, t2);
 459     pC1  = _mm_mul_ps(pC1, t2);
 460     pC0  = _mm_add_ps(pC0, CC2);
 461     pC1  = _mm_add_ps(pC1, CC1);
 462     pC0  = _mm_mul_ps(pC0, t2);
 463     pC1  = _mm_mul_ps(pC1, t);
 464     pC0  = _mm_add_ps(pC0, pC1);
 465     pC0  = _mm_add_ps(pC0, CC0);
 466     pC0  = _mm_mul_ps(pC0, t);
 467
 468     /* SELECT pB0 or pC0 for erfc() */
 469     mask     = _mm_cmplt_ps(two, y);
 470     res_erfc = _mm_or_ps(_mm_andnot_ps(mask, pB0), _mm_and_ps(mask, pC0));
 471     res_erfc = _mm_mul_ps(res_erfc, expmx2);
 472
 473     /* erfc(x<0) = 2-erfc(|x|) */
 474     mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
 475     res_erfc = _mm_or_ps(_mm_andnot_ps(mask, res_erfc),
 476                          _mm_and_ps(mask, _mm_sub_ps(two, res_erfc)));
 477
 478     /* Select erf() or erfc() */
 479     mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
 480     res  = _mm_or_ps(_mm_andnot_ps(mask, _mm_sub_ps(one, res_erfc)), _mm_and_ps(mask, res_erf));
 481
 482     return res;
 483 }
 484
 485
 486
 487
 488
 489 /* FULL precision. Only errors in LSB */
 490 static __m128
 491 gmx_mm_erfc_ps(__m128 x)
 492 {
 493     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 494     const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
 495     const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
 496     const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
 497     const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
 498     const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
 499     const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
 500     const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
 501     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 502     const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
 503     const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
 504     const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
 505     const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
 506     const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
 507     const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
 508     const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
 509     const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
 510     const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
 511     const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
 512     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 513     const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
 514     const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
 515     const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
 516     const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
 517     const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
 518     const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
 519     const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
 520     const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
 521     const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
 522     const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
 523     const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
 524
 525     /* Coefficients for expansion of exp(x) in [0,0.1] */
 526     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
 527     const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
 528     const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
 529     const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
 530
 531     const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
 532     const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 533     const __m128  one      = _mm_set1_ps(1.0f);
 534     const __m128  two      = _mm_set1_ps(2.0f);
 535
 536     __m128        x2, x4, y;
 537     __m128        z, q, t, t2, w, w2;
 538     __m128        pA0, pA1, pB0, pB1, pC0, pC1;
 539     __m128        expmx2, corr;
 540     __m128        res_erf, res_erfc, res;
 541     __m128        mask;
 542
 543     /* Calculate erf() */
 544     x2     = _mm_mul_ps(x, x);
 545     x4     = _mm_mul_ps(x2, x2);
 546
 547     pA0  = _mm_mul_ps(CA6, x4);
 548     pA1  = _mm_mul_ps(CA5, x4);
 549     pA0  = _mm_add_ps(pA0, CA4);
 550     pA1  = _mm_add_ps(pA1, CA3);
 551     pA0  = _mm_mul_ps(pA0, x4);
 552     pA1  = _mm_mul_ps(pA1, x4);
 553     pA0  = _mm_add_ps(pA0, CA2);
 554     pA1  = _mm_add_ps(pA1, CA1);
 555     pA0  = _mm_mul_ps(pA0, x4);
 556     pA1  = _mm_mul_ps(pA1, x2);
 557     pA0  = _mm_add_ps(pA0, pA1);
 558     pA0  = _mm_add_ps(pA0, CA0);
 559
 560     res_erf = _mm_mul_ps(x, pA0);
 561
 562     /* Calculate erfc */
 563     y       = gmx_mm_abs_ps(x);
 564     t       = gmx_mm_inv_ps(y);
 565     w       = _mm_sub_ps(t, one);
 566     t2      = _mm_mul_ps(t, t);
 567     w2      = _mm_mul_ps(w, w);
 568     /*
 569      * We cannot simply calculate exp(-x2) directly in single precision, since
 570      * that will lose a couple of bits of precision due to the multiplication.
 571      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
 572      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
 573      *
 574      * The only drawback with this is that it requires TWO separate exponential
 575      * evaluations, which would be horrible performance-wise. However, the argument
 576      * for the second exp() call is always small, so there we simply use a
 577      * low-order minimax expansion on [0,0.1].
 578      */
 579
 580     z       = _mm_and_ps(y, sieve);
 581     q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
 582
 583     corr    = _mm_mul_ps(CD4, q);
 584     corr    = _mm_add_ps(corr, CD3);
 585     corr    = _mm_mul_ps(corr, q);
 586     corr    = _mm_add_ps(corr, CD2);
 587     corr    = _mm_mul_ps(corr, q);
 588     corr    = _mm_add_ps(corr, one);
 589     corr    = _mm_mul_ps(corr, q);
 590     corr    = _mm_add_ps(corr, one);
 591
 592     expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
 593     expmx2  = _mm_mul_ps(expmx2, corr);
 594
 595     pB1  = _mm_mul_ps(CB9, w2);
 596     pB0  = _mm_mul_ps(CB8, w2);
 597     pB1  = _mm_add_ps(pB1, CB7);
 598     pB0  = _mm_add_ps(pB0, CB6);
 599     pB1  = _mm_mul_ps(pB1, w2);
 600     pB0  = _mm_mul_ps(pB0, w2);
 601     pB1  = _mm_add_ps(pB1, CB5);
 602     pB0  = _mm_add_ps(pB0, CB4);
 603     pB1  = _mm_mul_ps(pB1, w2);
 604     pB0  = _mm_mul_ps(pB0, w2);
 605     pB1  = _mm_add_ps(pB1, CB3);
 606     pB0  = _mm_add_ps(pB0, CB2);
 607     pB1  = _mm_mul_ps(pB1, w2);
 608     pB0  = _mm_mul_ps(pB0, w2);
 609     pB1  = _mm_add_ps(pB1, CB1);
 610     pB1  = _mm_mul_ps(pB1, w);
 611     pB0  = _mm_add_ps(pB0, pB1);
 612     pB0  = _mm_add_ps(pB0, CB0);
 613
 614     pC0  = _mm_mul_ps(CC10, t2);
 615     pC1  = _mm_mul_ps(CC9, t2);
 616     pC0  = _mm_add_ps(pC0, CC8);
 617     pC1  = _mm_add_ps(pC1, CC7);
 618     pC0  = _mm_mul_ps(pC0, t2);
 619     pC1  = _mm_mul_ps(pC1, t2);
 620     pC0  = _mm_add_ps(pC0, CC6);
 621     pC1  = _mm_add_ps(pC1, CC5);
 622     pC0  = _mm_mul_ps(pC0, t2);
 623     pC1  = _mm_mul_ps(pC1, t2);
 624     pC0  = _mm_add_ps(pC0, CC4);
 625     pC1  = _mm_add_ps(pC1, CC3);
 626     pC0  = _mm_mul_ps(pC0, t2);
 627     pC1  = _mm_mul_ps(pC1, t2);
 628     pC0  = _mm_add_ps(pC0, CC2);
 629     pC1  = _mm_add_ps(pC1, CC1);
 630     pC0  = _mm_mul_ps(pC0, t2);
 631     pC1  = _mm_mul_ps(pC1, t);
 632     pC0  = _mm_add_ps(pC0, pC1);
 633     pC0  = _mm_add_ps(pC0, CC0);
 634     pC0  = _mm_mul_ps(pC0, t);
 635
 636     /* SELECT pB0 or pC0 for erfc() */
 637     mask     = _mm_cmplt_ps(two, y);
 638     res_erfc = _mm_or_ps(_mm_andnot_ps(mask, pB0), _mm_and_ps(mask, pC0));
 639     res_erfc = _mm_mul_ps(res_erfc, expmx2);
 640
 641     /* erfc(x<0) = 2-erfc(|x|) */
 642     mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
 643     res_erfc = _mm_or_ps(_mm_andnot_ps(mask, res_erfc), _mm_and_ps(mask, _mm_sub_ps(two, res_erfc)));
 644
 645     /* Select erf() or erfc() */
 646     mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
 647     res  = _mm_or_ps(_mm_andnot_ps(mask, res_erfc), _mm_and_ps(mask, _mm_sub_ps(one, res_erf)));
 648
 649     return res;
 650 }
 651
 652
 653 /* Calculate the force correction due to PME analytically.
 654  *
 655  * This routine is meant to enable analytical evaluation of the
 656  * direct-space PME electrostatic force to avoid tables.
 657  *
 658  * The direct-space potential should be Erfc(beta*r)/r, but there
 659  * are some problems evaluating that:
 660  *
 661  * First, the error function is difficult (read: expensive) to
 662  * approxmiate accurately for intermediate to large arguments, and
 663  * this happens already in ranges of beta*r that occur in simulations.
 664  * Second, we now try to avoid calculating potentials in Gromacs but
 665  * use forces directly.
 666  *
 667  * We can simply things slight by noting that the PME part is really
 668  * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
 669  *
 670  * V= 1/r - Erf(beta*r)/r
 671  *
 672  * The first term we already have from the inverse square root, so
 673  * that we can leave out of this routine.
 674  *
 675  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
 676  * the argument beta*r will be in the range 0.15 to ~4. Use your
 677  * favorite plotting program to realize how well-behaved Erf(z)/z is
 678  * in this range!
 679  *
 680  * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
 681  * However, it turns out it is more efficient to approximate f(z)/z and
 682  * then only use even powers. This is another minor optimization, since
 683  * we actually WANT f(z)/z, because it is going to be multiplied by
 684  * the vector between the two atoms to get the vectorial force. The
 685  * fastest flops are the ones we can avoid calculating!
 686  *
 687  * So, here's how it should be used:
 688  *
 689  * 1. Calculate r^2.
 690  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 691  * 3. Evaluate this routine with z^2 as the argument.
 692  * 4. The return value is the expression:
 693  *
 694  *
 695  *       2*exp(-z^2)     erf(z)
 696  *       ------------ - --------
 697  *       sqrt(Pi)*z^2      z^3
 698  *
 699  * 5. Multiply the entire expression by beta^3. This will get you
 700  *
 701  *       beta^3*2*exp(-z^2)     beta^3*erf(z)
 702  *       ------------------  - ---------------
 703  *          sqrt(Pi)*z^2            z^3
 704  *
 705  *    or, switching back to r (z=r*beta):
 706  *
 707  *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
 708  *       ----------------------- - -----------
 709  *            sqrt(Pi)*r^2            r^3
 710  *
 711  *
 712  *    With a bit of math exercise you should be able to confirm that
 713  *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
 714  *
 715  * 6. Add the result to 1/r^3, multiply by the product of the charges,
 716  *    and you have your force (divided by r). A final multiplication
 717  *    with the vector connecting the two particles and you have your
 718  *    vectorial force to add to the particles.
 719  *
 720  */
 721 static gmx_inline __m128
 722 gmx_mm_pmecorrF_ps(__m128 z2)
 723 {
 724     const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
 725     const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
 726     const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
 727     const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
 728     const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
 729     const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
 730     const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);
 731
 732     const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
 733     const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
 734     const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
 735     const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
 736     const __m128  FD0      = _mm_set1_ps(1.0f);
 737
 738     __m128        z4;
 739     __m128        polyFN0, polyFN1, polyFD0, polyFD1;
 740
 741     z4             = _mm_mul_ps(z2, z2);
 742
 743     polyFD0        = _mm_mul_ps(FD4, z4);
 744     polyFD1        = _mm_mul_ps(FD3, z4);
 745     polyFD0        = _mm_add_ps(polyFD0, FD2);
 746     polyFD1        = _mm_add_ps(polyFD1, FD1);
 747     polyFD0        = _mm_mul_ps(polyFD0, z4);
 748     polyFD1        = _mm_mul_ps(polyFD1, z2);
 749     polyFD0        = _mm_add_ps(polyFD0, FD0);
 750     polyFD0        = _mm_add_ps(polyFD0, polyFD1);
 751
 752     polyFD0        = gmx_mm_inv_ps(polyFD0);
 753
 754     polyFN0        = _mm_mul_ps(FN6, z4);
 755     polyFN1        = _mm_mul_ps(FN5, z4);
 756     polyFN0        = _mm_add_ps(polyFN0, FN4);
 757     polyFN1        = _mm_add_ps(polyFN1, FN3);
 758     polyFN0        = _mm_mul_ps(polyFN0, z4);
 759     polyFN1        = _mm_mul_ps(polyFN1, z4);
 760     polyFN0        = _mm_add_ps(polyFN0, FN2);
 761     polyFN1        = _mm_add_ps(polyFN1, FN1);
 762     polyFN0        = _mm_mul_ps(polyFN0, z4);
 763     polyFN1        = _mm_mul_ps(polyFN1, z2);
 764     polyFN0        = _mm_add_ps(polyFN0, FN0);
 765     polyFN0        = _mm_add_ps(polyFN0, polyFN1);
 766
 767     return _mm_mul_ps(polyFN0, polyFD0);
 768 }
 769
 770
 771 /* Calculate the potential correction due to PME analytically.
 772  *
 773  * See gmx_mm256_pmecorrF_ps() for details about the approximation.
 774  *
 775  * This routine calculates Erf(z)/z, although you should provide z^2
 776  * as the input argument.
 777  *
 778  * Here's how it should be used:
 779  *
 780  * 1. Calculate r^2.
 781  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 782  * 3. Evaluate this routine with z^2 as the argument.
 783  * 4. The return value is the expression:
 784  *
 785  *
 786  *        erf(z)
 787  *       --------
 788  *          z
 789  *
 790  * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
 791  *
 792  *       erf(r*beta)
 793  *       -----------
 794  *           r
 795  *
 796  * 6. Subtract the result from 1/r, multiply by the product of the charges,
 797  *    and you have your potential.
 798  */
 799 static gmx_inline __m128
 800 gmx_mm_pmecorrV_ps(__m128 z2)
 801 {
 802     const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
 803     const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
 804     const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
 805     const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
 806     const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
 807     const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
 808     const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);
 809
 810     const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
 811     const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
 812     const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
 813     const __m128  VD0      = _mm_set1_ps(1.0f);
 814
 815     __m128        z4;
 816     __m128        polyVN0, polyVN1, polyVD0, polyVD1;
 817
 818     z4             = _mm_mul_ps(z2, z2);
 819
 820     polyVD1        = _mm_mul_ps(VD3, z4);
 821     polyVD0        = _mm_mul_ps(VD2, z4);
 822     polyVD1        = _mm_add_ps(polyVD1, VD1);
 823     polyVD0        = _mm_add_ps(polyVD0, VD0);
 824     polyVD1        = _mm_mul_ps(polyVD1, z2);
 825     polyVD0        = _mm_add_ps(polyVD0, polyVD1);
 826
 827     polyVD0        = gmx_mm_inv_ps(polyVD0);
 828
 829     polyVN0        = _mm_mul_ps(VN6, z4);
 830     polyVN1        = _mm_mul_ps(VN5, z4);
 831     polyVN0        = _mm_add_ps(polyVN0, VN4);
 832     polyVN1        = _mm_add_ps(polyVN1, VN3);
 833     polyVN0        = _mm_mul_ps(polyVN0, z4);
 834     polyVN1        = _mm_mul_ps(polyVN1, z4);
 835     polyVN0        = _mm_add_ps(polyVN0, VN2);
 836     polyVN1        = _mm_add_ps(polyVN1, VN1);
 837     polyVN0        = _mm_mul_ps(polyVN0, z4);
 838     polyVN1        = _mm_mul_ps(polyVN1, z2);
 839     polyVN0        = _mm_add_ps(polyVN0, VN0);
 840     polyVN0        = _mm_add_ps(polyVN0, polyVN1);
 841
 842     return _mm_mul_ps(polyVN0, polyVD0);
 843 }
 844
 845
 846 static int
 847 gmx_mm_sincos_ps(__m128  x,
 848                  __m128 *sinval,
 849                  __m128 *cosval)
 850 {
 851     const __m128  two_over_pi = _mm_set1_ps(2.0/M_PI);
 852     const __m128  half        = _mm_set1_ps(0.5);
 853     const __m128  one         = _mm_set1_ps(1.0);
 854
 855     const __m128i izero      = _mm_set1_epi32(0);
 856     const __m128i ione       = _mm_set1_epi32(1);
 857     const __m128i itwo       = _mm_set1_epi32(2);
 858     const __m128i ithree     = _mm_set1_epi32(3);
 859     const __m128  signbit    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 860
 861     const __m128  CA1         = _mm_set1_ps(1.5703125f);
 862     const __m128  CA2         = _mm_set1_ps(4.837512969970703125e-4f);
 863     const __m128  CA3         = _mm_set1_ps(7.54978995489188216e-8f);
 864
 865     const __m128  CC0         = _mm_set1_ps(-0.0013602249f);
 866     const __m128  CC1         = _mm_set1_ps(0.0416566950f);
 867     const __m128  CC2         = _mm_set1_ps(-0.4999990225f);
 868     const __m128  CS0         = _mm_set1_ps(-0.0001950727f);
 869     const __m128  CS1         = _mm_set1_ps(0.0083320758f);
 870     const __m128  CS2         = _mm_set1_ps(-0.1666665247f);
 871
 872     __m128        y, y2;
 873     __m128        z;
 874     __m128i       iz;
 875     __m128i       offset_sin, offset_cos;
 876     __m128        tmp1, tmp2;
 877     __m128        mask_sin, mask_cos;
 878     __m128        tmp_sin, tmp_cos;
 879
 880     y          = _mm_mul_ps(x, two_over_pi);
 881     y          = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));
 882
 883     iz         = _mm_cvttps_epi32(y);
 884     z          = _mm_cvtepi32_ps(iz);
 885
 886     offset_sin = _mm_and_si128(iz, ithree);
 887     offset_cos = _mm_add_epi32(iz, ione);
 888
 889     /* Extended precision arithmethic to achieve full precision */
 890     y               = _mm_mul_ps(z, CA1);
 891     tmp1            = _mm_mul_ps(z, CA2);
 892     tmp2            = _mm_mul_ps(z, CA3);
 893     y               = _mm_sub_ps(x, y);
 894     y               = _mm_sub_ps(y, tmp1);
 895     y               = _mm_sub_ps(y, tmp2);
 896
 897     y2              = _mm_mul_ps(y, y);
 898
 899     tmp1            = _mm_mul_ps(CC0, y2);
 900     tmp1            = _mm_add_ps(tmp1, CC1);
 901     tmp2            = _mm_mul_ps(CS0, y2);
 902     tmp2            = _mm_add_ps(tmp2, CS1);
 903     tmp1            = _mm_mul_ps(tmp1, y2);
 904     tmp1            = _mm_add_ps(tmp1, CC2);
 905     tmp2            = _mm_mul_ps(tmp2, y2);
 906     tmp2            = _mm_add_ps(tmp2, CS2);
 907
 908     tmp1            = _mm_mul_ps(tmp1, y2);
 909     tmp1            = _mm_add_ps(tmp1, one);
 910
 911     tmp2            = _mm_mul_ps(tmp2, _mm_mul_ps(y, y2));
 912     tmp2            = _mm_add_ps(tmp2, y);
 913
 914     mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
 915     mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));
 916
 917     tmp_sin         = _mm_or_ps( _mm_andnot_ps(mask_sin, tmp1), _mm_and_ps(mask_sin, tmp2) );
 918     tmp_cos         = _mm_or_ps( _mm_andnot_ps(mask_cos, tmp1), _mm_and_ps(mask_cos, tmp2) );
 919
 920     mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
 921     mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));
 922
 923     tmp1            = _mm_xor_ps(signbit, tmp_sin);
 924     tmp2            = _mm_xor_ps(signbit, tmp_cos);
 925
 926     *sinval         = _mm_or_ps( _mm_andnot_ps(mask_sin, tmp1), _mm_and_ps(mask_sin, tmp_sin) );
 927     *cosval         = _mm_or_ps( _mm_andnot_ps(mask_cos, tmp2), _mm_and_ps(mask_cos, tmp_cos) );
 928
 929     return 0;
 930 }
 931
 932 /*
 933  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 934  * will then call the sincos() routine and waste a factor 2 in performance!
 935  */
 936 static __m128
 937 gmx_mm_sin_ps(__m128 x)
 938 {
 939     __m128 s, c;
 940     gmx_mm_sincos_ps(x, &s, &c);
 941     return s;
 942 }
 943
 944 /*
 945  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 946  * will then call the sincos() routine and waste a factor 2 in performance!
 947  */
 948 static __m128
 949 gmx_mm_cos_ps(__m128 x)
 950 {
 951     __m128 s, c;
 952     gmx_mm_sincos_ps(x, &s, &c);
 953     return c;
 954 }
 955
 956
 957 static __m128
 958 gmx_mm_tan_ps(__m128 x)
 959 {
 960     __m128 sinval, cosval;
 961     __m128 tanval;
 962
 963     gmx_mm_sincos_ps(x, &sinval, &cosval);
 964
 965     tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval));
 966
 967     return tanval;
 968 }
 969
 970
 971 static __m128
 972 gmx_mm_asin_ps(__m128 x)
 973 {
 974     /* Same algorithm as cephes library */
 975     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
 976     const __m128 limitlow  = _mm_set1_ps(1e-4f);
 977     const __m128 half      = _mm_set1_ps(0.5f);
 978     const __m128 one       = _mm_set1_ps(1.0f);
 979     const __m128 halfpi    = _mm_set1_ps(M_PI/2.0f);
 980
 981     const __m128 CC5        = _mm_set1_ps(4.2163199048E-2f);
 982     const __m128 CC4        = _mm_set1_ps(2.4181311049E-2f);
 983     const __m128 CC3        = _mm_set1_ps(4.5470025998E-2f);
 984     const __m128 CC2        = _mm_set1_ps(7.4953002686E-2f);
 985     const __m128 CC1        = _mm_set1_ps(1.6666752422E-1f);
 986
 987     __m128       sign;
 988     __m128       mask;
 989     __m128       xabs;
 990     __m128       z, z1, z2, q, q1, q2;
 991     __m128       pA, pB;
 992
 993     sign  = _mm_andnot_ps(signmask, x);
 994     xabs  = _mm_and_ps(x, signmask);
 995
 996     mask  = _mm_cmpgt_ps(xabs, half);
 997
 998     z1    = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
 999     q1    = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
1000     q1    = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one), q1);
1001
1002     q2    = xabs;
1003     z2    = _mm_mul_ps(q2, q2);
1004
1005     z     = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
1006     q     = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );
1007
1008     z2    = _mm_mul_ps(z, z);
1009
1010     pA    = _mm_mul_ps(CC5, z2);
1011     pB    = _mm_mul_ps(CC4, z2);
1012
1013     pA    = _mm_add_ps(pA, CC3);
1014     pB    = _mm_add_ps(pB, CC2);
1015
1016     pA    = _mm_mul_ps(pA, z2);
1017     pB    = _mm_mul_ps(pB, z2);
1018
1019     pA    = _mm_add_ps(pA, CC1);
1020     pA    = _mm_mul_ps(pA, z);
1021
1022     z     = _mm_add_ps(pA, pB);
1023     z     = _mm_mul_ps(z, q);
1024     z     = _mm_add_ps(z, q);
1025
1026     q2    = _mm_sub_ps(halfpi, z);
1027     q2    = _mm_sub_ps(q2, z);
1028
1029     z     = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );
1030
1031     mask  = _mm_cmpgt_ps(xabs, limitlow);
1032     z     = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );
1033
1034     z = _mm_xor_ps(z, sign);
1035
1036     return z;
1037 }
1038
1039
1040 static __m128
1041 gmx_mm_acos_ps(__m128 x)
1042 {
1043     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
1044     const __m128 one_ps    = _mm_set1_ps(1.0f);
1045     const __m128 half_ps   = _mm_set1_ps(0.5f);
1046     const __m128 pi_ps     = _mm_set1_ps(M_PI);
1047     const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
1048
1049     __m128       mask1;
1050     __m128       mask2;
1051     __m128       xabs;
1052     __m128       z, z1, z2, z3;
1053
1054     xabs  = _mm_and_ps(x, signmask);
1055     mask1 = _mm_cmpgt_ps(xabs, half_ps);
1056     mask2 = _mm_cmpgt_ps(x, _mm_setzero_ps());
1057
1058     z     = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
1059     z     = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
1060     z     = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one_ps), z);
1061
1062     z     = _mm_or_ps( _mm_and_ps(mask1, z), _mm_andnot_ps(mask1, x) );
1063     z     = gmx_mm_asin_ps(z);
1064
1065     z2    = _mm_add_ps(z, z);
1066     z1    = _mm_sub_ps(pi_ps, z2);
1067     z3    = _mm_sub_ps(halfpi_ps, z);
1068
1069     z     = _mm_or_ps( _mm_and_ps(mask2, z2), _mm_andnot_ps(mask2, z1) );
1070     z     = _mm_or_ps( _mm_and_ps(mask1, z), _mm_andnot_ps(mask1, z3) );
1071
1072     return z;
1073 }
1074
1075
1076 static __m128
1077 gmx_mm_atan_ps(__m128 x)
1078 {
1079     /* Same algorithm as cephes library */
1080     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
1081     const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
1082     const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
1083     const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
1084     const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
1085     const __m128 mone      = _mm_set1_ps(-1.0f);
1086     const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
1087     const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
1088     const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1);
1089     const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);
1090
1091     __m128       sign;
1092     __m128       mask1, mask2;
1093     __m128       y, z1, z2;
1094     __m128       x2, x4;
1095     __m128       sum1, sum2;
1096
1097     sign  = _mm_andnot_ps(signmask, x);
1098     x     = _mm_and_ps(x, signmask);
1099
1100     mask1 = _mm_cmpgt_ps(x, limit1);
1101     mask2 = _mm_cmpgt_ps(x, limit2);
1102
1103     z1    = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
1104     z2    = _mm_mul_ps(mone, gmx_mm_inv_ps(x));
1105
1106     y     = _mm_and_ps(mask1, quarterpi);
1107     y     = _mm_or_ps( _mm_and_ps(mask2, halfpi), _mm_andnot_ps(mask2, y) );
1108
1109     x     = _mm_or_ps( _mm_and_ps(mask1, z1), _mm_andnot_ps(mask1, x) );
1110     x     = _mm_or_ps( _mm_and_ps(mask2, z2), _mm_andnot_ps(mask2, x) );
1111
1112     x2    = _mm_mul_ps(x, x);
1113     x4    = _mm_mul_ps(x2, x2);
1114
1115     sum1  = _mm_mul_ps(CC9, x4);
1116     sum2  = _mm_mul_ps(CC7, x4);
1117     sum1  = _mm_add_ps(sum1, CC5);
1118     sum2  = _mm_add_ps(sum2, CC3);
1119     sum1  = _mm_mul_ps(sum1, x4);
1120     sum2  = _mm_mul_ps(sum2, x2);
1121
1122     sum1  = _mm_add_ps(sum1, sum2);
1123     sum1  = _mm_sub_ps(sum1, mone);
1124     sum1  = _mm_mul_ps(sum1, x);
1125     y     = _mm_add_ps(y, sum1);
1126
1127     y     = _mm_xor_ps(y, sign);
1128
1129     return y;
1130 }
1131
1132
1133 static __m128
1134 gmx_mm_atan2_ps(__m128 y, __m128 x)
1135 {
1136     const __m128 pi          = _mm_set1_ps(M_PI);
1137     const __m128 minuspi     = _mm_set1_ps(-M_PI);
1138     const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
1139     const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
1140
1141     __m128       z, z1, z3, z4;
1142     __m128       w;
1143     __m128       maskx_lt, maskx_eq;
1144     __m128       masky_lt, masky_eq;
1145     __m128       mask1, mask2, mask3, mask4, maskall;
1146
1147     maskx_lt  = _mm_cmplt_ps(x, _mm_setzero_ps());
1148     masky_lt  = _mm_cmplt_ps(y, _mm_setzero_ps());
1149     maskx_eq  = _mm_cmpeq_ps(x, _mm_setzero_ps());
1150     masky_eq  = _mm_cmpeq_ps(y, _mm_setzero_ps());
1151
1152     z         = _mm_mul_ps(y, gmx_mm_inv_ps(x));
1153     z         = gmx_mm_atan_ps(z);
1154
1155     mask1     = _mm_and_ps(maskx_eq, masky_lt);
1156     mask2     = _mm_andnot_ps(maskx_lt, masky_eq);
1157     mask3     = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
1158     mask4     = _mm_and_ps(masky_eq, maskx_lt);
1159
1160     maskall   = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );
1161
1162     z         = _mm_andnot_ps(maskall, z);
1163     z1        = _mm_and_ps(mask1, minushalfpi);
1164     z3        = _mm_and_ps(mask3, halfpi);
1165     z4        = _mm_and_ps(mask4, pi);
1166
1167     z         = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );
1168
1169     mask1     = _mm_andnot_ps(masky_lt, maskx_lt);
1170     mask2     = _mm_and_ps(maskx_lt, masky_lt);
1171
1172     w         = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
1173     w         = _mm_andnot_ps(maskall, w);
1174
1175     z         = _mm_add_ps(z, w);
1176
1177     return z;
1178 }
1179
1180
1181 #endif