src/gromacs/legacyheaders/gmx_math_x86_sse2_single.h

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  * This file is part of GROMACS.
   5  * Copyright (c) 2012-
   6  *
   7  * Written by the Gromacs development team under coordination of
   8  * David van der Spoel, Berk Hess, and Erik Lindahl.
   9  *
  10  * This library is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2
  13  * of the License, or (at your option) any later version.
  14  *
  15  * To help us fund GROMACS development, we humbly ask that you cite
  16  * the research papers on the package. Check out http://www.gromacs.org
  17  *
  18  * And Hey:
  19  * Gnomes, ROck Monsters And Chili Sauce
  20  */
  21 #ifndef _gmx_math_x86_sse2_single_h_
  22 #define _gmx_math_x86_sse2_single_h_
  23
  24
  25 #include <stdio.h>
  26 #include <math.h>
  27
  28 #include "gmx_x86_sse2.h"
  29
  30
  31 #ifndef M_PI
  32 #  define M_PI 3.14159265358979323846264338327950288
  33 #endif
  34
  35
  36
  37 /************************
  38  *                      *
  39  * Simple math routines *
  40  *                      *
  41  ************************/
  42
  43 /* 1.0/sqrt(x) */
  44 static gmx_inline __m128
  45 gmx_mm_invsqrt_ps(__m128 x)
  46 {
  47     const __m128 half  = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
  48     const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);
  49
  50     __m128       lu = _mm_rsqrt_ps(x);
  51
  52     return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
  53 }
  54
  55 /* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
  56 static gmx_inline __m128
  57 gmx_mm_sqrt_ps(__m128 x)
  58 {
  59     __m128 mask;
  60     __m128 res;
  61
  62     mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
  63     res  = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));
  64
  65     res  = _mm_mul_ps(x, res);
  66
  67     return res;
  68 }
  69
  70 /* 1.0/x */
  71 static gmx_inline __m128
  72 gmx_mm_inv_ps(__m128 x)
  73 {
  74     const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
  75
  76     __m128       lu = _mm_rcp_ps(x);
  77
  78     return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
  79 }
  80
  81 static gmx_inline __m128
  82 gmx_mm_abs_ps(__m128 x)
  83 {
  84     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
  85
  86     return _mm_and_ps(x, signmask);
  87 }
  88
  89
  90 static __m128
  91 gmx_mm_log_ps(__m128 x)
  92 {
  93     /* Same algorithm as cephes library */
  94     const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
  95     const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
  96     const __m128  half       = _mm_set1_ps(0.5f);
  97     const __m128  one        = _mm_set1_ps(1.0f);
  98     const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
  99     const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
 100     const __m128  corr2      = _mm_set1_ps(0.693359375f);
 101
 102     const __m128  CA_1        = _mm_set1_ps(0.070376836292f);
 103     const __m128  CB_0        = _mm_set1_ps(1.6714950086782716f);
 104     const __m128  CB_1        = _mm_set1_ps(-2.452088066061482f);
 105     const __m128  CC_0        = _mm_set1_ps(1.5220770854701728f);
 106     const __m128  CC_1        = _mm_set1_ps(-1.3422238433233642f);
 107     const __m128  CD_0        = _mm_set1_ps(1.386218787509749f);
 108     const __m128  CD_1        = _mm_set1_ps(0.35075468953796346f);
 109     const __m128  CE_0        = _mm_set1_ps(1.3429983063133937f);
 110     const __m128  CE_1        = _mm_set1_ps(1.807420826584643f);
 111
 112     __m128        fexp, fexp1;
 113     __m128i       iexp;
 114     __m128        mask;
 115     __m128        x1, x2;
 116     __m128        y;
 117     __m128        pA, pB, pC, pD, pE, tB, tC, tD, tE;
 118
 119     /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
 120     fexp  = _mm_and_ps(x, expmask);
 121     iexp  = gmx_mm_castps_si128(fexp);
 122     iexp  = _mm_srli_epi32(iexp, 23);
 123     iexp  = _mm_sub_epi32(iexp, expbase_m1);
 124
 125     x     = _mm_andnot_ps(expmask, x);
 126     x     = _mm_or_ps(x, one);
 127     x     = _mm_mul_ps(x, half);
 128
 129     mask  = _mm_cmplt_ps(x, invsq2);
 130
 131     x     = _mm_add_ps(x, _mm_and_ps(mask, x));
 132     x     = _mm_sub_ps(x, one);
 133     iexp  = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
 134
 135     x2    = _mm_mul_ps(x, x);
 136
 137     pA    = _mm_mul_ps(CA_1, x);
 138     pB    = _mm_mul_ps(CB_1, x);
 139     pC    = _mm_mul_ps(CC_1, x);
 140     pD    = _mm_mul_ps(CD_1, x);
 141     pE    = _mm_mul_ps(CE_1, x);
 142     tB    = _mm_add_ps(CB_0, x2);
 143     tC    = _mm_add_ps(CC_0, x2);
 144     tD    = _mm_add_ps(CD_0, x2);
 145     tE    = _mm_add_ps(CE_0, x2);
 146     pB    = _mm_add_ps(pB, tB);
 147     pC    = _mm_add_ps(pC, tC);
 148     pD    = _mm_add_ps(pD, tD);
 149     pE    = _mm_add_ps(pE, tE);
 150
 151     pA    = _mm_mul_ps(pA, pB);
 152     pC    = _mm_mul_ps(pC, pD);
 153     pE    = _mm_mul_ps(pE, x2);
 154     pA    = _mm_mul_ps(pA, pC);
 155     y     = _mm_mul_ps(pA, pE);
 156
 157     fexp  = _mm_cvtepi32_ps(iexp);
 158     y     = _mm_add_ps(y, _mm_mul_ps(fexp, corr1));
 159
 160     y     = _mm_sub_ps(y, _mm_mul_ps(half, x2));
 161     x2    = _mm_add_ps(x, y);
 162
 163     x2    = _mm_add_ps(x2, _mm_mul_ps(fexp, corr2));
 164
 165     return x2;
 166 }
 167
 168
 169 /*
 170  * 2^x function.
 171  *
 172  * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
 173  * [-0.5,0.5]. The coefficiencts of this was derived in Mathematica using the command:
 174  *
 175  * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 176  *
 177  * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 178  * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 179  * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 180  * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 181  * number instead. That would take a few extra cycles and not really help, since something is
 182  * wrong if you are using single precision to work with numbers that cannot really be represented
 183  * in single precision.
 184  *
 185  * The accuracy is at least 23 bits.
 186  */
 187 static __m128
 188 gmx_mm_exp2_ps(__m128 x)
 189 {
 190     /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
 191     const __m128  arglimit = _mm_set1_ps(126.0f);
 192
 193     const __m128i expbase  = _mm_set1_epi32(127);
 194     const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
 195     const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
 196     const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
 197     const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
 198     const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
 199     const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
 200     const __m128  CA0      = _mm_set1_ps(1.0f);
 201
 202
 203     __m128  valuemask;
 204     __m128i iexppart;
 205     __m128  fexppart;
 206     __m128  intpart;
 207     __m128  x2;
 208     __m128  p0, p1;
 209
 210     iexppart  = _mm_cvtps_epi32(x);
 211     intpart   = _mm_cvtepi32_ps(iexppart);
 212     iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
 213     valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(x));
 214     fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
 215
 216     x         = _mm_sub_ps(x, intpart);
 217     x2        = _mm_mul_ps(x, x);
 218
 219     p0        = _mm_mul_ps(CA6, x2);
 220     p1        = _mm_mul_ps(CA5, x2);
 221     p0        = _mm_add_ps(p0, CA4);
 222     p1        = _mm_add_ps(p1, CA3);
 223     p0        = _mm_mul_ps(p0, x2);
 224     p1        = _mm_mul_ps(p1, x2);
 225     p0        = _mm_add_ps(p0, CA2);
 226     p1        = _mm_add_ps(p1, CA1);
 227     p0        = _mm_mul_ps(p0, x2);
 228     p1        = _mm_mul_ps(p1, x);
 229     p0        = _mm_add_ps(p0, CA0);
 230     p0        = _mm_add_ps(p0, p1);
 231     x         = _mm_mul_ps(p0, fexppart);
 232
 233     return x;
 234 }
 235
 236
 237 /* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
 238  * but there will then be a small rounding error since we lose some precision due to the
 239  * multiplication. This will then be magnified a lot by the exponential.
 240  *
 241  * Instead, we calculate the fractional part directly as a minimax approximation of
 242  * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
 243  * remaining after 2^y, which avoids the precision-loss.
 244  * The final result is correct to within 1 LSB over the entire argument range.
 245  */
 246 static __m128
 247 gmx_mm_exp_ps(__m128 x)
 248 {
 249     const __m128  argscale      = _mm_set1_ps(1.44269504088896341f);
 250     /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
 251     const __m128  arglimit      = _mm_set1_ps(126.0f);
 252     const __m128i expbase       = _mm_set1_epi32(127);
 253
 254     const __m128  invargscale0  = _mm_set1_ps(0.693359375f);
 255     const __m128  invargscale1  = _mm_set1_ps(-2.12194440e-4f);
 256
 257     const __m128  CC5           = _mm_set1_ps(1.9875691500e-4f);
 258     const __m128  CC4           = _mm_set1_ps(1.3981999507e-3f);
 259     const __m128  CC3           = _mm_set1_ps(8.3334519073e-3f);
 260     const __m128  CC2           = _mm_set1_ps(4.1665795894e-2f);
 261     const __m128  CC1           = _mm_set1_ps(1.6666665459e-1f);
 262     const __m128  CC0           = _mm_set1_ps(5.0000001201e-1f);
 263     const __m128  one           = _mm_set1_ps(1.0f);
 264
 265     __m128        y, x2;
 266     __m128        p0, p1;
 267     __m128        valuemask;
 268     __m128i       iexppart;
 269     __m128        fexppart;
 270     __m128        intpart;
 271
 272     y = _mm_mul_ps(x, argscale);
 273
 274     iexppart  = _mm_cvtps_epi32(y);
 275     intpart   = _mm_cvtepi32_ps(iexppart);
 276
 277     iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
 278     valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(y));
 279     fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
 280
 281     /* Extended precision arithmetics */
 282     x         = _mm_sub_ps(x, _mm_mul_ps(invargscale0, intpart));
 283     x         = _mm_sub_ps(x, _mm_mul_ps(invargscale1, intpart));
 284
 285     x2        = _mm_mul_ps(x, x);
 286
 287     p1        = _mm_mul_ps(CC5, x2);
 288     p0        = _mm_mul_ps(CC4, x2);
 289     p1        = _mm_add_ps(p1, CC3);
 290     p0        = _mm_add_ps(p0, CC2);
 291     p1        = _mm_mul_ps(p1, x2);
 292     p0        = _mm_mul_ps(p0, x2);
 293     p1        = _mm_add_ps(p1, CC1);
 294     p0        = _mm_add_ps(p0, CC0);
 295     p1        = _mm_mul_ps(p1, x);
 296     p0        = _mm_add_ps(p0, p1);
 297     p0        = _mm_mul_ps(p0, x2);
 298     x         = _mm_add_ps(x, one);
 299     x         = _mm_add_ps(x, p0);
 300
 301     x         = _mm_mul_ps(x, fexppart);
 302
 303     return x;
 304 }
 305
 306 /* FULL precision. Only errors in LSB */
 307 static __m128
 308 gmx_mm_erf_ps(__m128 x)
 309 {
 310     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 311     const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
 312     const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
 313     const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
 314     const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
 315     const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
 316     const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
 317     const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
 318     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 319     const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
 320     const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
 321     const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
 322     const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
 323     const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
 324     const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
 325     const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
 326     const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
 327     const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
 328     const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
 329     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 330     const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
 331     const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
 332     const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
 333     const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
 334     const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
 335     const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
 336     const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
 337     const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
 338     const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
 339     const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
 340     const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
 341
 342     /* Coefficients for expansion of exp(x) in [0,0.1] */
 343     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
 344     const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
 345     const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
 346     const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
 347
 348     const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
 349     const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 350     const __m128  one      = _mm_set1_ps(1.0f);
 351     const __m128  two      = _mm_set1_ps(2.0f);
 352
 353     __m128        x2, x4, y;
 354     __m128        z, q, t, t2, w, w2;
 355     __m128        pA0, pA1, pB0, pB1, pC0, pC1;
 356     __m128        expmx2, corr;
 357     __m128        res_erf, res_erfc, res;
 358     __m128        mask;
 359
 360     /* Calculate erf() */
 361     x2     = _mm_mul_ps(x, x);
 362     x4     = _mm_mul_ps(x2, x2);
 363
 364     pA0  = _mm_mul_ps(CA6, x4);
 365     pA1  = _mm_mul_ps(CA5, x4);
 366     pA0  = _mm_add_ps(pA0, CA4);
 367     pA1  = _mm_add_ps(pA1, CA3);
 368     pA0  = _mm_mul_ps(pA0, x4);
 369     pA1  = _mm_mul_ps(pA1, x4);
 370     pA0  = _mm_add_ps(pA0, CA2);
 371     pA1  = _mm_add_ps(pA1, CA1);
 372     pA0  = _mm_mul_ps(pA0, x4);
 373     pA1  = _mm_mul_ps(pA1, x2);
 374     pA0  = _mm_add_ps(pA0, pA1);
 375     pA0  = _mm_add_ps(pA0, CA0);
 376
 377     res_erf = _mm_mul_ps(x, pA0);
 378
 379     /* Calculate erfc */
 380
 381     y       = gmx_mm_abs_ps(x);
 382     t       = gmx_mm_inv_ps(y);
 383     w       = _mm_sub_ps(t, one);
 384     t2      = _mm_mul_ps(t, t);
 385     w2      = _mm_mul_ps(w, w);
 386     /*
 387      * We cannot simply calculate exp(-x2) directly in single precision, since
 388      * that will lose a couple of bits of precision due to the multiplication.
 389      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
 390      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
 391      *
 392      * The only drawback with this is that it requires TWO separate exponential
 393      * evaluations, which would be horrible performance-wise. However, the argument
 394      * for the second exp() call is always small, so there we simply use a
 395      * low-order minimax expansion on [0,0.1].
 396      */
 397
 398     z       = _mm_and_ps(y, sieve);
 399     q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
 400
 401     corr    = _mm_mul_ps(CD4, q);
 402     corr    = _mm_add_ps(corr, CD3);
 403     corr    = _mm_mul_ps(corr, q);
 404     corr    = _mm_add_ps(corr, CD2);
 405     corr    = _mm_mul_ps(corr, q);
 406     corr    = _mm_add_ps(corr, one);
 407     corr    = _mm_mul_ps(corr, q);
 408     corr    = _mm_add_ps(corr, one);
 409
 410     expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
 411     expmx2  = _mm_mul_ps(expmx2, corr);
 412
 413     pB1  = _mm_mul_ps(CB9, w2);
 414     pB0  = _mm_mul_ps(CB8, w2);
 415     pB1  = _mm_add_ps(pB1, CB7);
 416     pB0  = _mm_add_ps(pB0, CB6);
 417     pB1  = _mm_mul_ps(pB1, w2);
 418     pB0  = _mm_mul_ps(pB0, w2);
 419     pB1  = _mm_add_ps(pB1, CB5);
 420     pB0  = _mm_add_ps(pB0, CB4);
 421     pB1  = _mm_mul_ps(pB1, w2);
 422     pB0  = _mm_mul_ps(pB0, w2);
 423     pB1  = _mm_add_ps(pB1, CB3);
 424     pB0  = _mm_add_ps(pB0, CB2);
 425     pB1  = _mm_mul_ps(pB1, w2);
 426     pB0  = _mm_mul_ps(pB0, w2);
 427     pB1  = _mm_add_ps(pB1, CB1);
 428     pB1  = _mm_mul_ps(pB1, w);
 429     pB0  = _mm_add_ps(pB0, pB1);
 430     pB0  = _mm_add_ps(pB0, CB0);
 431
 432     pC0  = _mm_mul_ps(CC10, t2);
 433     pC1  = _mm_mul_ps(CC9, t2);
 434     pC0  = _mm_add_ps(pC0, CC8);
 435     pC1  = _mm_add_ps(pC1, CC7);
 436     pC0  = _mm_mul_ps(pC0, t2);
 437     pC1  = _mm_mul_ps(pC1, t2);
 438     pC0  = _mm_add_ps(pC0, CC6);
 439     pC1  = _mm_add_ps(pC1, CC5);
 440     pC0  = _mm_mul_ps(pC0, t2);
 441     pC1  = _mm_mul_ps(pC1, t2);
 442     pC0  = _mm_add_ps(pC0, CC4);
 443     pC1  = _mm_add_ps(pC1, CC3);
 444     pC0  = _mm_mul_ps(pC0, t2);
 445     pC1  = _mm_mul_ps(pC1, t2);
 446     pC0  = _mm_add_ps(pC0, CC2);
 447     pC1  = _mm_add_ps(pC1, CC1);
 448     pC0  = _mm_mul_ps(pC0, t2);
 449     pC1  = _mm_mul_ps(pC1, t);
 450     pC0  = _mm_add_ps(pC0, pC1);
 451     pC0  = _mm_add_ps(pC0, CC0);
 452     pC0  = _mm_mul_ps(pC0, t);
 453
 454     /* SELECT pB0 or pC0 for erfc() */
 455     mask     = _mm_cmplt_ps(two, y);
 456     res_erfc = _mm_or_ps(_mm_andnot_ps(mask, pB0), _mm_and_ps(mask, pC0));
 457     res_erfc = _mm_mul_ps(res_erfc, expmx2);
 458
 459     /* erfc(x<0) = 2-erfc(|x|) */
 460     mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
 461     res_erfc = _mm_or_ps(_mm_andnot_ps(mask, res_erfc),
 462                          _mm_and_ps(mask, _mm_sub_ps(two, res_erfc)));
 463
 464     /* Select erf() or erfc() */
 465     mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
 466     res  = _mm_or_ps(_mm_andnot_ps(mask, _mm_sub_ps(one, res_erfc)), _mm_and_ps(mask, res_erf));
 467
 468     return res;
 469 }
 470
 471
 472
 473
 474
 475 /* FULL precision. Only errors in LSB */
 476 static __m128
 477 gmx_mm_erfc_ps(__m128 x)
 478 {
 479     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 480     const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
 481     const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
 482     const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
 483     const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
 484     const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
 485     const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
 486     const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
 487     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 488     const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
 489     const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
 490     const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
 491     const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
 492     const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
 493     const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
 494     const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
 495     const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
 496     const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
 497     const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
 498     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 499     const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
 500     const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
 501     const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
 502     const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
 503     const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
 504     const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
 505     const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
 506     const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
 507     const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
 508     const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
 509     const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
 510
 511     /* Coefficients for expansion of exp(x) in [0,0.1] */
 512     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
 513     const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
 514     const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
 515     const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
 516
 517     const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
 518     const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 519     const __m128  one      = _mm_set1_ps(1.0f);
 520     const __m128  two      = _mm_set1_ps(2.0f);
 521
 522     __m128        x2, x4, y;
 523     __m128        z, q, t, t2, w, w2;
 524     __m128        pA0, pA1, pB0, pB1, pC0, pC1;
 525     __m128        expmx2, corr;
 526     __m128        res_erf, res_erfc, res;
 527     __m128        mask;
 528
 529     /* Calculate erf() */
 530     x2     = _mm_mul_ps(x, x);
 531     x4     = _mm_mul_ps(x2, x2);
 532
 533     pA0  = _mm_mul_ps(CA6, x4);
 534     pA1  = _mm_mul_ps(CA5, x4);
 535     pA0  = _mm_add_ps(pA0, CA4);
 536     pA1  = _mm_add_ps(pA1, CA3);
 537     pA0  = _mm_mul_ps(pA0, x4);
 538     pA1  = _mm_mul_ps(pA1, x4);
 539     pA0  = _mm_add_ps(pA0, CA2);
 540     pA1  = _mm_add_ps(pA1, CA1);
 541     pA0  = _mm_mul_ps(pA0, x4);
 542     pA1  = _mm_mul_ps(pA1, x2);
 543     pA0  = _mm_add_ps(pA0, pA1);
 544     pA0  = _mm_add_ps(pA0, CA0);
 545
 546     res_erf = _mm_mul_ps(x, pA0);
 547
 548     /* Calculate erfc */
 549     y       = gmx_mm_abs_ps(x);
 550     t       = gmx_mm_inv_ps(y);
 551     w       = _mm_sub_ps(t, one);
 552     t2      = _mm_mul_ps(t, t);
 553     w2      = _mm_mul_ps(w, w);
 554     /*
 555      * We cannot simply calculate exp(-x2) directly in single precision, since
 556      * that will lose a couple of bits of precision due to the multiplication.
 557      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
 558      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
 559      *
 560      * The only drawback with this is that it requires TWO separate exponential
 561      * evaluations, which would be horrible performance-wise. However, the argument
 562      * for the second exp() call is always small, so there we simply use a
 563      * low-order minimax expansion on [0,0.1].
 564      */
 565
 566     z       = _mm_and_ps(y, sieve);
 567     q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
 568
 569     corr    = _mm_mul_ps(CD4, q);
 570     corr    = _mm_add_ps(corr, CD3);
 571     corr    = _mm_mul_ps(corr, q);
 572     corr    = _mm_add_ps(corr, CD2);
 573     corr    = _mm_mul_ps(corr, q);
 574     corr    = _mm_add_ps(corr, one);
 575     corr    = _mm_mul_ps(corr, q);
 576     corr    = _mm_add_ps(corr, one);
 577
 578     expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
 579     expmx2  = _mm_mul_ps(expmx2, corr);
 580
 581     pB1  = _mm_mul_ps(CB9, w2);
 582     pB0  = _mm_mul_ps(CB8, w2);
 583     pB1  = _mm_add_ps(pB1, CB7);
 584     pB0  = _mm_add_ps(pB0, CB6);
 585     pB1  = _mm_mul_ps(pB1, w2);
 586     pB0  = _mm_mul_ps(pB0, w2);
 587     pB1  = _mm_add_ps(pB1, CB5);
 588     pB0  = _mm_add_ps(pB0, CB4);
 589     pB1  = _mm_mul_ps(pB1, w2);
 590     pB0  = _mm_mul_ps(pB0, w2);
 591     pB1  = _mm_add_ps(pB1, CB3);
 592     pB0  = _mm_add_ps(pB0, CB2);
 593     pB1  = _mm_mul_ps(pB1, w2);
 594     pB0  = _mm_mul_ps(pB0, w2);
 595     pB1  = _mm_add_ps(pB1, CB1);
 596     pB1  = _mm_mul_ps(pB1, w);
 597     pB0  = _mm_add_ps(pB0, pB1);
 598     pB0  = _mm_add_ps(pB0, CB0);
 599
 600     pC0  = _mm_mul_ps(CC10, t2);
 601     pC1  = _mm_mul_ps(CC9, t2);
 602     pC0  = _mm_add_ps(pC0, CC8);
 603     pC1  = _mm_add_ps(pC1, CC7);
 604     pC0  = _mm_mul_ps(pC0, t2);
 605     pC1  = _mm_mul_ps(pC1, t2);
 606     pC0  = _mm_add_ps(pC0, CC6);
 607     pC1  = _mm_add_ps(pC1, CC5);
 608     pC0  = _mm_mul_ps(pC0, t2);
 609     pC1  = _mm_mul_ps(pC1, t2);
 610     pC0  = _mm_add_ps(pC0, CC4);
 611     pC1  = _mm_add_ps(pC1, CC3);
 612     pC0  = _mm_mul_ps(pC0, t2);
 613     pC1  = _mm_mul_ps(pC1, t2);
 614     pC0  = _mm_add_ps(pC0, CC2);
 615     pC1  = _mm_add_ps(pC1, CC1);
 616     pC0  = _mm_mul_ps(pC0, t2);
 617     pC1  = _mm_mul_ps(pC1, t);
 618     pC0  = _mm_add_ps(pC0, pC1);
 619     pC0  = _mm_add_ps(pC0, CC0);
 620     pC0  = _mm_mul_ps(pC0, t);
 621
 622     /* SELECT pB0 or pC0 for erfc() */
 623     mask     = _mm_cmplt_ps(two, y);
 624     res_erfc = _mm_or_ps(_mm_andnot_ps(mask, pB0), _mm_and_ps(mask, pC0));
 625     res_erfc = _mm_mul_ps(res_erfc, expmx2);
 626
 627     /* erfc(x<0) = 2-erfc(|x|) */
 628     mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
 629     res_erfc = _mm_or_ps(_mm_andnot_ps(mask, res_erfc), _mm_and_ps(mask, _mm_sub_ps(two, res_erfc)));
 630
 631     /* Select erf() or erfc() */
 632     mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
 633     res  = _mm_or_ps(_mm_andnot_ps(mask, res_erfc), _mm_and_ps(mask, _mm_sub_ps(one, res_erf)));
 634
 635     return res;
 636 }
 637
 638
 639 /* Calculate the force correction due to PME analytically.
 640  *
 641  * This routine is meant to enable analytical evaluation of the
 642  * direct-space PME electrostatic force to avoid tables.
 643  *
 644  * The direct-space potential should be Erfc(beta*r)/r, but there
 645  * are some problems evaluating that:
 646  *
 647  * First, the error function is difficult (read: expensive) to
 648  * approxmiate accurately for intermediate to large arguments, and
 649  * this happens already in ranges of beta*r that occur in simulations.
 650  * Second, we now try to avoid calculating potentials in Gromacs but
 651  * use forces directly.
 652  *
 653  * We can simply things slight by noting that the PME part is really
 654  * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
 655  *
 656  * V= 1/r - Erf(beta*r)/r
 657  *
 658  * The first term we already have from the inverse square root, so
 659  * that we can leave out of this routine.
 660  *
 661  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
 662  * the argument beta*r will be in the range 0.15 to ~4. Use your
 663  * favorite plotting program to realize how well-behaved Erf(z)/z is
 664  * in this range!
 665  *
 666  * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
 667  * However, it turns out it is more efficient to approximate f(z)/z and
 668  * then only use even powers. This is another minor optimization, since
 669  * we actually WANT f(z)/z, because it is going to be multiplied by
 670  * the vector between the two atoms to get the vectorial force. The
 671  * fastest flops are the ones we can avoid calculating!
 672  *
 673  * So, here's how it should be used:
 674  *
 675  * 1. Calculate r^2.
 676  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 677  * 3. Evaluate this routine with z^2 as the argument.
 678  * 4. The return value is the expression:
 679  *
 680  *
 681  *       2*exp(-z^2)     erf(z)
 682  *       ------------ - --------
 683  *       sqrt(Pi)*z^2      z^3
 684  *
 685  * 5. Multiply the entire expression by beta^3. This will get you
 686  *
 687  *       beta^3*2*exp(-z^2)     beta^3*erf(z)
 688  *       ------------------  - ---------------
 689  *          sqrt(Pi)*z^2            z^3
 690  *
 691  *    or, switching back to r (z=r*beta):
 692  *
 693  *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
 694  *       ----------------------- - -----------
 695  *            sqrt(Pi)*r^2            r^3
 696  *
 697  *
 698  *    With a bit of math exercise you should be able to confirm that
 699  *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
 700  *
 701  * 6. Add the result to 1/r^3, multiply by the product of the charges,
 702  *    and you have your force (divided by r). A final multiplication
 703  *    with the vector connecting the two particles and you have your
 704  *    vectorial force to add to the particles.
 705  *
 706  */
 707 static gmx_inline __m128
 708 gmx_mm_pmecorrF_ps(__m128 z2)
 709 {
 710     const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
 711     const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
 712     const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
 713     const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
 714     const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
 715     const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
 716     const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);
 717
 718     const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
 719     const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
 720     const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
 721     const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
 722     const __m128  FD0      = _mm_set1_ps(1.0f);
 723
 724     __m128        z4;
 725     __m128        polyFN0, polyFN1, polyFD0, polyFD1;
 726
 727     z4             = _mm_mul_ps(z2, z2);
 728
 729     polyFD0        = _mm_mul_ps(FD4, z4);
 730     polyFD1        = _mm_mul_ps(FD3, z4);
 731     polyFD0        = _mm_add_ps(polyFD0, FD2);
 732     polyFD1        = _mm_add_ps(polyFD1, FD1);
 733     polyFD0        = _mm_mul_ps(polyFD0, z4);
 734     polyFD1        = _mm_mul_ps(polyFD1, z2);
 735     polyFD0        = _mm_add_ps(polyFD0, FD0);
 736     polyFD0        = _mm_add_ps(polyFD0, polyFD1);
 737
 738     polyFD0        = gmx_mm_inv_ps(polyFD0);
 739
 740     polyFN0        = _mm_mul_ps(FN6, z4);
 741     polyFN1        = _mm_mul_ps(FN5, z4);
 742     polyFN0        = _mm_add_ps(polyFN0, FN4);
 743     polyFN1        = _mm_add_ps(polyFN1, FN3);
 744     polyFN0        = _mm_mul_ps(polyFN0, z4);
 745     polyFN1        = _mm_mul_ps(polyFN1, z4);
 746     polyFN0        = _mm_add_ps(polyFN0, FN2);
 747     polyFN1        = _mm_add_ps(polyFN1, FN1);
 748     polyFN0        = _mm_mul_ps(polyFN0, z4);
 749     polyFN1        = _mm_mul_ps(polyFN1, z2);
 750     polyFN0        = _mm_add_ps(polyFN0, FN0);
 751     polyFN0        = _mm_add_ps(polyFN0, polyFN1);
 752
 753     return _mm_mul_ps(polyFN0, polyFD0);
 754 }
 755
 756
 757 /* Calculate the potential correction due to PME analytically.
 758  *
 759  * See gmx_mm256_pmecorrF_ps() for details about the approximation.
 760  *
 761  * This routine calculates Erf(z)/z, although you should provide z^2
 762  * as the input argument.
 763  *
 764  * Here's how it should be used:
 765  *
 766  * 1. Calculate r^2.
 767  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 768  * 3. Evaluate this routine with z^2 as the argument.
 769  * 4. The return value is the expression:
 770  *
 771  *
 772  *        erf(z)
 773  *       --------
 774  *          z
 775  *
 776  * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
 777  *
 778  *       erf(r*beta)
 779  *       -----------
 780  *           r
 781  *
 782  * 6. Subtract the result from 1/r, multiply by the product of the charges,
 783  *    and you have your potential.
 784  */
 785 static gmx_inline __m128
 786 gmx_mm_pmecorrV_ps(__m128 z2)
 787 {
 788     const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
 789     const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
 790     const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
 791     const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
 792     const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
 793     const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
 794     const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);
 795
 796     const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
 797     const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
 798     const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
 799     const __m128  VD0      = _mm_set1_ps(1.0f);
 800
 801     __m128        z4;
 802     __m128        polyVN0, polyVN1, polyVD0, polyVD1;
 803
 804     z4             = _mm_mul_ps(z2, z2);
 805
 806     polyVD1        = _mm_mul_ps(VD3, z4);
 807     polyVD0        = _mm_mul_ps(VD2, z4);
 808     polyVD1        = _mm_add_ps(polyVD1, VD1);
 809     polyVD0        = _mm_add_ps(polyVD0, VD0);
 810     polyVD1        = _mm_mul_ps(polyVD1, z2);
 811     polyVD0        = _mm_add_ps(polyVD0, polyVD1);
 812
 813     polyVD0        = gmx_mm_inv_ps(polyVD0);
 814
 815     polyVN0        = _mm_mul_ps(VN6, z4);
 816     polyVN1        = _mm_mul_ps(VN5, z4);
 817     polyVN0        = _mm_add_ps(polyVN0, VN4);
 818     polyVN1        = _mm_add_ps(polyVN1, VN3);
 819     polyVN0        = _mm_mul_ps(polyVN0, z4);
 820     polyVN1        = _mm_mul_ps(polyVN1, z4);
 821     polyVN0        = _mm_add_ps(polyVN0, VN2);
 822     polyVN1        = _mm_add_ps(polyVN1, VN1);
 823     polyVN0        = _mm_mul_ps(polyVN0, z4);
 824     polyVN1        = _mm_mul_ps(polyVN1, z2);
 825     polyVN0        = _mm_add_ps(polyVN0, VN0);
 826     polyVN0        = _mm_add_ps(polyVN0, polyVN1);
 827
 828     return _mm_mul_ps(polyVN0, polyVD0);
 829 }
 830
 831
 832 static int
 833 gmx_mm_sincos_ps(__m128  x,
 834                  __m128 *sinval,
 835                  __m128 *cosval)
 836 {
 837     const __m128  two_over_pi = _mm_set1_ps(2.0/M_PI);
 838     const __m128  half        = _mm_set1_ps(0.5);
 839     const __m128  one         = _mm_set1_ps(1.0);
 840
 841     const __m128i izero      = _mm_set1_epi32(0);
 842     const __m128i ione       = _mm_set1_epi32(1);
 843     const __m128i itwo       = _mm_set1_epi32(2);
 844     const __m128i ithree     = _mm_set1_epi32(3);
 845     const __m128  signbit    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 846
 847     const __m128  CA1         = _mm_set1_ps(1.5703125f);
 848     const __m128  CA2         = _mm_set1_ps(4.837512969970703125e-4f);
 849     const __m128  CA3         = _mm_set1_ps(7.54978995489188216e-8f);
 850
 851     const __m128  CC0         = _mm_set1_ps(-0.0013602249f);
 852     const __m128  CC1         = _mm_set1_ps(0.0416566950f);
 853     const __m128  CC2         = _mm_set1_ps(-0.4999990225f);
 854     const __m128  CS0         = _mm_set1_ps(-0.0001950727f);
 855     const __m128  CS1         = _mm_set1_ps(0.0083320758f);
 856     const __m128  CS2         = _mm_set1_ps(-0.1666665247f);
 857
 858     __m128        y, y2;
 859     __m128        z;
 860     __m128i       iz;
 861     __m128i       offset_sin, offset_cos;
 862     __m128        tmp1, tmp2;
 863     __m128        mask_sin, mask_cos;
 864     __m128        tmp_sin, tmp_cos;
 865
 866     y          = _mm_mul_ps(x, two_over_pi);
 867     y          = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));
 868
 869     iz         = _mm_cvttps_epi32(y);
 870     z          = _mm_cvtepi32_ps(iz);
 871
 872     offset_sin = _mm_and_si128(iz, ithree);
 873     offset_cos = _mm_add_epi32(iz, ione);
 874
 875     /* Extended precision arithmethic to achieve full precision */
 876     y               = _mm_mul_ps(z, CA1);
 877     tmp1            = _mm_mul_ps(z, CA2);
 878     tmp2            = _mm_mul_ps(z, CA3);
 879     y               = _mm_sub_ps(x, y);
 880     y               = _mm_sub_ps(y, tmp1);
 881     y               = _mm_sub_ps(y, tmp2);
 882
 883     y2              = _mm_mul_ps(y, y);
 884
 885     tmp1            = _mm_mul_ps(CC0, y2);
 886     tmp1            = _mm_add_ps(tmp1, CC1);
 887     tmp2            = _mm_mul_ps(CS0, y2);
 888     tmp2            = _mm_add_ps(tmp2, CS1);
 889     tmp1            = _mm_mul_ps(tmp1, y2);
 890     tmp1            = _mm_add_ps(tmp1, CC2);
 891     tmp2            = _mm_mul_ps(tmp2, y2);
 892     tmp2            = _mm_add_ps(tmp2, CS2);
 893
 894     tmp1            = _mm_mul_ps(tmp1, y2);
 895     tmp1            = _mm_add_ps(tmp1, one);
 896
 897     tmp2            = _mm_mul_ps(tmp2, _mm_mul_ps(y, y2));
 898     tmp2            = _mm_add_ps(tmp2, y);
 899
 900     mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
 901     mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));
 902
 903     tmp_sin         = _mm_or_ps( _mm_andnot_ps(mask_sin, tmp1), _mm_and_ps(mask_sin, tmp2) );
 904     tmp_cos         = _mm_or_ps( _mm_andnot_ps(mask_cos, tmp1), _mm_and_ps(mask_cos, tmp2) );
 905
 906     mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
 907     mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));
 908
 909     tmp1            = _mm_xor_ps(signbit, tmp_sin);
 910     tmp2            = _mm_xor_ps(signbit, tmp_cos);
 911
 912     *sinval         = _mm_or_ps( _mm_andnot_ps(mask_sin, tmp1), _mm_and_ps(mask_sin, tmp_sin) );
 913     *cosval         = _mm_or_ps( _mm_andnot_ps(mask_cos, tmp2), _mm_and_ps(mask_cos, tmp_cos) );
 914
 915     return 0;
 916 }
 917
 918 /*
 919  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 920  * will then call the sincos() routine and waste a factor 2 in performance!
 921  */
 922 static __m128
 923 gmx_mm_sin_ps(__m128 x)
 924 {
 925     __m128 s, c;
 926     gmx_mm_sincos_ps(x, &s, &c);
 927     return s;
 928 }
 929
 930 /*
 931  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 932  * will then call the sincos() routine and waste a factor 2 in performance!
 933  */
 934 static __m128
 935 gmx_mm_cos_ps(__m128 x)
 936 {
 937     __m128 s, c;
 938     gmx_mm_sincos_ps(x, &s, &c);
 939     return c;
 940 }
 941
 942
 943 static __m128
 944 gmx_mm_tan_ps(__m128 x)
 945 {
 946     __m128 sinval, cosval;
 947     __m128 tanval;
 948
 949     gmx_mm_sincos_ps(x, &sinval, &cosval);
 950
 951     tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval));
 952
 953     return tanval;
 954 }
 955
 956
 957 static __m128
 958 gmx_mm_asin_ps(__m128 x)
 959 {
 960     /* Same algorithm as cephes library */
 961     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
 962     const __m128 limitlow  = _mm_set1_ps(1e-4f);
 963     const __m128 half      = _mm_set1_ps(0.5f);
 964     const __m128 one       = _mm_set1_ps(1.0f);
 965     const __m128 halfpi    = _mm_set1_ps(M_PI/2.0f);
 966
 967     const __m128 CC5        = _mm_set1_ps(4.2163199048E-2f);
 968     const __m128 CC4        = _mm_set1_ps(2.4181311049E-2f);
 969     const __m128 CC3        = _mm_set1_ps(4.5470025998E-2f);
 970     const __m128 CC2        = _mm_set1_ps(7.4953002686E-2f);
 971     const __m128 CC1        = _mm_set1_ps(1.6666752422E-1f);
 972
 973     __m128       sign;
 974     __m128       mask;
 975     __m128       xabs;
 976     __m128       z, z1, z2, q, q1, q2;
 977     __m128       pA, pB;
 978
 979     sign  = _mm_andnot_ps(signmask, x);
 980     xabs  = _mm_and_ps(x, signmask);
 981
 982     mask  = _mm_cmpgt_ps(xabs, half);
 983
 984     z1    = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
 985     q1    = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
 986     q1    = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one), q1);
 987
 988     q2    = xabs;
 989     z2    = _mm_mul_ps(q2, q2);
 990
 991     z     = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
 992     q     = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );
 993
 994     z2    = _mm_mul_ps(z, z);
 995
 996     pA    = _mm_mul_ps(CC5, z2);
 997     pB    = _mm_mul_ps(CC4, z2);
 998
 999     pA    = _mm_add_ps(pA, CC3);
1000     pB    = _mm_add_ps(pB, CC2);
1001
1002     pA    = _mm_mul_ps(pA, z2);
1003     pB    = _mm_mul_ps(pB, z2);
1004
1005     pA    = _mm_add_ps(pA, CC1);
1006     pA    = _mm_mul_ps(pA, z);
1007
1008     z     = _mm_add_ps(pA, pB);
1009     z     = _mm_mul_ps(z, q);
1010     z     = _mm_add_ps(z, q);
1011
1012     q2    = _mm_sub_ps(halfpi, z);
1013     q2    = _mm_sub_ps(q2, z);
1014
1015     z     = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );
1016
1017     mask  = _mm_cmpgt_ps(xabs, limitlow);
1018     z     = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );
1019
1020     z = _mm_xor_ps(z, sign);
1021
1022     return z;
1023 }
1024
1025
1026 static __m128
1027 gmx_mm_acos_ps(__m128 x)
1028 {
1029     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
1030     const __m128 one_ps    = _mm_set1_ps(1.0f);
1031     const __m128 half_ps   = _mm_set1_ps(0.5f);
1032     const __m128 pi_ps     = _mm_set1_ps(M_PI);
1033     const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
1034
1035     __m128       mask1;
1036     __m128       mask2;
1037     __m128       xabs;
1038     __m128       z, z1, z2, z3;
1039
1040     xabs  = _mm_and_ps(x, signmask);
1041     mask1 = _mm_cmpgt_ps(xabs, half_ps);
1042     mask2 = _mm_cmpgt_ps(x, _mm_setzero_ps());
1043
1044     z     = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
1045     z     = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
1046     z     = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one_ps), z);
1047
1048     z     = _mm_or_ps( _mm_and_ps(mask1, z), _mm_andnot_ps(mask1, x) );
1049     z     = gmx_mm_asin_ps(z);
1050
1051     z2    = _mm_add_ps(z, z);
1052     z1    = _mm_sub_ps(pi_ps, z2);
1053     z3    = _mm_sub_ps(halfpi_ps, z);
1054
1055     z     = _mm_or_ps( _mm_and_ps(mask2, z2), _mm_andnot_ps(mask2, z1) );
1056     z     = _mm_or_ps( _mm_and_ps(mask1, z), _mm_andnot_ps(mask1, z3) );
1057
1058     return z;
1059 }
1060
1061
1062 static __m128
1063 gmx_mm_atan_ps(__m128 x)
1064 {
1065     /* Same algorithm as cephes library */
1066     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
1067     const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
1068     const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
1069     const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
1070     const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
1071     const __m128 mone      = _mm_set1_ps(-1.0f);
1072     const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
1073     const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
1074     const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1);
1075     const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);
1076
1077     __m128       sign;
1078     __m128       mask1, mask2;
1079     __m128       y, z1, z2;
1080     __m128       x2, x4;
1081     __m128       sum1, sum2;
1082
1083     sign  = _mm_andnot_ps(signmask, x);
1084     x     = _mm_and_ps(x, signmask);
1085
1086     mask1 = _mm_cmpgt_ps(x, limit1);
1087     mask2 = _mm_cmpgt_ps(x, limit2);
1088
1089     z1    = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
1090     z2    = _mm_mul_ps(mone, gmx_mm_inv_ps(x));
1091
1092     y     = _mm_and_ps(mask1, quarterpi);
1093     y     = _mm_or_ps( _mm_and_ps(mask2, halfpi), _mm_andnot_ps(mask2, y) );
1094
1095     x     = _mm_or_ps( _mm_and_ps(mask1, z1), _mm_andnot_ps(mask1, x) );
1096     x     = _mm_or_ps( _mm_and_ps(mask2, z2), _mm_andnot_ps(mask2, x) );
1097
1098     x2    = _mm_mul_ps(x, x);
1099     x4    = _mm_mul_ps(x2, x2);
1100
1101     sum1  = _mm_mul_ps(CC9, x4);
1102     sum2  = _mm_mul_ps(CC7, x4);
1103     sum1  = _mm_add_ps(sum1, CC5);
1104     sum2  = _mm_add_ps(sum2, CC3);
1105     sum1  = _mm_mul_ps(sum1, x4);
1106     sum2  = _mm_mul_ps(sum2, x2);
1107
1108     sum1  = _mm_add_ps(sum1, sum2);
1109     sum1  = _mm_sub_ps(sum1, mone);
1110     sum1  = _mm_mul_ps(sum1, x);
1111     y     = _mm_add_ps(y, sum1);
1112
1113     y     = _mm_xor_ps(y, sign);
1114
1115     return y;
1116 }
1117
1118
1119 static __m128
1120 gmx_mm_atan2_ps(__m128 y, __m128 x)
1121 {
1122     const __m128 pi          = _mm_set1_ps(M_PI);
1123     const __m128 minuspi     = _mm_set1_ps(-M_PI);
1124     const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
1125     const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
1126
1127     __m128       z, z1, z3, z4;
1128     __m128       w;
1129     __m128       maskx_lt, maskx_eq;
1130     __m128       masky_lt, masky_eq;
1131     __m128       mask1, mask2, mask3, mask4, maskall;
1132
1133     maskx_lt  = _mm_cmplt_ps(x, _mm_setzero_ps());
1134     masky_lt  = _mm_cmplt_ps(y, _mm_setzero_ps());
1135     maskx_eq  = _mm_cmpeq_ps(x, _mm_setzero_ps());
1136     masky_eq  = _mm_cmpeq_ps(y, _mm_setzero_ps());
1137
1138     z         = _mm_mul_ps(y, gmx_mm_inv_ps(x));
1139     z         = gmx_mm_atan_ps(z);
1140
1141     mask1     = _mm_and_ps(maskx_eq, masky_lt);
1142     mask2     = _mm_andnot_ps(maskx_lt, masky_eq);
1143     mask3     = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
1144     mask4     = _mm_and_ps(masky_eq, maskx_lt);
1145
1146     maskall   = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );
1147
1148     z         = _mm_andnot_ps(maskall, z);
1149     z1        = _mm_and_ps(mask1, minushalfpi);
1150     z3        = _mm_and_ps(mask3, halfpi);
1151     z4        = _mm_and_ps(mask4, pi);
1152
1153     z         = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );
1154
1155     mask1     = _mm_andnot_ps(masky_lt, maskx_lt);
1156     mask2     = _mm_and_ps(maskx_lt, masky_lt);
1157
1158     w         = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
1159     w         = _mm_andnot_ps(maskall, w);
1160
1161     z         = _mm_add_ps(z, w);
1162
1163     return z;
1164 }
1165
1166
1167 #endif /* _gmx_math_x86_sse2_single_h_ */