Merge "Simple patch for openmm_wrapper.cpp" into release-4-6
[alexxy/gromacs.git] / include / gmx_math_x86_sse4_1_single.h
/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *
 * This file is part of GROMACS.
 * Copyright (c) 2012-
 *
 * Written by the Gromacs development team under coordination of
 * David van der Spoel, Berk Hess, and Erik Lindahl.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org
 *
 * And Hey:
 * Gnomes, ROck Monsters And Chili Sauce
 */
#ifndef _gmx_math_x86_sse4_1_single_h_
#define _gmx_math_x86_sse4_1_single_h_

#include <stdio.h>
#include <math.h>

#include "gmx_x86_sse4.h"



#ifndef M_PI
#  define M_PI 3.14159265358979323846264338327950288
#endif




/************************
 *                      *
 * Simple math routines *
 *                      *
 ************************/

/* 1.0/sqrt(x) */
static gmx_inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half  = _mm_set_ps(0.5,0.5,0.5,0.5);
    const __m128 three = _mm_set_ps(3.0,3.0,3.0,3.0);

    __m128 lu = _mm_rsqrt_ps(x);

    return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
}
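/* gmx_mm_invsqrt_ps() refines the roughly 12-bit accurate hardware rsqrt
 * estimate with one Newton-Raphson iteration, lu' = 0.5*lu*(3 - x*lu*lu),
 * which approximately doubles the number of correct bits.
 */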

/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
static gmx_inline __m128
gmx_mm_sqrt_ps(__m128 x)
{
    __m128 mask;
    __m128 res;

    mask = _mm_cmpeq_ps(x,_mm_setzero_ps());
    res  = _mm_andnot_ps(mask,gmx_mm_invsqrt_ps(x));

    res  = _mm_mul_ps(x,res);

    return res;
}
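/* The mask in gmx_mm_sqrt_ps() zeroes elements where x==0: invsqrt(0) would be
 * Inf, and 0*Inf would otherwise give NaN instead of the expected sqrt(0)=0.
 */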

/* 1.0/x */
static gmx_inline __m128
gmx_mm_inv_ps(__m128 x)
{
    const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);

    __m128 lu = _mm_rcp_ps(x);

    return _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,x)));
}
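/* As for invsqrt, gmx_mm_inv_ps() refines the low-precision hardware rcp
 * estimate with one Newton-Raphson iteration, lu' = lu*(2 - x*lu).
 */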

static gmx_inline __m128
gmx_mm_abs_ps(__m128 x)
{
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );

    return _mm_and_ps(x,signmask);
}



static __m128
gmx_mm_log_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
    const __m128  half       = _mm_set1_ps(0.5f);
    const __m128  one        = _mm_set1_ps(1.0f);
    const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
    const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
    const __m128  corr2      = _mm_set1_ps(0.693359375f);

    const __m128 CA_1        = _mm_set1_ps(0.070376836292f);
    const __m128 CB_0        = _mm_set1_ps(1.6714950086782716f);
    const __m128 CB_1        = _mm_set1_ps(-2.452088066061482f);
    const __m128 CC_0        = _mm_set1_ps(1.5220770854701728f);
    const __m128 CC_1        = _mm_set1_ps(-1.3422238433233642f);
    const __m128 CD_0        = _mm_set1_ps(1.386218787509749f);
    const __m128 CD_1        = _mm_set1_ps(0.35075468953796346f);
    const __m128 CE_0        = _mm_set1_ps(1.3429983063133937f);
    const __m128 CE_1        = _mm_set1_ps(1.807420826584643f);

    __m128  fexp,fexp1;
    __m128i iexp;
    __m128  mask;
    __m128  x1,x2;
    __m128  y;
    __m128  pA,pB,pC,pD,pE,tB,tC,tD,tE;

    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
    fexp  = _mm_and_ps(x,expmask);
    iexp  = gmx_mm_castps_si128(fexp);
    iexp  = _mm_srli_epi32(iexp,23);
    iexp  = _mm_sub_epi32(iexp,expbase_m1);

    x     = _mm_andnot_ps(expmask,x);
    x     = _mm_or_ps(x,one);
    x     = _mm_mul_ps(x,half);

    mask  = _mm_cmplt_ps(x,invsq2);

    x     = _mm_add_ps(x,_mm_and_ps(mask,x));
    x     = _mm_sub_ps(x,one);
    iexp  = _mm_add_epi32(iexp,gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */

    x2    = _mm_mul_ps(x,x);

    pA    = _mm_mul_ps(CA_1,x);
    pB    = _mm_mul_ps(CB_1,x);
    pC    = _mm_mul_ps(CC_1,x);
    pD    = _mm_mul_ps(CD_1,x);
    pE    = _mm_mul_ps(CE_1,x);
    tB    = _mm_add_ps(CB_0,x2);
    tC    = _mm_add_ps(CC_0,x2);
    tD    = _mm_add_ps(CD_0,x2);
    tE    = _mm_add_ps(CE_0,x2);
    pB    = _mm_add_ps(pB,tB);
    pC    = _mm_add_ps(pC,tC);
    pD    = _mm_add_ps(pD,tD);
    pE    = _mm_add_ps(pE,tE);

    pA    = _mm_mul_ps(pA,pB);
    pC    = _mm_mul_ps(pC,pD);
    pE    = _mm_mul_ps(pE,x2);
    pA    = _mm_mul_ps(pA,pC);
    y     = _mm_mul_ps(pA,pE);

    fexp  = _mm_cvtepi32_ps(iexp);
    y     = _mm_add_ps(y,_mm_mul_ps(fexp,corr1));

    y     = _mm_sub_ps(y, _mm_mul_ps(half,x2));
    x2    = _mm_add_ps(x,y);

    x2    = _mm_add_ps(x2,_mm_mul_ps(fexp,corr2));

    return x2;
}
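/* Note on gmx_mm_log_ps(): the exponent contribution e*ln(2) is added in two
 * pieces, corr2 + corr1 = 0.693359375 - 2.12194440e-4 = ln(2), so that the
 * large piece is exact in single precision and the small remainder restores
 * the bits lost to rounding.
 */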


/*
 * 2^x function.
 *
 * The 2^w term is calculated from a (6,0)-order (no denominator) minimax polynomial on the
 * interval [-0.5,0.5]. The coefficients were derived in Mathematica using the command:
 *
 * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 *
 * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 * number instead. That would take a few extra cycles and not really help, since something is
 * wrong if you are using single precision to work with numbers that cannot really be represented
 * in single precision.
 *
 * The accuracy is at least 23 bits.
 */
static __m128
gmx_mm_exp2_ps(__m128 x)
{
    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
    const __m128  arglimit = _mm_set1_ps(126.0f);

    const __m128i expbase  = _mm_set1_epi32(127);
    const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
    const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
    const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
    const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
    const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
    const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
    const __m128  CA0      = _mm_set1_ps(1.0f);

    __m128  valuemask;
    __m128i iexppart;
    __m128  fexppart;
    __m128  intpart;
    __m128  x2;
    __m128  p0,p1;

    iexppart  = _mm_cvtps_epi32(x);
    intpart   = _mm_round_ps(x,_MM_FROUND_TO_NEAREST_INT);
    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
    valuemask = _mm_cmpge_ps(arglimit,gmx_mm_abs_ps(x));
    fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));

    x         = _mm_sub_ps(x,intpart);
    x2        = _mm_mul_ps(x,x);

    p0        = _mm_mul_ps(CA6,x2);
    p1        = _mm_mul_ps(CA5,x2);
    p0        = _mm_add_ps(p0,CA4);
    p1        = _mm_add_ps(p1,CA3);
    p0        = _mm_mul_ps(p0,x2);
    p1        = _mm_mul_ps(p1,x2);
    p0        = _mm_add_ps(p0,CA2);
    p1        = _mm_add_ps(p1,CA1);
    p0        = _mm_mul_ps(p0,x2);
    p1        = _mm_mul_ps(p1,x);
    p0        = _mm_add_ps(p0,CA0);
    p0        = _mm_add_ps(p0,p1);
    x         = _mm_mul_ps(p0,fexppart);

    return  x;
}
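/* Note on gmx_mm_exp2_ps(): the argument is split as x = n + w, with n the
 * nearest integer and w in [-0.5,0.5]. 2^n is assembled directly in the IEEE
 * exponent field ((n+127)<<23), the polynomial above evaluates 2^w, and the
 * two factors are multiplied. Arguments with |x| > 126 yield a zero result.
 */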


/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
 * but there will then be a small rounding error since we lose some precision due to the
 * multiplication. This will then be magnified a lot by the exponential.
 *
 * Instead, we calculate the fractional part directly as a minimax approximation of
 * Exp(z) on [-0.5,0.5]. We use extended precision arithmetic to calculate the fraction
 * remaining after 2^y, which avoids the precision loss.
 * The final result is correct to within 1 LSB over the entire argument range.
 */
static __m128
gmx_mm_exp_ps(__m128 x)
{
    const __m128  argscale      = _mm_set1_ps(1.44269504088896341f);
    /* Bound: disallow arguments that would lead to an IEEE fp exponent reaching +-127. */
    const __m128  arglimit      = _mm_set1_ps(126.0f);
    const __m128i expbase       = _mm_set1_epi32(127);

    const __m128  invargscale0  = _mm_set1_ps(0.693359375f);
    const __m128  invargscale1  = _mm_set1_ps(-2.12194440e-4f);

    const __m128  CC5           = _mm_set1_ps(1.9875691500e-4f);
    const __m128  CC4           = _mm_set1_ps(1.3981999507e-3f);
    const __m128  CC3           = _mm_set1_ps(8.3334519073e-3f);
    const __m128  CC2           = _mm_set1_ps(4.1665795894e-2f);
    const __m128  CC1           = _mm_set1_ps(1.6666665459e-1f);
    const __m128  CC0           = _mm_set1_ps(5.0000001201e-1f);
    const __m128  one           = _mm_set1_ps(1.0f);

    __m128  y,x2;
    __m128  p0,p1;
    __m128  valuemask;
    __m128i iexppart;
    __m128  fexppart;
    __m128  intpart;

    y = _mm_mul_ps(x,argscale);

    iexppart  = _mm_cvtps_epi32(y);
    intpart   = _mm_round_ps(y,_MM_FROUND_TO_NEAREST_INT);

    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
    valuemask = _mm_cmpge_ps(arglimit,gmx_mm_abs_ps(y));
    fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));

    /* Extended precision arithmetic */
    x         = _mm_sub_ps(x,_mm_mul_ps(invargscale0,intpart));
    x         = _mm_sub_ps(x,_mm_mul_ps(invargscale1,intpart));

    x2        = _mm_mul_ps(x,x);

    p1        = _mm_mul_ps(CC5,x2);
    p0        = _mm_mul_ps(CC4,x2);
    p1        = _mm_add_ps(p1,CC3);
    p0        = _mm_add_ps(p0,CC2);
    p1        = _mm_mul_ps(p1,x2);
    p0        = _mm_mul_ps(p0,x2);
    p1        = _mm_add_ps(p1,CC1);
    p0        = _mm_add_ps(p0,CC0);
    p1        = _mm_mul_ps(p1,x);
    p0        = _mm_add_ps(p0,p1);
    p0        = _mm_mul_ps(p0,x2);
    x         = _mm_add_ps(x,one);
    x         = _mm_add_ps(x,p0);

    x         = _mm_mul_ps(x,fexppart);

    return x;
}
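/* Minimal usage sketch (illustrative only, not part of the library API; the
 * function and array names below are made up): evaluate exp() for four packed
 * single-precision values loaded from memory.
 */
static gmx_inline void
gmx_mm_exp_ps_usage_example(const float in[4], float out[4])
{
    __m128 x = _mm_loadu_ps(in);           /* load four unaligned floats */
    _mm_storeu_ps(out, gmx_mm_exp_ps(x));  /* out[i] = exp(in[i])        */
}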

/* FULL precision. Only errors in LSB */
static __m128
gmx_mm_erf_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);

    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128  one      = _mm_set1_ps(1.0f);
    const __m128  two      = _mm_set1_ps(2.0f);

    __m128 x2,x4,y;
    __m128 z,q,t,t2,w,w2;
    __m128 pA0,pA1,pB0,pB1,pC0,pC1;
    __m128 expmx2,corr;
    __m128 res_erf,res_erfc,res;
    __m128 mask;

    /* Calculate erf() */
    x2     = _mm_mul_ps(x,x);
    x4     = _mm_mul_ps(x2,x2);

    pA0  = _mm_mul_ps(CA6,x4);
    pA1  = _mm_mul_ps(CA5,x4);
    pA0  = _mm_add_ps(pA0,CA4);
    pA1  = _mm_add_ps(pA1,CA3);
    pA0  = _mm_mul_ps(pA0,x4);
    pA1  = _mm_mul_ps(pA1,x4);
    pA0  = _mm_add_ps(pA0,CA2);
    pA1  = _mm_add_ps(pA1,CA1);
    pA0  = _mm_mul_ps(pA0,x4);
    pA1  = _mm_mul_ps(pA1,x2);
    pA0  = _mm_add_ps(pA0,pA1);
    pA0  = _mm_add_ps(pA0,CA0);

    res_erf = _mm_mul_ps(x,pA0);

    /* Calculate erfc */

    y       = gmx_mm_abs_ps(x);
    t       = gmx_mm_inv_ps(y);
    w       = _mm_sub_ps(t,one);
    t2      = _mm_mul_ps(t,t);
    w2      = _mm_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z       = _mm_and_ps(y,sieve);
    q       = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );

    corr    = _mm_mul_ps(CD4,q);
    corr    = _mm_add_ps(corr,CD3);
    corr    = _mm_mul_ps(corr,q);
    corr    = _mm_add_ps(corr,CD2);
    corr    = _mm_mul_ps(corr,q);
    corr    = _mm_add_ps(corr,one);
    corr    = _mm_mul_ps(corr,q);
    corr    = _mm_add_ps(corr,one);

    expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
    expmx2  = _mm_mul_ps(expmx2,corr);

    pB1  = _mm_mul_ps(CB9,w2);
    pB0  = _mm_mul_ps(CB8,w2);
    pB1  = _mm_add_ps(pB1,CB7);
    pB0  = _mm_add_ps(pB0,CB6);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB5);
    pB0  = _mm_add_ps(pB0,CB4);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB3);
    pB0  = _mm_add_ps(pB0,CB2);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB1);
    pB1  = _mm_mul_ps(pB1,w);
    pB0  = _mm_add_ps(pB0,pB1);
    pB0  = _mm_add_ps(pB0,CB0);

    pC0  = _mm_mul_ps(CC10,t2);
    pC1  = _mm_mul_ps(CC9,t2);
    pC0  = _mm_add_ps(pC0,CC8);
    pC1  = _mm_add_ps(pC1,CC7);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC6);
    pC1  = _mm_add_ps(pC1,CC5);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC4);
    pC1  = _mm_add_ps(pC1,CC3);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC2);
    pC1  = _mm_add_ps(pC1,CC1);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t);
    pC0  = _mm_add_ps(pC0,pC1);
    pC0  = _mm_add_ps(pC0,CC0);
    pC0  = _mm_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask = _mm_cmplt_ps(two,y);
    res_erfc = _mm_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask = _mm_cmplt_ps(x,_mm_setzero_ps());
    res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y,_mm_set1_ps(0.75f));
    res  = _mm_blendv_ps(_mm_sub_ps(one,res_erfc),res_erf,mask);

    return res;
}
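/* Note on the final selection in gmx_mm_erf_ps(): for |x| < 0.75 the direct
 * erf() polynomial is returned; otherwise the result is formed as 1-erfc(|x|)
 * from whichever erfc branch matched, which is why the accuracy of the
 * exp(-x^2) evaluation above matters.
 */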


/* FULL precision. Only errors in LSB */
static __m128
gmx_mm_erfc_ps(__m128 x)
{
    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);

    /* Coefficients for expansion of exp(x) in [0,0.1] */
    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);

    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    const __m128  one      = _mm_set1_ps(1.0f);
    const __m128  two      = _mm_set1_ps(2.0f);

    __m128 x2,x4,y;
    __m128 z,q,t,t2,w,w2;
    __m128 pA0,pA1,pB0,pB1,pC0,pC1;
    __m128 expmx2,corr;
    __m128 res_erf,res_erfc,res;
    __m128 mask;

    /* Calculate erf() */
    x2     = _mm_mul_ps(x,x);
    x4     = _mm_mul_ps(x2,x2);

    pA0  = _mm_mul_ps(CA6,x4);
    pA1  = _mm_mul_ps(CA5,x4);
    pA0  = _mm_add_ps(pA0,CA4);
    pA1  = _mm_add_ps(pA1,CA3);
    pA0  = _mm_mul_ps(pA0,x4);
    pA1  = _mm_mul_ps(pA1,x4);
    pA0  = _mm_add_ps(pA0,CA2);
    pA1  = _mm_add_ps(pA1,CA1);
    pA0  = _mm_mul_ps(pA0,x4);
    pA1  = _mm_mul_ps(pA1,x2);
    pA0  = _mm_add_ps(pA0,pA1);
    pA0  = _mm_add_ps(pA0,CA0);

    res_erf = _mm_mul_ps(x,pA0);

    /* Calculate erfc */
    y       = gmx_mm_abs_ps(x);
    t       = gmx_mm_inv_ps(y);
    w       = _mm_sub_ps(t,one);
    t2      = _mm_mul_ps(t,t);
    w2      = _mm_mul_ps(w,w);
    /*
     * We cannot simply calculate exp(-x2) directly in single precision, since
     * that will lose a couple of bits of precision due to the multiplication.
     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
     *
     * The only drawback with this is that it requires TWO separate exponential
     * evaluations, which would be horrible performance-wise. However, the argument
     * for the second exp() call is always small, so there we simply use a
     * low-order minimax expansion on [0,0.1].
     */

    z       = _mm_and_ps(y,sieve);
    q       = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );

    corr    = _mm_mul_ps(CD4,q);
    corr    = _mm_add_ps(corr,CD3);
    corr    = _mm_mul_ps(corr,q);
    corr    = _mm_add_ps(corr,CD2);
    corr    = _mm_mul_ps(corr,q);
    corr    = _mm_add_ps(corr,one);
    corr    = _mm_mul_ps(corr,q);
    corr    = _mm_add_ps(corr,one);

    expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
    expmx2  = _mm_mul_ps(expmx2,corr);

    pB1  = _mm_mul_ps(CB9,w2);
    pB0  = _mm_mul_ps(CB8,w2);
    pB1  = _mm_add_ps(pB1,CB7);
    pB0  = _mm_add_ps(pB0,CB6);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB5);
    pB0  = _mm_add_ps(pB0,CB4);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB3);
    pB0  = _mm_add_ps(pB0,CB2);
    pB1  = _mm_mul_ps(pB1,w2);
    pB0  = _mm_mul_ps(pB0,w2);
    pB1  = _mm_add_ps(pB1,CB1);
    pB1  = _mm_mul_ps(pB1,w);
    pB0  = _mm_add_ps(pB0,pB1);
    pB0  = _mm_add_ps(pB0,CB0);

    pC0  = _mm_mul_ps(CC10,t2);
    pC1  = _mm_mul_ps(CC9,t2);
    pC0  = _mm_add_ps(pC0,CC8);
    pC1  = _mm_add_ps(pC1,CC7);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC6);
    pC1  = _mm_add_ps(pC1,CC5);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC4);
    pC1  = _mm_add_ps(pC1,CC3);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t2);
    pC0  = _mm_add_ps(pC0,CC2);
    pC1  = _mm_add_ps(pC1,CC1);
    pC0  = _mm_mul_ps(pC0,t2);
    pC1  = _mm_mul_ps(pC1,t);
    pC0  = _mm_add_ps(pC0,pC1);
    pC0  = _mm_add_ps(pC0,CC0);
    pC0  = _mm_mul_ps(pC0,t);

    /* SELECT pB0 or pC0 for erfc() */
    mask = _mm_cmplt_ps(two,y);
    res_erfc = _mm_blendv_ps(pB0,pC0,mask);
    res_erfc = _mm_mul_ps(res_erfc,expmx2);

    /* erfc(x<0) = 2-erfc(|x|) */
    mask = _mm_cmplt_ps(x,_mm_setzero_ps());
    res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);

    /* Select erf() or erfc() */
    mask = _mm_cmplt_ps(y,_mm_set1_ps(0.75f));
    res  = _mm_blendv_ps(res_erfc,_mm_sub_ps(one,res_erf),mask);

    return res;
}


/* Calculate the force correction due to PME analytically.
 *
 * This routine is meant to enable analytical evaluation of the
 * direct-space PME electrostatic force to avoid tables.
 *
 * The direct-space potential should be Erfc(beta*r)/r, but there
 * are some problems evaluating that:
 *
 * First, the error function is difficult (read: expensive) to
 * approximate accurately for intermediate to large arguments, and
 * this happens already in ranges of beta*r that occur in simulations.
 * Second, we now try to avoid calculating potentials in Gromacs but
 * use forces directly.
 *
 * We can simplify things slightly by noting that the PME part is really
 * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
 *
 * V= 1/r - Erf(beta*r)/r
 *
 * The first term we already have from the inverse square root, so
 * that we can leave out of this routine.
 *
 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
 * the argument beta*r will be in the range 0.15 to ~4. Use your
 * favorite plotting program to realize how well-behaved Erf(z)/z is
 * in this range!
 *
 * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
 * However, it turns out it is more efficient to approximate f(z)/z and
 * then only use even powers. This is another minor optimization, since
 * we actually WANT f(z)/z, because it is going to be multiplied by
 * the vector between the two atoms to get the vectorial force. The
 * fastest flops are the ones we can avoid calculating!
 *
 * So, here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *
 *       2*exp(-z^2)     erf(z)
 *       ------------ - --------
 *       sqrt(Pi)*z^2      z^3
 *
 * 5. Multiply the entire expression by beta^3. This will get you
 *
 *       beta^3*2*exp(-z^2)     beta^3*erf(z)
 *       ------------------  - ---------------
 *          sqrt(Pi)*z^2            z^3
 *
 *    or, switching back to r (z=r*beta):
 *
 *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
 *       ----------------------- - -----------
 *            sqrt(Pi)*r^2            r^3
 *
 *
 *    With a bit of math exercise you should be able to confirm that
 *    this is exactly D[Erf[beta*r]/r,r] divided by r once more.
 *
 * 6. Add the result to 1/r^3, multiply by the product of the charges,
 *    and you have your force (divided by r). A final multiplication
 *    with the vector connecting the two particles and you have your
 *    vectorial force to add to the particles.
 *
 * An illustrative sketch of these steps follows the routine below.
 */
static gmx_inline __m128
gmx_mm_pmecorrF_ps(__m128 z2)
{
    const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
    const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
    const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
    const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
    const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
    const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
    const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);

    const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
    const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
    const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
    const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
    const __m128  FD0      = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyFN0,polyFN1,polyFD0,polyFD1;

    z4             = _mm_mul_ps(z2,z2);

    polyFD0        = _mm_mul_ps(FD4,z4);
    polyFD1        = _mm_mul_ps(FD3,z4);
    polyFD0        = _mm_add_ps(polyFD0,FD2);
    polyFD1        = _mm_add_ps(polyFD1,FD1);
    polyFD0        = _mm_mul_ps(polyFD0,z4);
    polyFD1        = _mm_mul_ps(polyFD1,z2);
    polyFD0        = _mm_add_ps(polyFD0,FD0);
    polyFD0        = _mm_add_ps(polyFD0,polyFD1);

    polyFD0        = gmx_mm_inv_ps(polyFD0);

    polyFN0        = _mm_mul_ps(FN6,z4);
    polyFN1        = _mm_mul_ps(FN5,z4);
    polyFN0        = _mm_add_ps(polyFN0,FN4);
    polyFN1        = _mm_add_ps(polyFN1,FN3);
    polyFN0        = _mm_mul_ps(polyFN0,z4);
    polyFN1        = _mm_mul_ps(polyFN1,z4);
    polyFN0        = _mm_add_ps(polyFN0,FN2);
    polyFN1        = _mm_add_ps(polyFN1,FN1);
    polyFN0        = _mm_mul_ps(polyFN0,z4);
    polyFN1        = _mm_mul_ps(polyFN1,z2);
    polyFN0        = _mm_add_ps(polyFN0,FN0);
    polyFN0        = _mm_add_ps(polyFN0,polyFN1);

    return   _mm_mul_ps(polyFN0,polyFD0);
}
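/* Illustrative sketch of steps 1-6 above (not part of the library API; rsq,
 * beta and qq are hypothetical caller-side quantities: the squared distance,
 * the Ewald/PME beta parameter and the product of the charges).
 */
static gmx_inline __m128
gmx_mm_pmecorrF_ps_usage_example(__m128 rsq, __m128 beta, __m128 qq)
{
    __m128 beta2 = _mm_mul_ps(beta,beta);
    __m128 beta3 = _mm_mul_ps(beta2,beta);
    __m128 z2    = _mm_mul_ps(beta2,rsq);                      /* steps 1-2: z^2 = beta^2*r^2 */
    __m128 rinv  = gmx_mm_invsqrt_ps(rsq);                     /* 1/r                         */
    __m128 rinv3 = _mm_mul_ps(rinv,_mm_mul_ps(rinv,rinv));     /* 1/r^3                       */
    __m128 corr  = _mm_mul_ps(beta3,gmx_mm_pmecorrF_ps(z2));   /* steps 3-5                   */

    /* step 6: scalar force divided by r; multiply by the vector connecting the
     * two particles to get the vectorial force.
     */
    return _mm_mul_ps(qq,_mm_add_ps(rinv3,corr));
}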


/* Calculate the potential correction due to PME analytically.
 *
 * See gmx_mm_pmecorrF_ps() above for details about the approximation.
 *
 * This routine calculates Erf(z)/z, although you should provide z^2
 * as the input argument.
 *
 * Here's how it should be used:
 *
 * 1. Calculate r^2.
 * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 * 3. Evaluate this routine with z^2 as the argument.
 * 4. The return value is the expression:
 *
 *
 *        erf(z)
 *       --------
 *          z
 *
 * 5. Multiply the entire expression by beta; switching back to r (z=r*beta) this is
 *
 *       erf(r*beta)
 *       -----------
 *           r
 *
 * 6. Subtract the result from 1/r, multiply by the product of the charges,
 *    and you have your potential.
 *
 * An illustrative sketch of these steps follows the routine below.
 */
static gmx_inline __m128
gmx_mm_pmecorrV_ps(__m128 z2)
{
    const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
    const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
    const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
    const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
    const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
    const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
    const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);

    const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
    const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
    const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
    const __m128  VD0      = _mm_set1_ps(1.0f);

    __m128 z4;
    __m128 polyVN0,polyVN1,polyVD0,polyVD1;

    z4             = _mm_mul_ps(z2,z2);

    polyVD1        = _mm_mul_ps(VD3,z4);
    polyVD0        = _mm_mul_ps(VD2,z4);
    polyVD1        = _mm_add_ps(polyVD1,VD1);
    polyVD0        = _mm_add_ps(polyVD0,VD0);
    polyVD1        = _mm_mul_ps(polyVD1,z2);
    polyVD0        = _mm_add_ps(polyVD0,polyVD1);

    polyVD0        = gmx_mm_inv_ps(polyVD0);

    polyVN0        = _mm_mul_ps(VN6,z4);
    polyVN1        = _mm_mul_ps(VN5,z4);
    polyVN0        = _mm_add_ps(polyVN0,VN4);
    polyVN1        = _mm_add_ps(polyVN1,VN3);
    polyVN0        = _mm_mul_ps(polyVN0,z4);
    polyVN1        = _mm_mul_ps(polyVN1,z4);
    polyVN0        = _mm_add_ps(polyVN0,VN2);
    polyVN1        = _mm_add_ps(polyVN1,VN1);
    polyVN0        = _mm_mul_ps(polyVN0,z4);
    polyVN1        = _mm_mul_ps(polyVN1,z2);
    polyVN0        = _mm_add_ps(polyVN0,VN0);
    polyVN0        = _mm_add_ps(polyVN0,polyVN1);

    return   _mm_mul_ps(polyVN0,polyVD0);
}
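/* Illustrative sketch of steps 1-6 above (not part of the library API; rsq,
 * beta and qq are hypothetical caller-side quantities, as in the force sketch).
 */
static gmx_inline __m128
gmx_mm_pmecorrV_ps_usage_example(__m128 rsq, __m128 beta, __m128 qq)
{
    __m128 z2   = _mm_mul_ps(_mm_mul_ps(beta,beta),rsq);       /* steps 1-2: z^2 = beta^2*r^2 */
    __m128 rinv = gmx_mm_invsqrt_ps(rsq);                      /* 1/r                         */
    __m128 corr = _mm_mul_ps(beta,gmx_mm_pmecorrV_ps(z2));     /* steps 3-5: erf(beta*r)/r    */

    /* step 6: direct-space potential qq*erfc(beta*r)/r = qq*(1/r - erf(beta*r)/r) */
    return _mm_mul_ps(qq,_mm_sub_ps(rinv,corr));
}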


static int
gmx_mm_sincos_ps(__m128 x,
                 __m128 *sinval,
                 __m128 *cosval)
{
    const __m128 two_over_pi = _mm_set1_ps(2.0/M_PI);
    const __m128 half        = _mm_set1_ps(0.5);
    const __m128 one         = _mm_set1_ps(1.0);

    const __m128i izero      = _mm_set1_epi32(0);
    const __m128i ione       = _mm_set1_epi32(1);
    const __m128i itwo       = _mm_set1_epi32(2);
    const __m128i ithree     = _mm_set1_epi32(3);
    const __m128  signbit    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );

    const __m128 CA1         = _mm_set1_ps(1.5703125f);
    const __m128 CA2         = _mm_set1_ps(4.837512969970703125e-4f);
    const __m128 CA3         = _mm_set1_ps(7.54978995489188216e-8f);

    const __m128 CC0         = _mm_set1_ps(-0.0013602249f);
    const __m128 CC1         = _mm_set1_ps(0.0416566950f);
    const __m128 CC2         = _mm_set1_ps(-0.4999990225f);
    const __m128 CS0         = _mm_set1_ps(-0.0001950727f);
    const __m128 CS1         = _mm_set1_ps(0.0083320758f);
    const __m128 CS2         = _mm_set1_ps(-0.1666665247f);

    __m128  y,y2;
    __m128  z;
    __m128i iz;
    __m128i offset_sin,offset_cos;
    __m128  tmp1,tmp2;
    __m128  mask_sin,mask_cos;
    __m128  tmp_sin,tmp_cos;

    y          = _mm_mul_ps(x,two_over_pi);
    y          = _mm_add_ps(y,_mm_or_ps(_mm_and_ps(y,signbit),half));

    iz         = _mm_cvttps_epi32(y);
    z          = _mm_round_ps(y,_MM_FROUND_TO_ZERO);

    offset_sin = _mm_and_si128(iz,ithree);
    offset_cos = _mm_add_epi32(iz,ione);

    /* Extended precision arithmetic to achieve full precision */
    y               = _mm_mul_ps(z,CA1);
    tmp1            = _mm_mul_ps(z,CA2);
    tmp2            = _mm_mul_ps(z,CA3);
    y               = _mm_sub_ps(x,y);
    y               = _mm_sub_ps(y,tmp1);
    y               = _mm_sub_ps(y,tmp2);

    y2              = _mm_mul_ps(y,y);

    tmp1            = _mm_mul_ps(CC0,y2);
    tmp1            = _mm_add_ps(tmp1,CC1);
    tmp2            = _mm_mul_ps(CS0,y2);
    tmp2            = _mm_add_ps(tmp2,CS1);
    tmp1            = _mm_mul_ps(tmp1,y2);
    tmp1            = _mm_add_ps(tmp1,CC2);
    tmp2            = _mm_mul_ps(tmp2,y2);
    tmp2            = _mm_add_ps(tmp2,CS2);

    tmp1            = _mm_mul_ps(tmp1,y2);
    tmp1            = _mm_add_ps(tmp1,one);

    tmp2            = _mm_mul_ps(tmp2,_mm_mul_ps(y,y2));
    tmp2            = _mm_add_ps(tmp2,y);

    mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,ione), izero));
    mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,ione), izero));

    tmp_sin         = _mm_blendv_ps(tmp1,tmp2,mask_sin);
    tmp_cos         = _mm_blendv_ps(tmp1,tmp2,mask_cos);

    mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,itwo), izero));
    mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,itwo), izero));

    tmp1            = _mm_xor_ps(signbit,tmp_sin);
    tmp2            = _mm_xor_ps(signbit,tmp_cos);

    *sinval         = _mm_blendv_ps(tmp1,tmp_sin,mask_sin);
    *cosval         = _mm_blendv_ps(tmp2,tmp_cos,mask_cos);

    return 0;
}

/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static __m128
gmx_mm_sin_ps(__m128 x)
{
    __m128 s,c;
    gmx_mm_sincos_ps(x,&s,&c);
    return s;
}

/*
 * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
 * will then call the sincos() routine and waste a factor 2 in performance!
 */
static __m128
gmx_mm_cos_ps(__m128 x)
{
    __m128 s,c;
    gmx_mm_sincos_ps(x,&s,&c);
    return c;
}


static __m128
gmx_mm_tan_ps(__m128 x)
{
    __m128 sinval,cosval;
    __m128 tanval;

    gmx_mm_sincos_ps(x,&sinval,&cosval);

    tanval = _mm_mul_ps(sinval,gmx_mm_inv_ps(cosval));

    return tanval;
}


static __m128
gmx_mm_asin_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limitlow  = _mm_set1_ps(1e-4f);
    const __m128 half      = _mm_set1_ps(0.5f);
    const __m128 one       = _mm_set1_ps(1.0f);
    const __m128 halfpi    = _mm_set1_ps(M_PI/2.0f);

    const __m128 CC5        = _mm_set1_ps(4.2163199048E-2f);
    const __m128 CC4        = _mm_set1_ps(2.4181311049E-2f);
    const __m128 CC3        = _mm_set1_ps(4.5470025998E-2f);
    const __m128 CC2        = _mm_set1_ps(7.4953002686E-2f);
    const __m128 CC1        = _mm_set1_ps(1.6666752422E-1f);

    __m128 sign;
    __m128 mask;
    __m128 xabs;
    __m128 z,z1,z2,q,q1,q2;
    __m128 pA,pB;

    sign  = _mm_andnot_ps(signmask,x);
    xabs  = _mm_and_ps(x,signmask);

    mask  = _mm_cmpgt_ps(xabs,half);

    z1    = _mm_mul_ps(half, _mm_sub_ps(one,xabs));
    q1    = _mm_mul_ps(z1,gmx_mm_invsqrt_ps(z1));
    q1    = _mm_andnot_ps(_mm_cmpeq_ps(xabs,one),q1);

    q2    = xabs;
    z2    = _mm_mul_ps(q2,q2);

    z     = _mm_or_ps( _mm_and_ps(mask,z1) , _mm_andnot_ps(mask,z2) );
    q     = _mm_or_ps( _mm_and_ps(mask,q1) , _mm_andnot_ps(mask,q2) );

    z2    = _mm_mul_ps(z,z);

    pA    = _mm_mul_ps(CC5,z2);
    pB    = _mm_mul_ps(CC4,z2);

    pA    = _mm_add_ps(pA,CC3);
    pB    = _mm_add_ps(pB,CC2);

    pA    = _mm_mul_ps(pA,z2);
    pB    = _mm_mul_ps(pB,z2);

    pA    = _mm_add_ps(pA,CC1);
    pA    = _mm_mul_ps(pA,z);

    z     = _mm_add_ps(pA,pB);
    z     = _mm_mul_ps(z,q);
    z     = _mm_add_ps(z,q);

    q2    = _mm_sub_ps(halfpi,z);
    q2    = _mm_sub_ps(q2,z);

    z     = _mm_or_ps( _mm_and_ps(mask,q2) , _mm_andnot_ps(mask,z) );

    mask  = _mm_cmpgt_ps(xabs,limitlow);
    z     = _mm_or_ps( _mm_and_ps(mask,z) , _mm_andnot_ps(mask,xabs) );

    z = _mm_xor_ps(z,sign);

    return z;
}


static __m128
gmx_mm_acos_ps(__m128 x)
{
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 one_ps    = _mm_set1_ps(1.0f);
    const __m128 half_ps   = _mm_set1_ps(0.5f);
    const __m128 pi_ps     = _mm_set1_ps(M_PI);
    const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);

    __m128 mask1;
    __m128 mask2;
    __m128 xabs;
    __m128 z,z1,z2,z3;

    xabs  = _mm_and_ps(x,signmask);
    mask1 = _mm_cmpgt_ps(xabs,half_ps);
    mask2 = _mm_cmpgt_ps(x,_mm_setzero_ps());

    z     = _mm_mul_ps(half_ps,_mm_sub_ps(one_ps,xabs));
    z     = _mm_mul_ps(z,gmx_mm_invsqrt_ps(z));
    z     = _mm_andnot_ps(_mm_cmpeq_ps(xabs,one_ps),z);

    z     = _mm_blendv_ps(x,z,mask1);
    z     = gmx_mm_asin_ps(z);

    z2    = _mm_add_ps(z,z);
    z1    = _mm_sub_ps(pi_ps,z2);
    z3    = _mm_sub_ps(halfpi_ps,z);

    z     = _mm_blendv_ps(z1,z2,mask2);
    z     = _mm_blendv_ps(z3,z,mask1);

    return z;
}


static __m128
gmx_mm_atan_ps(__m128 x)
{
    /* Same algorithm as cephes library */
    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
    const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
    const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
    const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
    const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
    const __m128 mone      = _mm_set1_ps(-1.0f);
    const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
    const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
    const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1f);
    const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);

    __m128 sign;
    __m128 mask1,mask2;
    __m128 y,z1,z2;
    __m128 x2,x4;
    __m128 sum1,sum2;

    sign  = _mm_andnot_ps(signmask,x);
    x     = _mm_and_ps(x,signmask);

    mask1 = _mm_cmpgt_ps(x,limit1);
    mask2 = _mm_cmpgt_ps(x,limit2);

    z1    = _mm_mul_ps(_mm_add_ps(x,mone),gmx_mm_inv_ps(_mm_sub_ps(x,mone)));
    z2    = _mm_mul_ps(mone,gmx_mm_inv_ps(x));

    y     = _mm_and_ps(mask1,quarterpi);
    y     = _mm_blendv_ps(y,halfpi,mask2);

    x     = _mm_blendv_ps(x,z1,mask1);
    x     = _mm_blendv_ps(x,z2,mask2);

    x2    = _mm_mul_ps(x,x);
    x4    = _mm_mul_ps(x2,x2);

    sum1  = _mm_mul_ps(CC9,x4);
    sum2  = _mm_mul_ps(CC7,x4);
    sum1  = _mm_add_ps(sum1,CC5);
    sum2  = _mm_add_ps(sum2,CC3);
    sum1  = _mm_mul_ps(sum1,x4);
    sum2  = _mm_mul_ps(sum2,x2);

    sum1  = _mm_add_ps(sum1,sum2);
    sum1  = _mm_sub_ps(sum1,mone);
    sum1  = _mm_mul_ps(sum1,x);
    y     = _mm_add_ps(y,sum1);

    y     = _mm_xor_ps(y,sign);

    return y;
}


static __m128
gmx_mm_atan2_ps(__m128 y, __m128 x)
{
    const __m128 pi          = _mm_set1_ps(M_PI);
    const __m128 minuspi     = _mm_set1_ps(-M_PI);
    const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
    const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);

    __m128 z,z1,z3,z4;
    __m128 w;
    __m128 maskx_lt,maskx_eq;
    __m128 masky_lt,masky_eq;
    __m128 mask1,mask2,mask3,mask4,maskall;

    maskx_lt  = _mm_cmplt_ps(x,_mm_setzero_ps());
    masky_lt  = _mm_cmplt_ps(y,_mm_setzero_ps());
    maskx_eq  = _mm_cmpeq_ps(x,_mm_setzero_ps());
    masky_eq  = _mm_cmpeq_ps(y,_mm_setzero_ps());

    z         = _mm_mul_ps(y,gmx_mm_inv_ps(x));
    z         = gmx_mm_atan_ps(z);

    mask1     = _mm_and_ps(maskx_eq,masky_lt);
    mask2     = _mm_andnot_ps(maskx_lt,masky_eq);
    mask3     = _mm_andnot_ps( _mm_or_ps(masky_lt,masky_eq) , maskx_eq);
    mask4     = _mm_and_ps(masky_eq,maskx_lt);

    maskall   = _mm_or_ps( _mm_or_ps(mask1,mask2), _mm_or_ps(mask3,mask4) );

    z         = _mm_andnot_ps(maskall,z);
    z1        = _mm_and_ps(mask1,minushalfpi);
    z3        = _mm_and_ps(mask3,halfpi);
    z4        = _mm_and_ps(mask4,pi);

    z         = _mm_or_ps( _mm_or_ps(z,z1), _mm_or_ps(z3,z4) );

    mask1     = _mm_andnot_ps(masky_lt,maskx_lt);
    mask2     = _mm_and_ps(maskx_lt,masky_lt);

    w         = _mm_or_ps( _mm_and_ps(mask1,pi), _mm_and_ps(mask2,minuspi) );
    w         = _mm_andnot_ps(maskall,w);

    z         = _mm_add_ps(z,w);

    return z;
}



#endif /* _gmx_math_x86_sse4_1_single_h_ */