include/gmx_math_x86_avx_256_single.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012, by the GROMACS development team, led by
   5  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   6  * others, as listed in the AUTHORS file in the top-level source
   7  * directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef _gmx_math_x86_avx_256_single_h_
  36 #define _gmx_math_x86_avx_256_single_h_
  37
  38 #include "gmx_x86_avx_256.h"
  39
  40
  41 #ifndef M_PI
  42 #  define M_PI 3.14159265358979323846264338327950288
  43 #endif
  44
  45
  46
  47 /************************
  48  *                      *
  49  * Simple math routines *
  50  *                      *
  51  ************************/
  52
  53 /* 1.0/sqrt(x), 256-bit wide version */
  54 static gmx_inline __m256
  55 gmx_mm256_invsqrt_ps(__m256 x)
  56 {
  57     const __m256 half  = _mm256_set1_ps(0.5f);
  58     const __m256 three = _mm256_set1_ps(3.0f);
  59
  60     __m256 lu = _mm256_rsqrt_ps(x);
  61
  62     return _mm256_mul_ps(half,_mm256_mul_ps(_mm256_sub_ps(three,_mm256_mul_ps(_mm256_mul_ps(lu,lu),x)),lu));
  63 }
  64
  65 /* 1.0/sqrt(x), 128-bit wide version */
  66 static gmx_inline __m128
  67 gmx_mm_invsqrt_ps(__m128 x)
  68 {
  69     const __m128 half  = _mm_set_ps(0.5,0.5,0.5,0.5);
  70     const __m128 three = _mm_set_ps(3.0,3.0,3.0,3.0);
  71
  72     __m128 lu = _mm_rsqrt_ps(x);
  73
  74     return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
  75 }
  76
  77
  78 /* sqrt(x) (256 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
  79 static gmx_inline __m256
  80 gmx_mm256_sqrt_ps(__m256 x)
  81 {
  82     __m256 mask;
  83     __m256 res;
  84
  85     mask = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_EQ_OQ);
  86     res  = _mm256_andnot_ps(mask,gmx_mm256_invsqrt_ps(x));
  87
  88     res  = _mm256_mul_ps(x,res);
  89
  90     return res;
  91 }
  92
  93 /* sqrt(x) (128 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
  94 static gmx_inline __m128
  95 gmx_mm_sqrt_ps(__m128 x)
  96 {
  97     __m128 mask;
  98     __m128 res;
  99
 100     mask = _mm_cmpeq_ps(x,_mm_setzero_ps());
 101     res  = _mm_andnot_ps(mask,gmx_mm_invsqrt_ps(x));
 102
 103     res  = _mm_mul_ps(x,res);
 104
 105     return res;
 106 }
 107
 108
 109 /* 1.0/x, 256-bit wide */
 110 static gmx_inline __m256
 111 gmx_mm256_inv_ps(__m256 x)
 112 {
 113     const __m256 two = _mm256_set1_ps(2.0f);
 114
 115     __m256 lu = _mm256_rcp_ps(x);
 116
 117     return _mm256_mul_ps(lu,_mm256_sub_ps(two,_mm256_mul_ps(lu,x)));
 118 }
 119
 120 /* 1.0/x, 128-bit wide */
 121 static gmx_inline __m128
 122 gmx_mm_inv_ps(__m128 x)
 123 {
 124     const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);
 125
 126     __m128 lu = _mm_rcp_ps(x);
 127
 128     return _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,x)));
 129 }
 130
 131
 132 static gmx_inline __m256
 133 gmx_mm256_abs_ps(__m256 x)
 134 {
 135     const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
 136
 137     return _mm256_and_ps(x,signmask);
 138 }
 139
 140 static gmx_inline __m128
 141 gmx_mm_abs_ps(__m128 x)
 142 {
 143     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
 144
 145     return _mm_and_ps(x,signmask);
 146 }
 147
 148
 149 static __m256
 150 gmx_mm256_log_ps(__m256 x)
 151 {
 152     const __m256  expmask    = _mm256_castsi256_ps( _mm256_set1_epi32(0x7F800000) );
 153     const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
 154     const __m256  half       = _mm256_set1_ps(0.5f);
 155     const __m256  one        = _mm256_set1_ps(1.0f);
 156     const __m256  invsq2     = _mm256_set1_ps(1.0f/sqrt(2.0f));
 157     const __m256  corr1      = _mm256_set1_ps(-2.12194440e-4f);
 158     const __m256  corr2      = _mm256_set1_ps(0.693359375f);
 159
 160     const __m256 CA_1        = _mm256_set1_ps(0.070376836292f);
 161     const __m256 CB_0        = _mm256_set1_ps(1.6714950086782716f);
 162     const __m256 CB_1        = _mm256_set1_ps(-2.452088066061482f);
 163     const __m256 CC_0        = _mm256_set1_ps(1.5220770854701728f);
 164     const __m256 CC_1        = _mm256_set1_ps(-1.3422238433233642f);
 165     const __m256 CD_0        = _mm256_set1_ps(1.386218787509749f);
 166     const __m256 CD_1        = _mm256_set1_ps(0.35075468953796346f);
 167     const __m256 CE_0        = _mm256_set1_ps(1.3429983063133937f);
 168     const __m256 CE_1        = _mm256_set1_ps(1.807420826584643f);
 169
 170     __m256  fexp,fexp1;
 171     __m256i iexp;
 172     __m128i iexp128a,iexp128b;
 173     __m256  mask;
 174     __m256i imask;
 175     __m128i imask128a,imask128b;
 176     __m256  x1,x2;
 177     __m256  y;
 178     __m256  pA,pB,pC,pD,pE,tB,tC,tD,tE;
 179
 180     /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
 181     fexp  = _mm256_and_ps(x,expmask);
 182     iexp  = _mm256_castps_si256(fexp);
 183
 184     iexp128b = _mm256_extractf128_si256(iexp,0x1);
 185     iexp128a = _mm256_castsi256_si128(iexp);
 186
 187     iexp128a  = _mm_srli_epi32(iexp128a,23);
 188     iexp128b  = _mm_srli_epi32(iexp128b,23);
 189     iexp128a  = _mm_sub_epi32(iexp128a,expbase_m1);
 190     iexp128b  = _mm_sub_epi32(iexp128b,expbase_m1);
 191
 192     x     = _mm256_andnot_ps(expmask,x);
 193     x     = _mm256_or_ps(x,one);
 194     x     = _mm256_mul_ps(x,half);
 195
 196     mask  = _mm256_cmp_ps(x,invsq2,_CMP_LT_OQ);
 197
 198     x     = _mm256_add_ps(x,_mm256_and_ps(mask,x));
 199     x     = _mm256_sub_ps(x,one);
 200
 201     imask = _mm256_castps_si256(mask);
 202
 203     imask128b = _mm256_extractf128_si256(imask,0x1);
 204     imask128a = _mm256_castsi256_si128(imask);
 205
 206     iexp128a  = _mm_add_epi32(iexp128a,imask128a);
 207     iexp128b  = _mm_add_epi32(iexp128b,imask128b);
 208
 209     iexp  = _mm256_castsi128_si256(iexp128a);
 210     iexp  = _mm256_insertf128_si256(iexp,iexp128b,0x1);
 211
 212     x2    = _mm256_mul_ps(x,x);
 213
 214     pA    = _mm256_mul_ps(CA_1,x);
 215     pB    = _mm256_mul_ps(CB_1,x);
 216     pC    = _mm256_mul_ps(CC_1,x);
 217     pD    = _mm256_mul_ps(CD_1,x);
 218     pE    = _mm256_mul_ps(CE_1,x);
 219     tB    = _mm256_add_ps(CB_0,x2);
 220     tC    = _mm256_add_ps(CC_0,x2);
 221     tD    = _mm256_add_ps(CD_0,x2);
 222     tE    = _mm256_add_ps(CE_0,x2);
 223     pB    = _mm256_add_ps(pB,tB);
 224     pC    = _mm256_add_ps(pC,tC);
 225     pD    = _mm256_add_ps(pD,tD);
 226     pE    = _mm256_add_ps(pE,tE);
 227
 228     pA    = _mm256_mul_ps(pA,pB);
 229     pC    = _mm256_mul_ps(pC,pD);
 230     pE    = _mm256_mul_ps(pE,x2);
 231     pA    = _mm256_mul_ps(pA,pC);
 232     y     = _mm256_mul_ps(pA,pE);
 233
 234     fexp  = _mm256_cvtepi32_ps(iexp);
 235     y     = _mm256_add_ps(y,_mm256_mul_ps(fexp,corr1));
 236
 237     y     = _mm256_sub_ps(y, _mm256_mul_ps(half,x2));
 238     x2    = _mm256_add_ps(x,y);
 239
 240     x2    = _mm256_add_ps(x2,_mm256_mul_ps(fexp,corr2));
 241
 242     return x2;
 243 }
 244
 245
 246 static __m128
 247 gmx_mm_log_ps(__m128 x)
 248 {
 249     /* Same algorithm as cephes library */
 250     const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
 251     const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
 252     const __m128  half       = _mm_set1_ps(0.5f);
 253     const __m128  one        = _mm_set1_ps(1.0f);
 254     const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
 255     const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
 256     const __m128  corr2      = _mm_set1_ps(0.693359375f);
 257
 258     const __m128 CA_1        = _mm_set1_ps(0.070376836292f);
 259     const __m128 CB_0        = _mm_set1_ps(1.6714950086782716f);
 260     const __m128 CB_1        = _mm_set1_ps(-2.452088066061482f);
 261     const __m128 CC_0        = _mm_set1_ps(1.5220770854701728f);
 262     const __m128 CC_1        = _mm_set1_ps(-1.3422238433233642f);
 263     const __m128 CD_0        = _mm_set1_ps(1.386218787509749f);
 264     const __m128 CD_1        = _mm_set1_ps(0.35075468953796346f);
 265     const __m128 CE_0        = _mm_set1_ps(1.3429983063133937f);
 266     const __m128 CE_1        = _mm_set1_ps(1.807420826584643f);
 267
 268     __m128  fexp,fexp1;
 269     __m128i iexp;
 270     __m128  mask;
 271     __m128  x1,x2;
 272     __m128  y;
 273     __m128  pA,pB,pC,pD,pE,tB,tC,tD,tE;
 274
 275     /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
 276     fexp  = _mm_and_ps(x,expmask);
 277     iexp  = gmx_mm_castps_si128(fexp);
 278     iexp  = _mm_srli_epi32(iexp,23);
 279     iexp  = _mm_sub_epi32(iexp,expbase_m1);
 280
 281     x     = _mm_andnot_ps(expmask,x);
 282     x     = _mm_or_ps(x,one);
 283     x     = _mm_mul_ps(x,half);
 284
 285     mask  = _mm_cmplt_ps(x,invsq2);
 286
 287     x     = _mm_add_ps(x,_mm_and_ps(mask,x));
 288     x     = _mm_sub_ps(x,one);
 289     iexp  = _mm_add_epi32(iexp,gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
 290
 291     x2    = _mm_mul_ps(x,x);
 292
 293     pA    = _mm_mul_ps(CA_1,x);
 294     pB    = _mm_mul_ps(CB_1,x);
 295     pC    = _mm_mul_ps(CC_1,x);
 296     pD    = _mm_mul_ps(CD_1,x);
 297     pE    = _mm_mul_ps(CE_1,x);
 298     tB    = _mm_add_ps(CB_0,x2);
 299     tC    = _mm_add_ps(CC_0,x2);
 300     tD    = _mm_add_ps(CD_0,x2);
 301     tE    = _mm_add_ps(CE_0,x2);
 302     pB    = _mm_add_ps(pB,tB);
 303     pC    = _mm_add_ps(pC,tC);
 304     pD    = _mm_add_ps(pD,tD);
 305     pE    = _mm_add_ps(pE,tE);
 306
 307     pA    = _mm_mul_ps(pA,pB);
 308     pC    = _mm_mul_ps(pC,pD);
 309     pE    = _mm_mul_ps(pE,x2);
 310     pA    = _mm_mul_ps(pA,pC);
 311     y     = _mm_mul_ps(pA,pE);
 312
 313     fexp  = _mm_cvtepi32_ps(iexp);
 314     y     = _mm_add_ps(y,_mm_mul_ps(fexp,corr1));
 315
 316     y     = _mm_sub_ps(y, _mm_mul_ps(half,x2));
 317     x2    = _mm_add_ps(x,y);
 318
 319     x2    = _mm_add_ps(x2,_mm_mul_ps(fexp,corr2));
 320
 321     return x2;
 322 }
 323
 324
 325 /*
 326  * 2^x function, 256-bit wide
 327  *
 328  * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
 329  * [-0.5,0.5]. The coefficiencts of this was derived in Mathematica using the command:
 330  *
 331  * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 332  *
 333  * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 334  * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 335  * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 336  * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 337  * number instead. That would take a few extra cycles and not really help, since something is
 338  * wrong if you are using single precision to work with numbers that cannot really be represented
 339  * in single precision.
 340  *
 341  * The accuracy is at least 23 bits.
 342  */
 343 static __m256
 344 gmx_mm256_exp2_ps(__m256 x)
 345 {
 346     /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
 347     const __m256  arglimit = _mm256_set1_ps(126.0f);
 348
 349     const __m128i expbase  = _mm_set1_epi32(127);
 350     const __m256  CC6      = _mm256_set1_ps(1.535336188319500E-004);
 351     const __m256  CC5      = _mm256_set1_ps(1.339887440266574E-003);
 352     const __m256  CC4      = _mm256_set1_ps(9.618437357674640E-003);
 353     const __m256  CC3      = _mm256_set1_ps(5.550332471162809E-002);
 354     const __m256  CC2      = _mm256_set1_ps(2.402264791363012E-001);
 355     const __m256  CC1      = _mm256_set1_ps(6.931472028550421E-001);
 356     const __m256  CC0      = _mm256_set1_ps(1.0f);
 357
 358     __m256  p0,p1;
 359     __m256  valuemask;
 360     __m256i iexppart;
 361     __m128i iexppart128a,iexppart128b;
 362     __m256  fexppart;
 363     __m256  intpart;
 364     __m256  x2;
 365
 366
 367     iexppart  = _mm256_cvtps_epi32(x);
 368     intpart   = _mm256_round_ps(x,_MM_FROUND_TO_NEAREST_INT);
 369
 370     iexppart128b = _mm256_extractf128_si256(iexppart,0x1);
 371     iexppart128a = _mm256_castsi256_si128(iexppart);
 372
 373     iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a,expbase),23);
 374     iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b,expbase),23);
 375
 376     iexppart  = _mm256_castsi128_si256(iexppart128a);
 377     iexppart  = _mm256_insertf128_si256(iexppart,iexppart128b,0x1);
 378     valuemask = _mm256_cmp_ps(arglimit,gmx_mm256_abs_ps(x),_CMP_GE_OQ);
 379     fexppart  = _mm256_and_ps(valuemask,_mm256_castsi256_ps(iexppart));
 380
 381     x         = _mm256_sub_ps(x,intpart);
 382     x2        = _mm256_mul_ps(x,x);
 383
 384     p0        = _mm256_mul_ps(CC6,x2);
 385     p1        = _mm256_mul_ps(CC5,x2);
 386     p0        = _mm256_add_ps(p0,CC4);
 387     p1        = _mm256_add_ps(p1,CC3);
 388     p0        = _mm256_mul_ps(p0,x2);
 389     p1        = _mm256_mul_ps(p1,x2);
 390     p0        = _mm256_add_ps(p0,CC2);
 391     p1        = _mm256_add_ps(p1,CC1);
 392     p0        = _mm256_mul_ps(p0,x2);
 393     p1        = _mm256_mul_ps(p1,x);
 394     p0        = _mm256_add_ps(p0,CC0);
 395     p0        = _mm256_add_ps(p0,p1);
 396     x         = _mm256_mul_ps(p0,fexppart);
 397
 398     return  x;
 399 }
 400
 401
 402 /* 2^x, 128 bit wide */
 403 static __m128
 404 gmx_mm_exp2_ps(__m128 x)
 405 {
 406     /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
 407     const __m128  arglimit = _mm_set1_ps(126.0f);
 408
 409     const __m128i expbase  = _mm_set1_epi32(127);
 410     const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
 411     const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
 412     const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
 413     const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
 414     const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
 415     const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
 416     const __m128  CA0      = _mm_set1_ps(1.0f);
 417
 418     __m128  valuemask;
 419     __m128i iexppart;
 420     __m128  fexppart;
 421     __m128  intpart;
 422     __m128  x2;
 423     __m128  p0,p1;
 424
 425     iexppart  = _mm_cvtps_epi32(x);
 426     intpart   = _mm_round_ps(x,_MM_FROUND_TO_NEAREST_INT);
 427     iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
 428     valuemask = _mm_cmpge_ps(arglimit,gmx_mm_abs_ps(x));
 429     fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));
 430
 431     x         = _mm_sub_ps(x,intpart);
 432     x2        = _mm_mul_ps(x,x);
 433
 434     p0        = _mm_mul_ps(CA6,x2);
 435     p1        = _mm_mul_ps(CA5,x2);
 436     p0        = _mm_add_ps(p0,CA4);
 437     p1        = _mm_add_ps(p1,CA3);
 438     p0        = _mm_mul_ps(p0,x2);
 439     p1        = _mm_mul_ps(p1,x2);
 440     p0        = _mm_add_ps(p0,CA2);
 441     p1        = _mm_add_ps(p1,CA1);
 442     p0        = _mm_mul_ps(p0,x2);
 443     p1        = _mm_mul_ps(p1,x);
 444     p0        = _mm_add_ps(p0,CA0);
 445     p0        = _mm_add_ps(p0,p1);
 446     x         = _mm_mul_ps(p0,fexppart);
 447
 448     return  x;
 449 }
 450
 451
 452 /* Exponential function, 256 bit wide. This could be calculated from 2^x as Exp(x)=2^(y),
 453  * where y=log2(e)*x, but there will then be a small rounding error since we lose some
 454  * precision due to the multiplication. This will then be magnified a lot by the exponential.
 455  *
 456  * Instead, we calculate the fractional part directly as a minimax approximation of
 457  * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
 458  * remaining after 2^y, which avoids the precision-loss.
 459  * The final result is correct to within 1 LSB over the entire argument range.
 460  */
 461 static __m256
 462 gmx_mm256_exp_ps(__m256 exparg)
 463 {
 464     const __m256  argscale      = _mm256_set1_ps(1.44269504088896341f);
 465     /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
 466     const __m256  arglimit      = _mm256_set1_ps(126.0f);
 467     const __m128i expbase       = _mm_set1_epi32(127);
 468
 469     const __m256  invargscale0  = _mm256_set1_ps(0.693359375f);
 470     const __m256  invargscale1  = _mm256_set1_ps(-2.12194440e-4f);
 471
 472     const __m256  CE5           = _mm256_set1_ps(1.9875691500e-4f);
 473     const __m256  CE4           = _mm256_set1_ps(1.3981999507e-3f);
 474     const __m256  CE3           = _mm256_set1_ps(8.3334519073e-3f);
 475     const __m256  CE2           = _mm256_set1_ps(4.1665795894e-2f);
 476     const __m256  CE1           = _mm256_set1_ps(1.6666665459e-1f);
 477     const __m256  CE0           = _mm256_set1_ps(5.0000001201e-1f);
 478     const __m256  one           = _mm256_set1_ps(1.0f);
 479
 480     __m256  exparg2,exp2arg;
 481     __m256  pE0,pE1;
 482     __m256  valuemask;
 483     __m256i iexppart;
 484     __m128i iexppart128a,iexppart128b;
 485     __m256  fexppart;
 486     __m256  intpart;
 487
 488     exp2arg = _mm256_mul_ps(exparg,argscale);
 489
 490     iexppart  = _mm256_cvtps_epi32(exp2arg);
 491     intpart   = _mm256_round_ps(exp2arg,_MM_FROUND_TO_NEAREST_INT);
 492
 493     iexppart128b = _mm256_extractf128_si256(iexppart,0x1);
 494     iexppart128a = _mm256_castsi256_si128(iexppart);
 495
 496     iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a,expbase),23);
 497     iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b,expbase),23);
 498
 499     iexppart  = _mm256_castsi128_si256(iexppart128a);
 500     iexppart  = _mm256_insertf128_si256(iexppart,iexppart128b,0x1);
 501     valuemask = _mm256_cmp_ps(arglimit,gmx_mm256_abs_ps(exp2arg),_CMP_GE_OQ);
 502     fexppart  = _mm256_and_ps(valuemask,_mm256_castsi256_ps(iexppart));
 503
 504     /* Extended precision arithmetics */
 505     exparg    = _mm256_sub_ps(exparg,_mm256_mul_ps(invargscale0,intpart));
 506     exparg    = _mm256_sub_ps(exparg,_mm256_mul_ps(invargscale1,intpart));
 507
 508     exparg2   = _mm256_mul_ps(exparg,exparg);
 509
 510     pE1       = _mm256_mul_ps(CE5,exparg2);
 511     pE0       = _mm256_mul_ps(CE4,exparg2);
 512     pE1       = _mm256_add_ps(pE1,CE3);
 513     pE0       = _mm256_add_ps(pE0,CE2);
 514     pE1       = _mm256_mul_ps(pE1,exparg2);
 515     pE0       = _mm256_mul_ps(pE0,exparg2);
 516     pE1       = _mm256_add_ps(pE1,CE1);
 517     pE0       = _mm256_add_ps(pE0,CE0);
 518     pE1       = _mm256_mul_ps(pE1,exparg);
 519     pE0       = _mm256_add_ps(pE0,pE1);
 520     pE0       = _mm256_mul_ps(pE0,exparg2);
 521     exparg    = _mm256_add_ps(exparg,one);
 522     exparg    = _mm256_add_ps(exparg,pE0);
 523
 524     exparg    = _mm256_mul_ps(exparg,fexppart);
 525
 526     return exparg;
 527 }
 528
 529
 530 /* exp(), 128 bit wide. */
 531 static __m128
 532 gmx_mm_exp_ps(__m128 x)
 533 {
 534     const __m128  argscale      = _mm_set1_ps(1.44269504088896341f);
 535     /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
 536     const __m128  arglimit      = _mm_set1_ps(126.0f);
 537     const __m128i expbase       = _mm_set1_epi32(127);
 538
 539     const __m128  invargscale0  = _mm_set1_ps(0.693359375f);
 540     const __m128  invargscale1  = _mm_set1_ps(-2.12194440e-4f);
 541
 542     const __m128  CC5           = _mm_set1_ps(1.9875691500e-4f);
 543     const __m128  CC4           = _mm_set1_ps(1.3981999507e-3f);
 544     const __m128  CC3           = _mm_set1_ps(8.3334519073e-3f);
 545     const __m128  CC2           = _mm_set1_ps(4.1665795894e-2f);
 546     const __m128  CC1           = _mm_set1_ps(1.6666665459e-1f);
 547     const __m128  CC0           = _mm_set1_ps(5.0000001201e-1f);
 548     const __m128  one           = _mm_set1_ps(1.0f);
 549
 550     __m128  y,x2;
 551     __m128  p0,p1;
 552     __m128  valuemask;
 553     __m128i iexppart;
 554     __m128  fexppart;
 555     __m128  intpart;
 556
 557     y = _mm_mul_ps(x,argscale);
 558
 559     iexppart  = _mm_cvtps_epi32(y);
 560     intpart   = _mm_round_ps(y,_MM_FROUND_TO_NEAREST_INT);
 561
 562     iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart,expbase),23);
 563     valuemask = _mm_cmpge_ps(arglimit,gmx_mm_abs_ps(y));
 564     fexppart  = _mm_and_ps(valuemask,gmx_mm_castsi128_ps(iexppart));
 565
 566     /* Extended precision arithmetics */
 567     x         = _mm_sub_ps(x,_mm_mul_ps(invargscale0,intpart));
 568     x         = _mm_sub_ps(x,_mm_mul_ps(invargscale1,intpart));
 569
 570     x2        = _mm_mul_ps(x,x);
 571
 572     p1        = _mm_mul_ps(CC5,x2);
 573     p0        = _mm_mul_ps(CC4,x2);
 574     p1        = _mm_add_ps(p1,CC3);
 575     p0        = _mm_add_ps(p0,CC2);
 576     p1        = _mm_mul_ps(p1,x2);
 577     p0        = _mm_mul_ps(p0,x2);
 578     p1        = _mm_add_ps(p1,CC1);
 579     p0        = _mm_add_ps(p0,CC0);
 580     p1        = _mm_mul_ps(p1,x);
 581     p0        = _mm_add_ps(p0,p1);
 582     p0        = _mm_mul_ps(p0,x2);
 583     x         = _mm_add_ps(x,one);
 584     x         = _mm_add_ps(x,p0);
 585
 586     x         = _mm_mul_ps(x,fexppart);
 587
 588     return x;
 589 }
 590
 591
 592
 593 /* FULL precision erf(), 256-bit wide. Only errors in LSB */
 594 static __m256
 595 gmx_mm256_erf_ps(__m256 x)
 596 {
 597     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 598     const __m256  CA6      = _mm256_set1_ps(7.853861353153693e-5f);
 599     const __m256  CA5      = _mm256_set1_ps(-8.010193625184903e-4f);
 600     const __m256  CA4      = _mm256_set1_ps(5.188327685732524e-3f);
 601     const __m256  CA3      = _mm256_set1_ps(-2.685381193529856e-2f);
 602     const __m256  CA2      = _mm256_set1_ps(1.128358514861418e-1f);
 603     const __m256  CA1      = _mm256_set1_ps(-3.761262582423300e-1f);
 604     const __m256  CA0      = _mm256_set1_ps(1.128379165726710f);
 605     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 606     const __m256  CB9      = _mm256_set1_ps(-0.0018629930017603923f);
 607     const __m256  CB8      = _mm256_set1_ps(0.003909821287598495f);
 608     const __m256  CB7      = _mm256_set1_ps(-0.0052094582210355615f);
 609     const __m256  CB6      = _mm256_set1_ps(0.005685614362160572f);
 610     const __m256  CB5      = _mm256_set1_ps(-0.0025367682853477272f);
 611     const __m256  CB4      = _mm256_set1_ps(-0.010199799682318782f);
 612     const __m256  CB3      = _mm256_set1_ps(0.04369575504816542f);
 613     const __m256  CB2      = _mm256_set1_ps(-0.11884063474674492f);
 614     const __m256  CB1      = _mm256_set1_ps(0.2732120154030589f);
 615     const __m256  CB0      = _mm256_set1_ps(0.42758357702025784f);
 616     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 617     const __m256  CC10     = _mm256_set1_ps(-0.0445555913112064f);
 618     const __m256  CC9      = _mm256_set1_ps(0.21376355144663348f);
 619     const __m256  CC8      = _mm256_set1_ps(-0.3473187200259257f);
 620     const __m256  CC7      = _mm256_set1_ps(0.016690861551248114f);
 621     const __m256  CC6      = _mm256_set1_ps(0.7560973182491192f);
 622     const __m256  CC5      = _mm256_set1_ps(-1.2137903600145787f);
 623     const __m256  CC4      = _mm256_set1_ps(0.8411872321232948f);
 624     const __m256  CC3      = _mm256_set1_ps(-0.08670413896296343f);
 625     const __m256  CC2      = _mm256_set1_ps(-0.27124782687240334f);
 626     const __m256  CC1      = _mm256_set1_ps(-0.0007502488047806069f);
 627     const __m256  CC0      = _mm256_set1_ps(0.5642114853803148f);
 628
 629     /* Coefficients for expansion of exp(x) in [0,0.1] */
 630     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
 631     const __m256  CD2      = _mm256_set1_ps(0.5000066608081202f);
 632     const __m256  CD3      = _mm256_set1_ps(0.1664795422874624f);
 633     const __m256  CD4      = _mm256_set1_ps(0.04379839977652482f);
 634
 635     const __m256  sieve    = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
 636     const __m256  signbit  = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
 637     const __m256  one      = _mm256_set1_ps(1.0f);
 638     const __m256  two      = _mm256_set1_ps(2.0f);
 639
 640     __m256 x2,x4,y;
 641     __m256 z,q,t,t2,w,w2;
 642     __m256 pA0,pA1,pB0,pB1,pC0,pC1;
 643     __m256 expmx2,corr;
 644     __m256 res_erf,res_erfc,res;
 645     __m256 mask;
 646
 647     /* Calculate erf() */
 648     x2     = _mm256_mul_ps(x,x);
 649     x4     = _mm256_mul_ps(x2,x2);
 650
 651     pA0  = _mm256_mul_ps(CA6,x4);
 652     pA1  = _mm256_mul_ps(CA5,x4);
 653     pA0  = _mm256_add_ps(pA0,CA4);
 654     pA1  = _mm256_add_ps(pA1,CA3);
 655     pA0  = _mm256_mul_ps(pA0,x4);
 656     pA1  = _mm256_mul_ps(pA1,x4);
 657     pA0  = _mm256_add_ps(pA0,CA2);
 658     pA1  = _mm256_add_ps(pA1,CA1);
 659     pA0  = _mm256_mul_ps(pA0,x4);
 660     pA1  = _mm256_mul_ps(pA1,x2);
 661     pA0  = _mm256_add_ps(pA0,pA1);
 662     pA0  = _mm256_add_ps(pA0,CA0);
 663
 664     res_erf = _mm256_mul_ps(x,pA0);
 665
 666     /* Calculate erfc */
 667
 668     y       = gmx_mm256_abs_ps(x);
 669     t       = gmx_mm256_inv_ps(y);
 670     w       = _mm256_sub_ps(t,one);
 671     t2      = _mm256_mul_ps(t,t);
 672     w2      = _mm256_mul_ps(w,w);
 673     /*
 674      * We cannot simply calculate exp(-x2) directly in single precision, since
 675      * that will lose a couple of bits of precision due to the multiplication.
 676      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
 677      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
 678      *
 679      * The only drawback with this is that it requires TWO separate exponential
 680      * evaluations, which would be horrible performance-wise. However, the argument
 681      * for the second exp() call is always small, so there we simply use a
 682      * low-order minimax expansion on [0,0.1].
 683      */
 684
 685     z       = _mm256_and_ps(y,sieve);
 686     q       = _mm256_mul_ps( _mm256_sub_ps(z,y) , _mm256_add_ps(z,y) );
 687
 688     corr    = _mm256_mul_ps(CD4,q);
 689     corr    = _mm256_add_ps(corr,CD3);
 690     corr    = _mm256_mul_ps(corr,q);
 691     corr    = _mm256_add_ps(corr,CD2);
 692     corr    = _mm256_mul_ps(corr,q);
 693     corr    = _mm256_add_ps(corr,one);
 694     corr    = _mm256_mul_ps(corr,q);
 695     corr    = _mm256_add_ps(corr,one);
 696
 697     expmx2  = gmx_mm256_exp_ps( _mm256_or_ps( signbit , _mm256_mul_ps(z,z) ) );
 698     expmx2  = _mm256_mul_ps(expmx2,corr);
 699
 700     pB1  = _mm256_mul_ps(CB9,w2);
 701     pB0  = _mm256_mul_ps(CB8,w2);
 702     pB1  = _mm256_add_ps(pB1,CB7);
 703     pB0  = _mm256_add_ps(pB0,CB6);
 704     pB1  = _mm256_mul_ps(pB1,w2);
 705     pB0  = _mm256_mul_ps(pB0,w2);
 706     pB1  = _mm256_add_ps(pB1,CB5);
 707     pB0  = _mm256_add_ps(pB0,CB4);
 708     pB1  = _mm256_mul_ps(pB1,w2);
 709     pB0  = _mm256_mul_ps(pB0,w2);
 710     pB1  = _mm256_add_ps(pB1,CB3);
 711     pB0  = _mm256_add_ps(pB0,CB2);
 712     pB1  = _mm256_mul_ps(pB1,w2);
 713     pB0  = _mm256_mul_ps(pB0,w2);
 714     pB1  = _mm256_add_ps(pB1,CB1);
 715     pB1  = _mm256_mul_ps(pB1,w);
 716     pB0  = _mm256_add_ps(pB0,pB1);
 717     pB0  = _mm256_add_ps(pB0,CB0);
 718
 719     pC0  = _mm256_mul_ps(CC10,t2);
 720     pC1  = _mm256_mul_ps(CC9,t2);
 721     pC0  = _mm256_add_ps(pC0,CC8);
 722     pC1  = _mm256_add_ps(pC1,CC7);
 723     pC0  = _mm256_mul_ps(pC0,t2);
 724     pC1  = _mm256_mul_ps(pC1,t2);
 725     pC0  = _mm256_add_ps(pC0,CC6);
 726     pC1  = _mm256_add_ps(pC1,CC5);
 727     pC0  = _mm256_mul_ps(pC0,t2);
 728     pC1  = _mm256_mul_ps(pC1,t2);
 729     pC0  = _mm256_add_ps(pC0,CC4);
 730     pC1  = _mm256_add_ps(pC1,CC3);
 731     pC0  = _mm256_mul_ps(pC0,t2);
 732     pC1  = _mm256_mul_ps(pC1,t2);
 733     pC0  = _mm256_add_ps(pC0,CC2);
 734     pC1  = _mm256_add_ps(pC1,CC1);
 735     pC0  = _mm256_mul_ps(pC0,t2);
 736     pC1  = _mm256_mul_ps(pC1,t);
 737     pC0  = _mm256_add_ps(pC0,pC1);
 738     pC0  = _mm256_add_ps(pC0,CC0);
 739     pC0  = _mm256_mul_ps(pC0,t);
 740
 741     /* SELECT pB0 or pC0 for erfc() */
 742     mask = _mm256_cmp_ps(two,y,_CMP_LT_OQ);
 743     res_erfc = _mm256_blendv_ps(pB0,pC0,mask);
 744     res_erfc = _mm256_mul_ps(res_erfc,expmx2);
 745
 746     /* erfc(x<0) = 2-erfc(|x|) */
 747     mask = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_LT_OQ);
 748     res_erfc = _mm256_blendv_ps(res_erfc,_mm256_sub_ps(two,res_erfc),mask);
 749
 750     /* Select erf() or erfc() */
 751     mask = _mm256_cmp_ps(y,_mm256_set1_ps(0.75f),_CMP_LT_OQ);
 752     res  = _mm256_blendv_ps(_mm256_sub_ps(one,res_erfc),res_erf,mask);
 753
 754     return res;
 755 }
 756
 757
 758 /* erf(), 128 bit wide */
 759 static __m128
 760 gmx_mm_erf_ps(__m128 x)
 761 {
 762     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 763     const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
 764     const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
 765     const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
 766     const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
 767     const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
 768     const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
 769     const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
 770     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 771     const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
 772     const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
 773     const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
 774     const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
 775     const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
 776     const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
 777     const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
 778     const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
 779     const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
 780     const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
 781     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 782     const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
 783     const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
 784     const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
 785     const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
 786     const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
 787     const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
 788     const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
 789     const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
 790     const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
 791     const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
 792     const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
 793
 794     /* Coefficients for expansion of exp(x) in [0,0.1] */
 795     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
 796     const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
 797     const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
 798     const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
 799
 800     const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
 801     const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 802     const __m128  one      = _mm_set1_ps(1.0f);
 803     const __m128  two      = _mm_set1_ps(2.0f);
 804
 805     __m128 x2,x4,y;
 806     __m128 z,q,t,t2,w,w2;
 807     __m128 pA0,pA1,pB0,pB1,pC0,pC1;
 808     __m128 expmx2,corr;
 809     __m128 res_erf,res_erfc,res;
 810     __m128 mask;
 811
 812     /* Calculate erf() */
 813     x2     = _mm_mul_ps(x,x);
 814     x4     = _mm_mul_ps(x2,x2);
 815
 816     pA0  = _mm_mul_ps(CA6,x4);
 817     pA1  = _mm_mul_ps(CA5,x4);
 818     pA0  = _mm_add_ps(pA0,CA4);
 819     pA1  = _mm_add_ps(pA1,CA3);
 820     pA0  = _mm_mul_ps(pA0,x4);
 821     pA1  = _mm_mul_ps(pA1,x4);
 822     pA0  = _mm_add_ps(pA0,CA2);
 823     pA1  = _mm_add_ps(pA1,CA1);
 824     pA0  = _mm_mul_ps(pA0,x4);
 825     pA1  = _mm_mul_ps(pA1,x2);
 826     pA0  = _mm_add_ps(pA0,pA1);
 827     pA0  = _mm_add_ps(pA0,CA0);
 828
 829     res_erf = _mm_mul_ps(x,pA0);
 830
 831     /* Calculate erfc */
 832
 833     y       = gmx_mm_abs_ps(x);
 834     t       = gmx_mm_inv_ps(y);
 835     w       = _mm_sub_ps(t,one);
 836     t2      = _mm_mul_ps(t,t);
 837     w2      = _mm_mul_ps(w,w);
 838     /*
 839      * We cannot simply calculate exp(-x2) directly in single precision, since
 840      * that will lose a couple of bits of precision due to the multiplication.
 841      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
 842      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
 843      *
 844      * The only drawback with this is that it requires TWO separate exponential
 845      * evaluations, which would be horrible performance-wise. However, the argument
 846      * for the second exp() call is always small, so there we simply use a
 847      * low-order minimax expansion on [0,0.1].
 848      */
 849
 850     z       = _mm_and_ps(y,sieve);
 851     q       = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );
 852
 853     corr    = _mm_mul_ps(CD4,q);
 854     corr    = _mm_add_ps(corr,CD3);
 855     corr    = _mm_mul_ps(corr,q);
 856     corr    = _mm_add_ps(corr,CD2);
 857     corr    = _mm_mul_ps(corr,q);
 858     corr    = _mm_add_ps(corr,one);
 859     corr    = _mm_mul_ps(corr,q);
 860     corr    = _mm_add_ps(corr,one);
 861
 862     expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
 863     expmx2  = _mm_mul_ps(expmx2,corr);
 864
 865     pB1  = _mm_mul_ps(CB9,w2);
 866     pB0  = _mm_mul_ps(CB8,w2);
 867     pB1  = _mm_add_ps(pB1,CB7);
 868     pB0  = _mm_add_ps(pB0,CB6);
 869     pB1  = _mm_mul_ps(pB1,w2);
 870     pB0  = _mm_mul_ps(pB0,w2);
 871     pB1  = _mm_add_ps(pB1,CB5);
 872     pB0  = _mm_add_ps(pB0,CB4);
 873     pB1  = _mm_mul_ps(pB1,w2);
 874     pB0  = _mm_mul_ps(pB0,w2);
 875     pB1  = _mm_add_ps(pB1,CB3);
 876     pB0  = _mm_add_ps(pB0,CB2);
 877     pB1  = _mm_mul_ps(pB1,w2);
 878     pB0  = _mm_mul_ps(pB0,w2);
 879     pB1  = _mm_add_ps(pB1,CB1);
 880     pB1  = _mm_mul_ps(pB1,w);
 881     pB0  = _mm_add_ps(pB0,pB1);
 882     pB0  = _mm_add_ps(pB0,CB0);
 883
 884     pC0  = _mm_mul_ps(CC10,t2);
 885     pC1  = _mm_mul_ps(CC9,t2);
 886     pC0  = _mm_add_ps(pC0,CC8);
 887     pC1  = _mm_add_ps(pC1,CC7);
 888     pC0  = _mm_mul_ps(pC0,t2);
 889     pC1  = _mm_mul_ps(pC1,t2);
 890     pC0  = _mm_add_ps(pC0,CC6);
 891     pC1  = _mm_add_ps(pC1,CC5);
 892     pC0  = _mm_mul_ps(pC0,t2);
 893     pC1  = _mm_mul_ps(pC1,t2);
 894     pC0  = _mm_add_ps(pC0,CC4);
 895     pC1  = _mm_add_ps(pC1,CC3);
 896     pC0  = _mm_mul_ps(pC0,t2);
 897     pC1  = _mm_mul_ps(pC1,t2);
 898     pC0  = _mm_add_ps(pC0,CC2);
 899     pC1  = _mm_add_ps(pC1,CC1);
 900     pC0  = _mm_mul_ps(pC0,t2);
 901     pC1  = _mm_mul_ps(pC1,t);
 902     pC0  = _mm_add_ps(pC0,pC1);
 903     pC0  = _mm_add_ps(pC0,CC0);
 904     pC0  = _mm_mul_ps(pC0,t);
 905
 906     /* SELECT pB0 or pC0 for erfc() */
 907     mask = _mm_cmplt_ps(two,y);
 908     res_erfc = _mm_blendv_ps(pB0,pC0,mask);
 909     res_erfc = _mm_mul_ps(res_erfc,expmx2);
 910
 911     /* erfc(x<0) = 2-erfc(|x|) */
 912     mask = _mm_cmplt_ps(x,_mm_setzero_ps());
 913     res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);
 914
 915     /* Select erf() or erfc() */
 916     mask = _mm_cmplt_ps(y,_mm_set1_ps(0.75f));
 917     res  = _mm_blendv_ps(_mm_sub_ps(one,res_erfc),res_erf,mask);
 918
 919     return res;
 920 }
 921
 922
 923
 924
 925 /* FULL precision erfc(), 256 bit wide. Only errors in LSB */
 926 static __m256
 927 gmx_mm256_erfc_ps(__m256 x)
 928 {
 929     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 930     const __m256  CA6      = _mm256_set1_ps(7.853861353153693e-5f);
 931     const __m256  CA5      = _mm256_set1_ps(-8.010193625184903e-4f);
 932     const __m256  CA4      = _mm256_set1_ps(5.188327685732524e-3f);
 933     const __m256  CA3      = _mm256_set1_ps(-2.685381193529856e-2f);
 934     const __m256  CA2      = _mm256_set1_ps(1.128358514861418e-1f);
 935     const __m256  CA1      = _mm256_set1_ps(-3.761262582423300e-1f);
 936     const __m256  CA0      = _mm256_set1_ps(1.128379165726710f);
 937     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 938     const __m256  CB9      = _mm256_set1_ps(-0.0018629930017603923f);
 939     const __m256  CB8      = _mm256_set1_ps(0.003909821287598495f);
 940     const __m256  CB7      = _mm256_set1_ps(-0.0052094582210355615f);
 941     const __m256  CB6      = _mm256_set1_ps(0.005685614362160572f);
 942     const __m256  CB5      = _mm256_set1_ps(-0.0025367682853477272f);
 943     const __m256  CB4      = _mm256_set1_ps(-0.010199799682318782f);
 944     const __m256  CB3      = _mm256_set1_ps(0.04369575504816542f);
 945     const __m256  CB2      = _mm256_set1_ps(-0.11884063474674492f);
 946     const __m256  CB1      = _mm256_set1_ps(0.2732120154030589f);
 947     const __m256  CB0      = _mm256_set1_ps(0.42758357702025784f);
 948     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 949     const __m256  CC10     = _mm256_set1_ps(-0.0445555913112064f);
 950     const __m256  CC9      = _mm256_set1_ps(0.21376355144663348f);
 951     const __m256  CC8      = _mm256_set1_ps(-0.3473187200259257f);
 952     const __m256  CC7      = _mm256_set1_ps(0.016690861551248114f);
 953     const __m256  CC6      = _mm256_set1_ps(0.7560973182491192f);
 954     const __m256  CC5      = _mm256_set1_ps(-1.2137903600145787f);
 955     const __m256  CC4      = _mm256_set1_ps(0.8411872321232948f);
 956     const __m256  CC3      = _mm256_set1_ps(-0.08670413896296343f);
 957     const __m256  CC2      = _mm256_set1_ps(-0.27124782687240334f);
 958     const __m256  CC1      = _mm256_set1_ps(-0.0007502488047806069f);
 959     const __m256  CC0      = _mm256_set1_ps(0.5642114853803148f);
 960
 961     /* Coefficients for expansion of exp(x) in [0,0.1] */
 962     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
 963     const __m256  CD2      = _mm256_set1_ps(0.5000066608081202f);
 964     const __m256  CD3      = _mm256_set1_ps(0.1664795422874624f);
 965     const __m256  CD4      = _mm256_set1_ps(0.04379839977652482f);
 966
 967     const __m256  sieve    = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
 968     const __m256  signbit  = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
 969     const __m256  one      = _mm256_set1_ps(1.0f);
 970     const __m256  two      = _mm256_set1_ps(2.0f);
 971
 972     __m256 x2,x4,y;
 973     __m256 z,q,t,t2,w,w2;
 974     __m256 pA0,pA1,pB0,pB1,pC0,pC1;
 975     __m256 expmx2,corr;
 976     __m256 res_erf,res_erfc,res;
 977     __m256 mask;
 978
 979     /* Calculate erf() */
 980     x2     = _mm256_mul_ps(x,x);
 981     x4     = _mm256_mul_ps(x2,x2);
 982
 983     pA0  = _mm256_mul_ps(CA6,x4);
 984     pA1  = _mm256_mul_ps(CA5,x4);
 985     pA0  = _mm256_add_ps(pA0,CA4);
 986     pA1  = _mm256_add_ps(pA1,CA3);
 987     pA0  = _mm256_mul_ps(pA0,x4);
 988     pA1  = _mm256_mul_ps(pA1,x4);
 989     pA0  = _mm256_add_ps(pA0,CA2);
 990     pA1  = _mm256_add_ps(pA1,CA1);
 991     pA0  = _mm256_mul_ps(pA0,x4);
 992     pA1  = _mm256_mul_ps(pA1,x2);
 993     pA0  = _mm256_add_ps(pA0,pA1);
 994     pA0  = _mm256_add_ps(pA0,CA0);
 995
 996     res_erf = _mm256_mul_ps(x,pA0);
 997
 998     /* Calculate erfc */
 999     y       = gmx_mm256_abs_ps(x);
1000     t       = gmx_mm256_inv_ps(y);
1001     w       = _mm256_sub_ps(t,one);
1002     t2      = _mm256_mul_ps(t,t);
1003     w2      = _mm256_mul_ps(w,w);
1004     /*
1005      * We cannot simply calculate exp(-x2) directly in single precision, since
1006      * that will lose a couple of bits of precision due to the multiplication.
1007      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
1008      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
1009      *
1010      * The only drawback with this is that it requires TWO separate exponential
1011      * evaluations, which would be horrible performance-wise. However, the argument
1012      * for the second exp() call is always small, so there we simply use a
1013      * low-order minimax expansion on [0,0.1].
1014      */
1015
1016     z       = _mm256_and_ps(y,sieve);
1017     q       = _mm256_mul_ps( _mm256_sub_ps(z,y) , _mm256_add_ps(z,y) );
1018
1019     corr    = _mm256_mul_ps(CD4,q);
1020     corr    = _mm256_add_ps(corr,CD3);
1021     corr    = _mm256_mul_ps(corr,q);
1022     corr    = _mm256_add_ps(corr,CD2);
1023     corr    = _mm256_mul_ps(corr,q);
1024     corr    = _mm256_add_ps(corr,one);
1025     corr    = _mm256_mul_ps(corr,q);
1026     corr    = _mm256_add_ps(corr,one);
1027
1028     expmx2  = gmx_mm256_exp_ps( _mm256_or_ps( signbit , _mm256_mul_ps(z,z) ) );
1029     expmx2  = _mm256_mul_ps(expmx2,corr);
1030
1031     pB1  = _mm256_mul_ps(CB9,w2);
1032     pB0  = _mm256_mul_ps(CB8,w2);
1033     pB1  = _mm256_add_ps(pB1,CB7);
1034     pB0  = _mm256_add_ps(pB0,CB6);
1035     pB1  = _mm256_mul_ps(pB1,w2);
1036     pB0  = _mm256_mul_ps(pB0,w2);
1037     pB1  = _mm256_add_ps(pB1,CB5);
1038     pB0  = _mm256_add_ps(pB0,CB4);
1039     pB1  = _mm256_mul_ps(pB1,w2);
1040     pB0  = _mm256_mul_ps(pB0,w2);
1041     pB1  = _mm256_add_ps(pB1,CB3);
1042     pB0  = _mm256_add_ps(pB0,CB2);
1043     pB1  = _mm256_mul_ps(pB1,w2);
1044     pB0  = _mm256_mul_ps(pB0,w2);
1045     pB1  = _mm256_add_ps(pB1,CB1);
1046     pB1  = _mm256_mul_ps(pB1,w);
1047     pB0  = _mm256_add_ps(pB0,pB1);
1048     pB0  = _mm256_add_ps(pB0,CB0);
1049
1050     pC0  = _mm256_mul_ps(CC10,t2);
1051     pC1  = _mm256_mul_ps(CC9,t2);
1052     pC0  = _mm256_add_ps(pC0,CC8);
1053     pC1  = _mm256_add_ps(pC1,CC7);
1054     pC0  = _mm256_mul_ps(pC0,t2);
1055     pC1  = _mm256_mul_ps(pC1,t2);
1056     pC0  = _mm256_add_ps(pC0,CC6);
1057     pC1  = _mm256_add_ps(pC1,CC5);
1058     pC0  = _mm256_mul_ps(pC0,t2);
1059     pC1  = _mm256_mul_ps(pC1,t2);
1060     pC0  = _mm256_add_ps(pC0,CC4);
1061     pC1  = _mm256_add_ps(pC1,CC3);
1062     pC0  = _mm256_mul_ps(pC0,t2);
1063     pC1  = _mm256_mul_ps(pC1,t2);
1064     pC0  = _mm256_add_ps(pC0,CC2);
1065     pC1  = _mm256_add_ps(pC1,CC1);
1066     pC0  = _mm256_mul_ps(pC0,t2);
1067     pC1  = _mm256_mul_ps(pC1,t);
1068     pC0  = _mm256_add_ps(pC0,pC1);
1069     pC0  = _mm256_add_ps(pC0,CC0);
1070     pC0  = _mm256_mul_ps(pC0,t);
1071
1072     /* SELECT pB0 or pC0 for erfc() */
1073     mask = _mm256_cmp_ps(two,y,_CMP_LT_OQ);
1074     res_erfc = _mm256_blendv_ps(pB0,pC0,mask);
1075     res_erfc = _mm256_mul_ps(res_erfc,expmx2);
1076
1077     /* erfc(x<0) = 2-erfc(|x|) */
1078     mask = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_LT_OQ);
1079     res_erfc = _mm256_blendv_ps(res_erfc,_mm256_sub_ps(two,res_erfc),mask);
1080
1081     /* Select erf() or erfc() */
1082     mask = _mm256_cmp_ps(y,_mm256_set1_ps(0.75f),_CMP_LT_OQ);
1083     res  = _mm256_blendv_ps(res_erfc,_mm256_sub_ps(one,res_erf),mask);
1084
1085     return res;
1086 }
1087
1088
1089 /* erfc(), 128 bit wide */
1090 static __m128
1091 gmx_mm_erfc_ps(__m128 x)
1092 {
1093     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
1094     const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
1095     const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
1096     const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
1097     const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
1098     const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
1099     const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
1100     const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
1101     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
1102     const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
1103     const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
1104     const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
1105     const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
1106     const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
1107     const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
1108     const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
1109     const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
1110     const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
1111     const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
1112     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
1113     const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
1114     const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
1115     const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
1116     const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
1117     const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
1118     const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
1119     const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
1120     const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
1121     const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
1122     const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
1123     const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
1124
1125     /* Coefficients for expansion of exp(x) in [0,0.1] */
1126     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
1127     const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
1128     const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
1129     const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
1130
1131     const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
1132     const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1133     const __m128  one      = _mm_set1_ps(1.0f);
1134     const __m128  two      = _mm_set1_ps(2.0f);
1135
1136     __m128 x2,x4,y;
1137     __m128 z,q,t,t2,w,w2;
1138     __m128 pA0,pA1,pB0,pB1,pC0,pC1;
1139     __m128 expmx2,corr;
1140     __m128 res_erf,res_erfc,res;
1141     __m128 mask;
1142
1143     /* Calculate erf() */
1144     x2     = _mm_mul_ps(x,x);
1145     x4     = _mm_mul_ps(x2,x2);
1146
1147     pA0  = _mm_mul_ps(CA6,x4);
1148     pA1  = _mm_mul_ps(CA5,x4);
1149     pA0  = _mm_add_ps(pA0,CA4);
1150     pA1  = _mm_add_ps(pA1,CA3);
1151     pA0  = _mm_mul_ps(pA0,x4);
1152     pA1  = _mm_mul_ps(pA1,x4);
1153     pA0  = _mm_add_ps(pA0,CA2);
1154     pA1  = _mm_add_ps(pA1,CA1);
1155     pA0  = _mm_mul_ps(pA0,x4);
1156     pA1  = _mm_mul_ps(pA1,x2);
1157     pA0  = _mm_add_ps(pA0,pA1);
1158     pA0  = _mm_add_ps(pA0,CA0);
1159
1160     res_erf = _mm_mul_ps(x,pA0);
1161
1162     /* Calculate erfc */
1163     y       = gmx_mm_abs_ps(x);
1164     t       = gmx_mm_inv_ps(y);
1165     w       = _mm_sub_ps(t,one);
1166     t2      = _mm_mul_ps(t,t);
1167     w2      = _mm_mul_ps(w,w);
1168     /*
1169      * We cannot simply calculate exp(-x2) directly in single precision, since
1170      * that will lose a couple of bits of precision due to the multiplication.
1171      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
1172      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
1173      *
1174      * The only drawback with this is that it requires TWO separate exponential
1175      * evaluations, which would be horrible performance-wise. However, the argument
1176      * for the second exp() call is always small, so there we simply use a
1177      * low-order minimax expansion on [0,0.1].
1178      */
1179
1180     z       = _mm_and_ps(y,sieve);
1181     q       = _mm_mul_ps( _mm_sub_ps(z,y) , _mm_add_ps(z,y) );
1182
1183     corr    = _mm_mul_ps(CD4,q);
1184     corr    = _mm_add_ps(corr,CD3);
1185     corr    = _mm_mul_ps(corr,q);
1186     corr    = _mm_add_ps(corr,CD2);
1187     corr    = _mm_mul_ps(corr,q);
1188     corr    = _mm_add_ps(corr,one);
1189     corr    = _mm_mul_ps(corr,q);
1190     corr    = _mm_add_ps(corr,one);
1191
1192     expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit , _mm_mul_ps(z,z) ) );
1193     expmx2  = _mm_mul_ps(expmx2,corr);
1194
1195     pB1  = _mm_mul_ps(CB9,w2);
1196     pB0  = _mm_mul_ps(CB8,w2);
1197     pB1  = _mm_add_ps(pB1,CB7);
1198     pB0  = _mm_add_ps(pB0,CB6);
1199     pB1  = _mm_mul_ps(pB1,w2);
1200     pB0  = _mm_mul_ps(pB0,w2);
1201     pB1  = _mm_add_ps(pB1,CB5);
1202     pB0  = _mm_add_ps(pB0,CB4);
1203     pB1  = _mm_mul_ps(pB1,w2);
1204     pB0  = _mm_mul_ps(pB0,w2);
1205     pB1  = _mm_add_ps(pB1,CB3);
1206     pB0  = _mm_add_ps(pB0,CB2);
1207     pB1  = _mm_mul_ps(pB1,w2);
1208     pB0  = _mm_mul_ps(pB0,w2);
1209     pB1  = _mm_add_ps(pB1,CB1);
1210     pB1  = _mm_mul_ps(pB1,w);
1211     pB0  = _mm_add_ps(pB0,pB1);
1212     pB0  = _mm_add_ps(pB0,CB0);
1213
1214     pC0  = _mm_mul_ps(CC10,t2);
1215     pC1  = _mm_mul_ps(CC9,t2);
1216     pC0  = _mm_add_ps(pC0,CC8);
1217     pC1  = _mm_add_ps(pC1,CC7);
1218     pC0  = _mm_mul_ps(pC0,t2);
1219     pC1  = _mm_mul_ps(pC1,t2);
1220     pC0  = _mm_add_ps(pC0,CC6);
1221     pC1  = _mm_add_ps(pC1,CC5);
1222     pC0  = _mm_mul_ps(pC0,t2);
1223     pC1  = _mm_mul_ps(pC1,t2);
1224     pC0  = _mm_add_ps(pC0,CC4);
1225     pC1  = _mm_add_ps(pC1,CC3);
1226     pC0  = _mm_mul_ps(pC0,t2);
1227     pC1  = _mm_mul_ps(pC1,t2);
1228     pC0  = _mm_add_ps(pC0,CC2);
1229     pC1  = _mm_add_ps(pC1,CC1);
1230     pC0  = _mm_mul_ps(pC0,t2);
1231     pC1  = _mm_mul_ps(pC1,t);
1232     pC0  = _mm_add_ps(pC0,pC1);
1233     pC0  = _mm_add_ps(pC0,CC0);
1234     pC0  = _mm_mul_ps(pC0,t);
1235
1236     /* SELECT pB0 or pC0 for erfc() */
1237     mask = _mm_cmplt_ps(two,y);
1238     res_erfc = _mm_blendv_ps(pB0,pC0,mask);
1239     res_erfc = _mm_mul_ps(res_erfc,expmx2);
1240
1241     /* erfc(x<0) = 2-erfc(|x|) */
1242     mask = _mm_cmplt_ps(x,_mm_setzero_ps());
1243     res_erfc = _mm_blendv_ps(res_erfc,_mm_sub_ps(two,res_erfc),mask);
1244
1245     /* Select erf() or erfc() */
1246     mask = _mm_cmplt_ps(y,_mm_set1_ps(0.75f));
1247     res  = _mm_blendv_ps(res_erfc,_mm_sub_ps(one,res_erf),mask);
1248
1249     return res;
1250 }
1251
1252
1253
1254 /* Calculate the force correction due to PME analytically.
1255  *
1256  * This routine is meant to enable analytical evaluation of the
1257  * direct-space PME electrostatic force to avoid tables.
1258  *
1259  * The direct-space potential should be Erfc(beta*r)/r, but there
1260  * are some problems evaluating that:
1261  *
1262  * First, the error function is difficult (read: expensive) to
1263  * approxmiate accurately for intermediate to large arguments, and
1264  * this happens already in ranges of beta*r that occur in simulations.
1265  * Second, we now try to avoid calculating potentials in Gromacs but
1266  * use forces directly.
1267  *
1268  * We can simply things slight by noting that the PME part is really
1269  * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
1270  *
1271  * V= 1/r - Erf(beta*r)/r
1272  *
1273  * The first term we already have from the inverse square root, so
1274  * that we can leave out of this routine.
1275  *
1276  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1277  * the argument beta*r will be in the range 0.15 to ~4. Use your
1278  * favorite plotting program to realize how well-behaved Erf(z)/z is
1279  * in this range!
1280  *
1281  * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
1282  * However, it turns out it is more efficient to approximate f(z)/z and
1283  * then only use even powers. This is another minor optimization, since
1284  * we actually WANT f(z)/z, because it is going to be multiplied by
1285  * the vector between the two atoms to get the vectorial force. The
1286  * fastest flops are the ones we can avoid calculating!
1287  *
1288  * So, here's how it should be used:
1289  *
1290  * 1. Calculate r^2.
1291  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
1292  * 3. Evaluate this routine with z^2 as the argument.
1293  * 4. The return value is the expression:
1294  *
1295  *
1296  *       2*exp(-z^2)     erf(z)
1297  *       ------------ - --------
1298  *       sqrt(Pi)*z^2      z^3
1299  *
1300  * 5. Multiply the entire expression by beta^3. This will get you
1301  *
1302  *       beta^3*2*exp(-z^2)     beta^3*erf(z)
1303  *       ------------------  - ---------------
1304  *          sqrt(Pi)*z^2            z^3
1305  *
1306  *    or, switching back to r (z=r*beta):
1307  *
1308  *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
1309  *       ----------------------- - -----------
1310  *            sqrt(Pi)*r^2            r^3
1311  *
1312  *
1313  *    With a bit of math exercise you should be able to confirm that
1314  *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
1315  *
1316  * 6. Add the result to 1/r^3, multiply by the product of the charges,
1317  *    and you have your force (divided by r). A final multiplication
1318  *    with the vector connecting the two particles and you have your
1319  *    vectorial force to add to the particles.
1320  *
1321  */
1322 static __m256
1323 gmx_mm256_pmecorrF_ps(__m256 z2)
1324 {
1325     const __m256  FN6      = _mm256_set1_ps(-1.7357322914161492954e-8f);
1326     const __m256  FN5      = _mm256_set1_ps(1.4703624142580877519e-6f);
1327     const __m256  FN4      = _mm256_set1_ps(-0.000053401640219807709149f);
1328     const __m256  FN3      = _mm256_set1_ps(0.0010054721316683106153f);
1329     const __m256  FN2      = _mm256_set1_ps(-0.019278317264888380590f);
1330     const __m256  FN1      = _mm256_set1_ps(0.069670166153766424023f);
1331     const __m256  FN0      = _mm256_set1_ps(-0.75225204789749321333f);
1332
1333     const __m256  FD4      = _mm256_set1_ps(0.0011193462567257629232f);
1334     const __m256  FD3      = _mm256_set1_ps(0.014866955030185295499f);
1335     const __m256  FD2      = _mm256_set1_ps(0.11583842382862377919f);
1336     const __m256  FD1      = _mm256_set1_ps(0.50736591960530292870f);
1337     const __m256  FD0      = _mm256_set1_ps(1.0f);
1338
1339     __m256 z4;
1340     __m256 polyFN0,polyFN1,polyFD0,polyFD1;
1341
1342     z4             = _mm256_mul_ps(z2,z2);
1343
1344     polyFD0        = _mm256_mul_ps(FD4,z4);
1345     polyFD1        = _mm256_mul_ps(FD3,z4);
1346     polyFD0        = _mm256_add_ps(polyFD0,FD2);
1347     polyFD1        = _mm256_add_ps(polyFD1,FD1);
1348     polyFD0        = _mm256_mul_ps(polyFD0,z4);
1349     polyFD1        = _mm256_mul_ps(polyFD1,z2);
1350     polyFD0        = _mm256_add_ps(polyFD0,FD0);
1351     polyFD0        = _mm256_add_ps(polyFD0,polyFD1);
1352
1353     polyFD0        = gmx_mm256_inv_ps(polyFD0);
1354
1355     polyFN0        = _mm256_mul_ps(FN6,z4);
1356     polyFN1        = _mm256_mul_ps(FN5,z4);
1357     polyFN0        = _mm256_add_ps(polyFN0,FN4);
1358     polyFN1        = _mm256_add_ps(polyFN1,FN3);
1359     polyFN0        = _mm256_mul_ps(polyFN0,z4);
1360     polyFN1        = _mm256_mul_ps(polyFN1,z4);
1361     polyFN0        = _mm256_add_ps(polyFN0,FN2);
1362     polyFN1        = _mm256_add_ps(polyFN1,FN1);
1363     polyFN0        = _mm256_mul_ps(polyFN0,z4);
1364     polyFN1        = _mm256_mul_ps(polyFN1,z2);
1365     polyFN0        = _mm256_add_ps(polyFN0,FN0);
1366     polyFN0        = _mm256_add_ps(polyFN0,polyFN1);
1367
1368     return   _mm256_mul_ps(polyFN0,polyFD0);
1369 }
1370
1371
1372 static __m128
1373 gmx_mm_pmecorrF_ps(__m128 z2)
1374 {
1375     const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
1376     const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
1377     const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
1378     const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
1379     const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
1380     const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
1381     const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);
1382
1383     const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
1384     const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
1385     const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
1386     const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
1387     const __m128  FD0      = _mm_set1_ps(1.0f);
1388
1389     __m128 z4;
1390     __m128 polyFN0,polyFN1,polyFD0,polyFD1;
1391
1392     z4             = _mm_mul_ps(z2,z2);
1393
1394     polyFD0        = _mm_mul_ps(FD4,z4);
1395     polyFD1        = _mm_mul_ps(FD3,z4);
1396     polyFD0        = _mm_add_ps(polyFD0,FD2);
1397     polyFD1        = _mm_add_ps(polyFD1,FD1);
1398     polyFD0        = _mm_mul_ps(polyFD0,z4);
1399     polyFD1        = _mm_mul_ps(polyFD1,z2);
1400     polyFD0        = _mm_add_ps(polyFD0,FD0);
1401     polyFD0        = _mm_add_ps(polyFD0,polyFD1);
1402
1403     polyFD0        = gmx_mm_inv_ps(polyFD0);
1404
1405     polyFN0        = _mm_mul_ps(FN6,z4);
1406     polyFN1        = _mm_mul_ps(FN5,z4);
1407     polyFN0        = _mm_add_ps(polyFN0,FN4);
1408     polyFN1        = _mm_add_ps(polyFN1,FN3);
1409     polyFN0        = _mm_mul_ps(polyFN0,z4);
1410     polyFN1        = _mm_mul_ps(polyFN1,z4);
1411     polyFN0        = _mm_add_ps(polyFN0,FN2);
1412     polyFN1        = _mm_add_ps(polyFN1,FN1);
1413     polyFN0        = _mm_mul_ps(polyFN0,z4);
1414     polyFN1        = _mm_mul_ps(polyFN1,z2);
1415     polyFN0        = _mm_add_ps(polyFN0,FN0);
1416     polyFN0        = _mm_add_ps(polyFN0,polyFN1);
1417
1418     return   _mm_mul_ps(polyFN0,polyFD0);
1419 }
1420
1421
1422
1423 /* Calculate the potential correction due to PME analytically.
1424  *
1425  * See gmx_mm256_pmecorrF_ps() for details about the approximation.
1426  *
1427  * This routine calculates Erf(z)/z, although you should provide z^2
1428  * as the input argument.
1429  *
1430  * Here's how it should be used:
1431  *
1432  * 1. Calculate r^2.
1433  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
1434  * 3. Evaluate this routine with z^2 as the argument.
1435  * 4. The return value is the expression:
1436  *
1437  *
1438  *        erf(z)
1439  *       --------
1440  *          z
1441  *
1442  * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
1443  *
1444  *       erf(r*beta)
1445  *       -----------
1446  *           r
1447  *
1448  * 6. Subtract the result from 1/r, multiply by the product of the charges,
1449  *    and you have your potential.
1450  */
1451 static __m256
1452 gmx_mm256_pmecorrV_ps(__m256 z2)
1453 {
1454     const __m256  VN6      = _mm256_set1_ps(1.9296833005951166339e-8f);
1455     const __m256  VN5      = _mm256_set1_ps(-1.4213390571557850962e-6f);
1456     const __m256  VN4      = _mm256_set1_ps(0.000041603292906656984871f);
1457     const __m256  VN3      = _mm256_set1_ps(-0.00013134036773265025626f);
1458     const __m256  VN2      = _mm256_set1_ps(0.038657983986041781264f);
1459     const __m256  VN1      = _mm256_set1_ps(0.11285044772717598220f);
1460     const __m256  VN0      = _mm256_set1_ps(1.1283802385263030286f);
1461
1462     const __m256  VD3      = _mm256_set1_ps(0.0066752224023576045451f);
1463     const __m256  VD2      = _mm256_set1_ps(0.078647795836373922256f);
1464     const __m256  VD1      = _mm256_set1_ps(0.43336185284710920150f);
1465     const __m256  VD0      = _mm256_set1_ps(1.0f);
1466
1467     __m256 z4;
1468     __m256 polyVN0,polyVN1,polyVD0,polyVD1;
1469
1470     z4             = _mm256_mul_ps(z2,z2);
1471
1472     polyVD1        = _mm256_mul_ps(VD3,z4);
1473     polyVD0        = _mm256_mul_ps(VD2,z4);
1474     polyVD1        = _mm256_add_ps(polyVD1,VD1);
1475     polyVD0        = _mm256_add_ps(polyVD0,VD0);
1476     polyVD1        = _mm256_mul_ps(polyVD1,z2);
1477     polyVD0        = _mm256_add_ps(polyVD0,polyVD1);
1478
1479     polyVD0        = gmx_mm256_inv_ps(polyVD0);
1480
1481     polyVN0        = _mm256_mul_ps(VN6,z4);
1482     polyVN1        = _mm256_mul_ps(VN5,z4);
1483     polyVN0        = _mm256_add_ps(polyVN0,VN4);
1484     polyVN1        = _mm256_add_ps(polyVN1,VN3);
1485     polyVN0        = _mm256_mul_ps(polyVN0,z4);
1486     polyVN1        = _mm256_mul_ps(polyVN1,z4);
1487     polyVN0        = _mm256_add_ps(polyVN0,VN2);
1488     polyVN1        = _mm256_add_ps(polyVN1,VN1);
1489     polyVN0        = _mm256_mul_ps(polyVN0,z4);
1490     polyVN1        = _mm256_mul_ps(polyVN1,z2);
1491     polyVN0        = _mm256_add_ps(polyVN0,VN0);
1492     polyVN0        = _mm256_add_ps(polyVN0,polyVN1);
1493
1494     return   _mm256_mul_ps(polyVN0,polyVD0);
1495 }
1496
1497
1498 static __m128
1499 gmx_mm_pmecorrV_ps(__m128 z2)
1500 {
1501     const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
1502     const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
1503     const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
1504     const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
1505     const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
1506     const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
1507     const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);
1508
1509     const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
1510     const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
1511     const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
1512     const __m128  VD0      = _mm_set1_ps(1.0f);
1513
1514     __m128 z4;
1515     __m128 polyVN0,polyVN1,polyVD0,polyVD1;
1516
1517     z4             = _mm_mul_ps(z2,z2);
1518
1519     polyVD1        = _mm_mul_ps(VD3,z4);
1520     polyVD0        = _mm_mul_ps(VD2,z4);
1521     polyVD1        = _mm_add_ps(polyVD1,VD1);
1522     polyVD0        = _mm_add_ps(polyVD0,VD0);
1523     polyVD1        = _mm_mul_ps(polyVD1,z2);
1524     polyVD0        = _mm_add_ps(polyVD0,polyVD1);
1525
1526     polyVD0        = gmx_mm_inv_ps(polyVD0);
1527
1528     polyVN0        = _mm_mul_ps(VN6,z4);
1529     polyVN1        = _mm_mul_ps(VN5,z4);
1530     polyVN0        = _mm_add_ps(polyVN0,VN4);
1531     polyVN1        = _mm_add_ps(polyVN1,VN3);
1532     polyVN0        = _mm_mul_ps(polyVN0,z4);
1533     polyVN1        = _mm_mul_ps(polyVN1,z4);
1534     polyVN0        = _mm_add_ps(polyVN0,VN2);
1535     polyVN1        = _mm_add_ps(polyVN1,VN1);
1536     polyVN0        = _mm_mul_ps(polyVN0,z4);
1537     polyVN1        = _mm_mul_ps(polyVN1,z2);
1538     polyVN0        = _mm_add_ps(polyVN0,VN0);
1539     polyVN0        = _mm_add_ps(polyVN0,polyVN1);
1540
1541     return   _mm_mul_ps(polyVN0,polyVD0);
1542 }
1543
1544
1545 static int
1546 gmx_mm256_sincos_ps(__m256 x,
1547                     __m256 *sinval,
1548                     __m256 *cosval)
1549 {
1550     const __m256 two_over_pi = _mm256_set1_ps(2.0f/(float)M_PI);
1551     const __m256 half        = _mm256_set1_ps(0.5f);
1552     const __m256 one         = _mm256_set1_ps(1.0f);
1553     const __m256 zero        = _mm256_setzero_ps();
1554
1555     const __m128i ione       = _mm_set1_epi32(1);
1556
1557     const __m256 mask_one    = _mm256_castsi256_ps(_mm256_set1_epi32(1));
1558     const __m256 mask_two    = _mm256_castsi256_ps(_mm256_set1_epi32(2));
1559     const __m256 mask_three  = _mm256_castsi256_ps(_mm256_set1_epi32(3));
1560
1561     const __m256 CA1         = _mm256_set1_ps(1.5703125f);
1562     const __m256 CA2         = _mm256_set1_ps(4.837512969970703125e-4f);
1563     const __m256 CA3         = _mm256_set1_ps(7.54978995489188216e-8f);
1564
1565     const __m256 CC0         = _mm256_set1_ps(-0.0013602249f);
1566     const __m256 CC1         = _mm256_set1_ps(0.0416566950f);
1567     const __m256 CC2         = _mm256_set1_ps(-0.4999990225f);
1568     const __m256 CS0         = _mm256_set1_ps(-0.0001950727f);
1569     const __m256 CS1         = _mm256_set1_ps(0.0083320758f);
1570     const __m256 CS2         = _mm256_set1_ps(-0.1666665247f);
1571
1572     const __m256  signbit    = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1573
1574     __m256 y,y2;
1575     __m256 z;
1576     __m256i iz;
1577     __m128i iz_high,iz_low;
1578     __m256 offset_sin,offset_cos;
1579     __m256 mask_sin,mask_cos;
1580     __m256 tmp1,tmp2;
1581     __m256 tmp_sin,tmp_cos;
1582
1583     y               = _mm256_mul_ps(x,two_over_pi);
1584     y               = _mm256_add_ps(y,_mm256_or_ps(_mm256_and_ps(y,signbit),half));
1585
1586     iz              = _mm256_cvttps_epi32(y);
1587     z               = _mm256_round_ps(y,_MM_FROUND_TO_ZERO);
1588
1589     offset_sin      = _mm256_and_ps(_mm256_castsi256_ps(iz),mask_three);
1590
1591     iz_high         = _mm256_extractf128_si256(iz,0x1);
1592     iz_low          = _mm256_castsi256_si128(iz);
1593     iz_low          = _mm_add_epi32(iz_low,ione);
1594     iz_high         = _mm_add_epi32(iz_high,ione);
1595     iz              = _mm256_castsi128_si256(iz_low);
1596     iz              = _mm256_insertf128_si256(iz,iz_high,0x1);
1597     offset_cos      = _mm256_castsi256_ps(iz);
1598
1599     /* Extended precision arithmethic to achieve full precision */
1600     y               = _mm256_mul_ps(z,CA1);
1601     tmp1            = _mm256_mul_ps(z,CA2);
1602     tmp2            = _mm256_mul_ps(z,CA3);
1603     y               = _mm256_sub_ps(x,y);
1604     y               = _mm256_sub_ps(y,tmp1);
1605     y               = _mm256_sub_ps(y,tmp2);
1606
1607     y2              = _mm256_mul_ps(y,y);
1608
1609     tmp1            = _mm256_mul_ps(CC0,y2);
1610     tmp1            = _mm256_add_ps(tmp1,CC1);
1611     tmp2            = _mm256_mul_ps(CS0,y2);
1612     tmp2            = _mm256_add_ps(tmp2,CS1);
1613     tmp1            = _mm256_mul_ps(tmp1,y2);
1614     tmp1            = _mm256_add_ps(tmp1,CC2);
1615     tmp2            = _mm256_mul_ps(tmp2,y2);
1616     tmp2            = _mm256_add_ps(tmp2,CS2);
1617
1618     tmp1            = _mm256_mul_ps(tmp1,y2);
1619     tmp1            = _mm256_add_ps(tmp1,one);
1620
1621     tmp2            = _mm256_mul_ps(tmp2,_mm256_mul_ps(y,y2));
1622     tmp2            = _mm256_add_ps(tmp2,y);
1623
1624 #ifdef __INTEL_COMPILER
1625     /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
1626     mask_sin        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin,mask_one))),zero,_CMP_EQ_OQ);
1627     mask_cos        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos,mask_one))),zero,_CMP_EQ_OQ);
1628 #else
1629     mask_sin        = _mm256_cmp_ps( _mm256_and_ps(offset_sin,mask_one), zero, _CMP_EQ_OQ);
1630     mask_cos        = _mm256_cmp_ps( _mm256_and_ps(offset_cos,mask_one), zero, _CMP_EQ_OQ);
1631 #endif
1632     tmp_sin         = _mm256_blendv_ps(tmp1,tmp2,mask_sin);
1633     tmp_cos         = _mm256_blendv_ps(tmp1,tmp2,mask_cos);
1634
1635     tmp1            = _mm256_xor_ps(signbit,tmp_sin);
1636     tmp2            = _mm256_xor_ps(signbit,tmp_cos);
1637
1638 #ifdef __INTEL_COMPILER
1639     /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
1640     mask_sin        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin,mask_two))),zero,_CMP_EQ_OQ);
1641     mask_cos        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos,mask_two))),zero,_CMP_EQ_OQ);
1642 #else
1643     mask_sin        = _mm256_cmp_ps( _mm256_and_ps(offset_sin,mask_two), zero, _CMP_EQ_OQ);
1644     mask_cos        = _mm256_cmp_ps( _mm256_and_ps(offset_cos,mask_two), zero, _CMP_EQ_OQ);
1645
1646 #endif
1647     *sinval         = _mm256_blendv_ps(tmp1,tmp_sin,mask_sin);
1648     *cosval         = _mm256_blendv_ps(tmp2,tmp_cos,mask_cos);
1649
1650     return 0;
1651 }
1652
1653 static int
1654 gmx_mm_sincos_ps(__m128 x,
1655                  __m128 *sinval,
1656                  __m128 *cosval)
1657 {
1658     const __m128 two_over_pi = _mm_set1_ps(2.0/M_PI);
1659     const __m128 half        = _mm_set1_ps(0.5);
1660     const __m128 one         = _mm_set1_ps(1.0);
1661
1662     const __m128i izero      = _mm_set1_epi32(0);
1663     const __m128i ione       = _mm_set1_epi32(1);
1664     const __m128i itwo       = _mm_set1_epi32(2);
1665     const __m128i ithree     = _mm_set1_epi32(3);
1666     const __m128  signbit    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1667
1668     const __m128 CA1         = _mm_set1_ps(1.5703125f);
1669     const __m128 CA2         = _mm_set1_ps(4.837512969970703125e-4f);
1670     const __m128 CA3         = _mm_set1_ps(7.54978995489188216e-8f);
1671
1672     const __m128 CC0         = _mm_set1_ps(-0.0013602249f);
1673     const __m128 CC1         = _mm_set1_ps(0.0416566950f);
1674     const __m128 CC2         = _mm_set1_ps(-0.4999990225f);
1675     const __m128 CS0         = _mm_set1_ps(-0.0001950727f);
1676     const __m128 CS1         = _mm_set1_ps(0.0083320758f);
1677     const __m128 CS2         = _mm_set1_ps(-0.1666665247f);
1678
1679     __m128  y,y2;
1680     __m128  z;
1681     __m128i iz;
1682     __m128i offset_sin,offset_cos;
1683     __m128  tmp1,tmp2;
1684     __m128  mask_sin,mask_cos;
1685     __m128  tmp_sin,tmp_cos;
1686
1687     y          = _mm_mul_ps(x,two_over_pi);
1688     y          = _mm_add_ps(y,_mm_or_ps(_mm_and_ps(y,signbit),half));
1689
1690     iz         = _mm_cvttps_epi32(y);
1691     z          = _mm_round_ps(y,_MM_FROUND_TO_ZERO);
1692
1693     offset_sin = _mm_and_si128(iz,ithree);
1694     offset_cos = _mm_add_epi32(iz,ione);
1695
1696     /* Extended precision arithmethic to achieve full precision */
1697     y               = _mm_mul_ps(z,CA1);
1698     tmp1            = _mm_mul_ps(z,CA2);
1699     tmp2            = _mm_mul_ps(z,CA3);
1700     y               = _mm_sub_ps(x,y);
1701     y               = _mm_sub_ps(y,tmp1);
1702     y               = _mm_sub_ps(y,tmp2);
1703
1704     y2              = _mm_mul_ps(y,y);
1705
1706     tmp1            = _mm_mul_ps(CC0,y2);
1707     tmp1            = _mm_add_ps(tmp1,CC1);
1708     tmp2            = _mm_mul_ps(CS0,y2);
1709     tmp2            = _mm_add_ps(tmp2,CS1);
1710     tmp1            = _mm_mul_ps(tmp1,y2);
1711     tmp1            = _mm_add_ps(tmp1,CC2);
1712     tmp2            = _mm_mul_ps(tmp2,y2);
1713     tmp2            = _mm_add_ps(tmp2,CS2);
1714
1715     tmp1            = _mm_mul_ps(tmp1,y2);
1716     tmp1            = _mm_add_ps(tmp1,one);
1717
1718     tmp2            = _mm_mul_ps(tmp2,_mm_mul_ps(y,y2));
1719     tmp2            = _mm_add_ps(tmp2,y);
1720
1721     mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,ione), izero));
1722     mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,ione), izero));
1723
1724     tmp_sin         = _mm_blendv_ps(tmp1,tmp2,mask_sin);
1725     tmp_cos         = _mm_blendv_ps(tmp1,tmp2,mask_cos);
1726
1727     mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin,itwo), izero));
1728     mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos,itwo), izero));
1729
1730     tmp1            = _mm_xor_ps(signbit,tmp_sin);
1731     tmp2            = _mm_xor_ps(signbit,tmp_cos);
1732
1733     *sinval         = _mm_blendv_ps(tmp1,tmp_sin,mask_sin);
1734     *cosval         = _mm_blendv_ps(tmp2,tmp_cos,mask_cos);
1735
1736     return 0;
1737 }
1738
1739
1740
1741
1742 /*
1743  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
1744  * will then call the sincos() routine and waste a factor 2 in performance!
1745  */
1746 static __m256
1747 gmx_mm256_sin_ps(__m256 x)
1748 {
1749     __m256 s,c;
1750     gmx_mm256_sincos_ps(x,&s,&c);
1751     return s;
1752 }
1753
1754 static __m128
1755 gmx_mm_sin_ps(__m128 x)
1756 {
1757     __m128 s,c;
1758     gmx_mm_sincos_ps(x,&s,&c);
1759     return s;
1760 }
1761
1762
1763 /*
1764  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
1765  * will then call the sincos() routine and waste a factor 2 in performance!
1766  */
1767 static __m256
1768 gmx_mm256_cos_ps(__m256 x)
1769 {
1770     __m256 s,c;
1771     gmx_mm256_sincos_ps(x,&s,&c);
1772     return c;
1773 }
1774
1775 static __m128
1776 gmx_mm_cos_ps(__m128 x)
1777 {
1778     __m128 s,c;
1779     gmx_mm_sincos_ps(x,&s,&c);
1780     return c;
1781 }
1782
1783
1784 static __m256
1785 gmx_mm256_tan_ps(__m256 x)
1786 {
1787     __m256 sinval,cosval;
1788     __m256 tanval;
1789
1790     gmx_mm256_sincos_ps(x,&sinval,&cosval);
1791
1792     tanval = _mm256_mul_ps(sinval,gmx_mm256_inv_ps(cosval));
1793
1794     return tanval;
1795 }
1796
1797 static __m128
1798 gmx_mm_tan_ps(__m128 x)
1799 {
1800     __m128 sinval,cosval;
1801     __m128 tanval;
1802
1803     gmx_mm_sincos_ps(x,&sinval,&cosval);
1804
1805     tanval = _mm_mul_ps(sinval,gmx_mm_inv_ps(cosval));
1806
1807     return tanval;
1808 }
1809
1810
1811 static __m256
1812 gmx_mm256_asin_ps(__m256 x)
1813 {
1814     const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
1815     const __m256 limitlow  = _mm256_set1_ps(1e-4f);
1816     const __m256 half      = _mm256_set1_ps(0.5f);
1817     const __m256 one       = _mm256_set1_ps(1.0f);
1818     const __m256 halfpi    = _mm256_set1_ps((float)M_PI/2.0f);
1819
1820     const __m256 CC5        = _mm256_set1_ps(4.2163199048E-2f);
1821     const __m256 CC4        = _mm256_set1_ps(2.4181311049E-2f);
1822     const __m256 CC3        = _mm256_set1_ps(4.5470025998E-2f);
1823     const __m256 CC2        = _mm256_set1_ps(7.4953002686E-2f);
1824     const __m256 CC1        = _mm256_set1_ps(1.6666752422E-1f);
1825
1826     __m256 sign;
1827     __m256 mask;
1828     __m256 xabs;
1829     __m256 z,z1,z2,q,q1,q2;
1830     __m256 pA,pB;
1831
1832     sign  = _mm256_andnot_ps(signmask,x);
1833     xabs  = _mm256_and_ps(x,signmask);
1834
1835     mask  = _mm256_cmp_ps(xabs,half,_CMP_GT_OQ);
1836
1837     z1    = _mm256_mul_ps(half, _mm256_sub_ps(one,xabs));
1838     q1    = _mm256_mul_ps(z1,gmx_mm256_invsqrt_ps(z1));
1839     q1    = _mm256_andnot_ps(_mm256_cmp_ps(xabs,one,_CMP_EQ_OQ),q1);
1840
1841     q2    = xabs;
1842     z2    = _mm256_mul_ps(q2,q2);
1843
1844     z     = _mm256_blendv_ps(z2,z1,mask);
1845     q     = _mm256_blendv_ps(q2,q1,mask);
1846
1847     z2    = _mm256_mul_ps(z,z);
1848
1849     pA    = _mm256_mul_ps(CC5,z2);
1850     pB    = _mm256_mul_ps(CC4,z2);
1851
1852     pA    = _mm256_add_ps(pA,CC3);
1853     pB    = _mm256_add_ps(pB,CC2);
1854
1855     pA    = _mm256_mul_ps(pA,z2);
1856     pB    = _mm256_mul_ps(pB,z2);
1857
1858     pA    = _mm256_add_ps(pA,CC1);
1859     pA    = _mm256_mul_ps(pA,z);
1860
1861     z     = _mm256_add_ps(pA,pB);
1862     z     = _mm256_mul_ps(z,q);
1863     z     = _mm256_add_ps(z,q);
1864
1865     q2    = _mm256_sub_ps(halfpi,z);
1866     q2    = _mm256_sub_ps(q2,z);
1867
1868     z     = _mm256_blendv_ps(z,q2,mask);
1869
1870     mask  = _mm256_cmp_ps(xabs,limitlow,_CMP_GT_OQ);
1871     z     = _mm256_blendv_ps(xabs,z,mask);
1872
1873     z     = _mm256_xor_ps(z,sign);
1874
1875     return z;
1876 }
1877
1878 static __m128
1879 gmx_mm_asin_ps(__m128 x)
1880 {
1881     /* Same algorithm as cephes library */
1882     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
1883     const __m128 limitlow  = _mm_set1_ps(1e-4f);
1884     const __m128 half      = _mm_set1_ps(0.5f);
1885     const __m128 one       = _mm_set1_ps(1.0f);
1886     const __m128 halfpi    = _mm_set1_ps(M_PI/2.0f);
1887
1888     const __m128 CC5        = _mm_set1_ps(4.2163199048E-2f);
1889     const __m128 CC4        = _mm_set1_ps(2.4181311049E-2f);
1890     const __m128 CC3        = _mm_set1_ps(4.5470025998E-2f);
1891     const __m128 CC2        = _mm_set1_ps(7.4953002686E-2f);
1892     const __m128 CC1        = _mm_set1_ps(1.6666752422E-1f);
1893
1894     __m128 sign;
1895     __m128 mask;
1896     __m128 xabs;
1897     __m128 z,z1,z2,q,q1,q2;
1898     __m128 pA,pB;
1899
1900     sign  = _mm_andnot_ps(signmask,x);
1901     xabs  = _mm_and_ps(x,signmask);
1902
1903     mask  = _mm_cmpgt_ps(xabs,half);
1904
1905     z1    = _mm_mul_ps(half, _mm_sub_ps(one,xabs));
1906     q1    = _mm_mul_ps(z1,gmx_mm_invsqrt_ps(z1));
1907     q1    = _mm_andnot_ps(_mm_cmpeq_ps(xabs,one),q1);
1908
1909     q2    = xabs;
1910     z2    = _mm_mul_ps(q2,q2);
1911
1912     z     = _mm_or_ps( _mm_and_ps(mask,z1) , _mm_andnot_ps(mask,z2) );
1913     q     = _mm_or_ps( _mm_and_ps(mask,q1) , _mm_andnot_ps(mask,q2) );
1914
1915     z2    = _mm_mul_ps(z,z);
1916
1917     pA    = _mm_mul_ps(CC5,z2);
1918     pB    = _mm_mul_ps(CC4,z2);
1919
1920     pA    = _mm_add_ps(pA,CC3);
1921     pB    = _mm_add_ps(pB,CC2);
1922
1923     pA    = _mm_mul_ps(pA,z2);
1924     pB    = _mm_mul_ps(pB,z2);
1925
1926     pA    = _mm_add_ps(pA,CC1);
1927     pA    = _mm_mul_ps(pA,z);
1928
1929     z     = _mm_add_ps(pA,pB);
1930     z     = _mm_mul_ps(z,q);
1931     z     = _mm_add_ps(z,q);
1932
1933     q2    = _mm_sub_ps(halfpi,z);
1934     q2    = _mm_sub_ps(q2,z);
1935
1936     z     = _mm_or_ps( _mm_and_ps(mask,q2) , _mm_andnot_ps(mask,z) );
1937
1938     mask  = _mm_cmpgt_ps(xabs,limitlow);
1939     z     = _mm_or_ps( _mm_and_ps(mask,z) , _mm_andnot_ps(mask,xabs) );
1940
1941     z = _mm_xor_ps(z,sign);
1942
1943     return z;
1944 }
1945
1946
1947 static __m256
1948 gmx_mm256_acos_ps(__m256 x)
1949 {
1950     const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
1951     const __m256 one_ps    = _mm256_set1_ps(1.0f);
1952     const __m256 half_ps   = _mm256_set1_ps(0.5f);
1953     const __m256 pi_ps     = _mm256_set1_ps((float)M_PI);
1954     const __m256 halfpi_ps = _mm256_set1_ps((float)M_PI/2.0f);
1955
1956     __m256 mask1;
1957     __m256 mask2;
1958     __m256 xabs;
1959     __m256 z,z1,z2,z3;
1960
1961     xabs  = _mm256_and_ps(x,signmask);
1962     mask1 = _mm256_cmp_ps(xabs,half_ps,_CMP_GT_OQ);
1963     mask2 = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_GT_OQ);
1964
1965     z     = _mm256_mul_ps(half_ps,_mm256_sub_ps(one_ps,xabs));
1966     z     = _mm256_mul_ps(z,gmx_mm256_invsqrt_ps(z));
1967     z     = _mm256_andnot_ps(_mm256_cmp_ps(xabs,one_ps,_CMP_EQ_OQ),z);
1968
1969     z     = _mm256_blendv_ps(x,z,mask1);
1970     z     = gmx_mm256_asin_ps(z);
1971
1972     z2    = _mm256_add_ps(z,z);
1973     z1    = _mm256_sub_ps(pi_ps,z2);
1974     z3    = _mm256_sub_ps(halfpi_ps,z);
1975
1976     z     = _mm256_blendv_ps(z1,z2,mask2);
1977     z     = _mm256_blendv_ps(z3,z,mask1);
1978
1979     return z;
1980 }
1981
1982 static __m128
1983 gmx_mm_acos_ps(__m128 x)
1984 {
1985     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
1986     const __m128 one_ps    = _mm_set1_ps(1.0f);
1987     const __m128 half_ps   = _mm_set1_ps(0.5f);
1988     const __m128 pi_ps     = _mm_set1_ps(M_PI);
1989     const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
1990
1991     __m128 mask1;
1992     __m128 mask2;
1993     __m128 xabs;
1994     __m128 z,z1,z2,z3;
1995
1996     xabs  = _mm_and_ps(x,signmask);
1997     mask1 = _mm_cmpgt_ps(xabs,half_ps);
1998     mask2 = _mm_cmpgt_ps(x,_mm_setzero_ps());
1999
2000     z     = _mm_mul_ps(half_ps,_mm_sub_ps(one_ps,xabs));
2001     z     = _mm_mul_ps(z,gmx_mm_invsqrt_ps(z));
2002     z     = _mm_andnot_ps(_mm_cmpeq_ps(xabs,one_ps),z);
2003
2004     z     = _mm_blendv_ps(x,z,mask1);
2005     z     = gmx_mm_asin_ps(z);
2006
2007     z2    = _mm_add_ps(z,z);
2008     z1    = _mm_sub_ps(pi_ps,z2);
2009     z3    = _mm_sub_ps(halfpi_ps,z);
2010
2011     z     = _mm_blendv_ps(z1,z2,mask2);
2012     z     = _mm_blendv_ps(z3,z,mask1);
2013
2014     return z;
2015 }
2016
2017
2018 static __m256
2019 gmx_mm256_atan_ps(__m256 x)
2020 {
2021     const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
2022     const __m256 limit1    = _mm256_set1_ps(0.414213562373095f);
2023     const __m256 limit2    = _mm256_set1_ps(2.414213562373095f);
2024     const __m256 quarterpi = _mm256_set1_ps(0.785398163397448f);
2025     const __m256 halfpi    = _mm256_set1_ps(1.570796326794896f);
2026     const __m256 mone      = _mm256_set1_ps(-1.0f);
2027     const __m256 CC3       = _mm256_set1_ps(-3.33329491539E-1f);
2028     const __m256 CC5       = _mm256_set1_ps(1.99777106478E-1f);
2029     const __m256 CC7       = _mm256_set1_ps(-1.38776856032E-1);
2030     const __m256 CC9       = _mm256_set1_ps(8.05374449538e-2f);
2031
2032     __m256 sign;
2033     __m256 mask1,mask2;
2034     __m256 y,z1,z2;
2035     __m256 x2,x4;
2036     __m256 sum1,sum2;
2037
2038     sign  = _mm256_andnot_ps(signmask,x);
2039     x     = _mm256_and_ps(x,signmask);
2040
2041     mask1 = _mm256_cmp_ps(x,limit1,_CMP_GT_OQ);
2042     mask2 = _mm256_cmp_ps(x,limit2,_CMP_GT_OQ);
2043
2044     z1    = _mm256_mul_ps(_mm256_add_ps(x,mone),gmx_mm256_inv_ps(_mm256_sub_ps(x,mone)));
2045     z2    = _mm256_mul_ps(mone,gmx_mm256_inv_ps(x));
2046
2047     y     = _mm256_and_ps(mask1,quarterpi);
2048     y     = _mm256_blendv_ps(y,halfpi,mask2);
2049
2050     x     = _mm256_blendv_ps(x,z1,mask1);
2051     x     = _mm256_blendv_ps(x,z2,mask2);
2052
2053     x2    = _mm256_mul_ps(x,x);
2054     x4    = _mm256_mul_ps(x2,x2);
2055
2056     sum1  = _mm256_mul_ps(CC9,x4);
2057     sum2  = _mm256_mul_ps(CC7,x4);
2058     sum1  = _mm256_add_ps(sum1,CC5);
2059     sum2  = _mm256_add_ps(sum2,CC3);
2060     sum1  = _mm256_mul_ps(sum1,x4);
2061     sum2  = _mm256_mul_ps(sum2,x2);
2062
2063     sum1  = _mm256_add_ps(sum1,sum2);
2064     sum1  = _mm256_sub_ps(sum1,mone);
2065     sum1  = _mm256_mul_ps(sum1,x);
2066     y     = _mm256_add_ps(y,sum1);
2067
2068     y     = _mm256_xor_ps(y,sign);
2069
2070     return y;
2071 }
2072
2073 static __m128
2074 gmx_mm_atan_ps(__m128 x)
2075 {
2076     /* Same algorithm as cephes library */
2077     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
2078     const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
2079     const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
2080     const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
2081     const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
2082     const __m128 mone      = _mm_set1_ps(-1.0f);
2083     const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
2084     const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
2085     const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1);
2086     const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);
2087
2088     __m128 sign;
2089     __m128 mask1,mask2;
2090     __m128 y,z1,z2;
2091     __m128 x2,x4;
2092     __m128 sum1,sum2;
2093
2094     sign  = _mm_andnot_ps(signmask,x);
2095     x     = _mm_and_ps(x,signmask);
2096
2097     mask1 = _mm_cmpgt_ps(x,limit1);
2098     mask2 = _mm_cmpgt_ps(x,limit2);
2099
2100     z1    = _mm_mul_ps(_mm_add_ps(x,mone),gmx_mm_inv_ps(_mm_sub_ps(x,mone)));
2101     z2    = _mm_mul_ps(mone,gmx_mm_inv_ps(x));
2102
2103     y     = _mm_and_ps(mask1,quarterpi);
2104     y     = _mm_blendv_ps(y,halfpi,mask2);
2105
2106     x     = _mm_blendv_ps(x,z1,mask1);
2107     x     = _mm_blendv_ps(x,z2,mask2);
2108
2109     x2    = _mm_mul_ps(x,x);
2110     x4    = _mm_mul_ps(x2,x2);
2111
2112     sum1  = _mm_mul_ps(CC9,x4);
2113     sum2  = _mm_mul_ps(CC7,x4);
2114     sum1  = _mm_add_ps(sum1,CC5);
2115     sum2  = _mm_add_ps(sum2,CC3);
2116     sum1  = _mm_mul_ps(sum1,x4);
2117     sum2  = _mm_mul_ps(sum2,x2);
2118
2119     sum1  = _mm_add_ps(sum1,sum2);
2120     sum1  = _mm_sub_ps(sum1,mone);
2121     sum1  = _mm_mul_ps(sum1,x);
2122     y     = _mm_add_ps(y,sum1);
2123
2124     y     = _mm_xor_ps(y,sign);
2125
2126     return y;
2127 }
2128
2129
2130 static __m256
2131 gmx_mm256_atan2_ps(__m256 y, __m256 x)
2132 {
2133     const __m256 pi          = _mm256_set1_ps( (float) M_PI);
2134     const __m256 minuspi     = _mm256_set1_ps( (float) -M_PI);
2135     const __m256 halfpi      = _mm256_set1_ps( (float) M_PI/2.0f);
2136     const __m256 minushalfpi = _mm256_set1_ps( (float) -M_PI/2.0f);
2137
2138     __m256 z,z1,z3,z4;
2139     __m256 w;
2140     __m256 maskx_lt,maskx_eq;
2141     __m256 masky_lt,masky_eq;
2142     __m256 mask1,mask2,mask3,mask4,maskall;
2143
2144     maskx_lt  = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_LT_OQ);
2145     masky_lt  = _mm256_cmp_ps(y,_mm256_setzero_ps(),_CMP_LT_OQ);
2146     maskx_eq  = _mm256_cmp_ps(x,_mm256_setzero_ps(),_CMP_EQ_OQ);
2147     masky_eq  = _mm256_cmp_ps(y,_mm256_setzero_ps(),_CMP_EQ_OQ);
2148
2149     z         = _mm256_mul_ps(y,gmx_mm256_inv_ps(x));
2150     z         = gmx_mm256_atan_ps(z);
2151
2152     mask1     = _mm256_and_ps(maskx_eq,masky_lt);
2153     mask2     = _mm256_andnot_ps(maskx_lt,masky_eq);
2154     mask3     = _mm256_andnot_ps( _mm256_or_ps(masky_lt,masky_eq) , maskx_eq);
2155     mask4     = _mm256_and_ps(maskx_lt,masky_eq);
2156     maskall   = _mm256_or_ps( _mm256_or_ps(mask1,mask2), _mm256_or_ps(mask3,mask4) );
2157
2158     z         = _mm256_andnot_ps(maskall,z);
2159     z1        = _mm256_and_ps(mask1,minushalfpi);
2160     z3        = _mm256_and_ps(mask3,halfpi);
2161     z4        = _mm256_and_ps(mask4,pi);
2162
2163     z         = _mm256_or_ps( _mm256_or_ps(z,z1), _mm256_or_ps(z3,z4) );
2164
2165     w         = _mm256_blendv_ps(pi,minuspi,masky_lt);
2166     w         = _mm256_and_ps(w,maskx_lt);
2167
2168     w         = _mm256_andnot_ps(maskall,w);
2169
2170     z         = _mm256_add_ps(z,w);
2171
2172     return z;
2173 }
2174
2175 static __m128
2176 gmx_mm_atan2_ps(__m128 y, __m128 x)
2177 {
2178     const __m128 pi          = _mm_set1_ps(M_PI);
2179     const __m128 minuspi     = _mm_set1_ps(-M_PI);
2180     const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
2181     const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
2182
2183     __m128 z,z1,z3,z4;
2184     __m128 w;
2185     __m128 maskx_lt,maskx_eq;
2186     __m128 masky_lt,masky_eq;
2187     __m128 mask1,mask2,mask3,mask4,maskall;
2188
2189     maskx_lt  = _mm_cmplt_ps(x,_mm_setzero_ps());
2190     masky_lt  = _mm_cmplt_ps(y,_mm_setzero_ps());
2191     maskx_eq  = _mm_cmpeq_ps(x,_mm_setzero_ps());
2192     masky_eq  = _mm_cmpeq_ps(y,_mm_setzero_ps());
2193
2194     z         = _mm_mul_ps(y,gmx_mm_inv_ps(x));
2195     z         = gmx_mm_atan_ps(z);
2196
2197     mask1     = _mm_and_ps(maskx_eq,masky_lt);
2198     mask2     = _mm_andnot_ps(maskx_lt,masky_eq);
2199     mask3     = _mm_andnot_ps( _mm_or_ps(masky_lt,masky_eq) , maskx_eq);
2200     mask4     = _mm_and_ps(masky_eq,maskx_lt);
2201
2202     maskall   = _mm_or_ps( _mm_or_ps(mask1,mask2), _mm_or_ps(mask3,mask4) );
2203
2204     z         = _mm_andnot_ps(maskall,z);
2205     z1        = _mm_and_ps(mask1,minushalfpi);
2206     z3        = _mm_and_ps(mask3,halfpi);
2207     z4        = _mm_and_ps(mask4,pi);
2208
2209     z         = _mm_or_ps( _mm_or_ps(z,z1), _mm_or_ps(z3,z4) );
2210
2211     mask1     = _mm_andnot_ps(masky_lt,maskx_lt);
2212     mask2     = _mm_and_ps(maskx_lt,masky_lt);
2213
2214     w         = _mm_or_ps( _mm_and_ps(mask1,pi), _mm_and_ps(mask2,minuspi) );
2215     w         = _mm_andnot_ps(maskall,w);
2216
2217     z         = _mm_add_ps(z,w);
2218
2219     return z;
2220 }
2221
2222 #endif /* _gmx_math_x86_avx_256_single_h_ */