src/gromacs/simd/math_x86_avx_256_single.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef GMX_SIMD_MATH_AVX_256_SINGLE_H
  36 #define GMX_SIMD_MATH_AVX_256_SINGLE_H
  37
  38 #include <math.h>
  39
  40 #include "general_x86_avx_256.h"
  41
  42 #ifndef M_PI
  43 #  define M_PI 3.14159265358979323846264338327950288
  44 #endif
  45
  46
  47
  48 /************************
  49  *                      *
  50  * Simple math routines *
  51  *                      *
  52  ************************/
  53
  54 /* 1.0/sqrt(x), 256-bit wide version */
  55 static gmx_inline __m256
  56 gmx_mm256_invsqrt_ps(__m256 x)
  57 {
  58     const __m256 half  = _mm256_set1_ps(0.5f);
  59     const __m256 three = _mm256_set1_ps(3.0f);
  60
  61     __m256       lu = _mm256_rsqrt_ps(x);
  62
  63     return _mm256_mul_ps(half, _mm256_mul_ps(_mm256_sub_ps(three, _mm256_mul_ps(_mm256_mul_ps(lu, lu), x)), lu));
  64 }
  65
  66 /* 1.0/sqrt(x), 128-bit wide version */
  67 static gmx_inline __m128
  68 gmx_mm_invsqrt_ps(__m128 x)
  69 {
  70     const __m128 half  = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
  71     const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);
  72
  73     __m128       lu = _mm_rsqrt_ps(x);
  74
  75     return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
  76 }
  77
  78
  79 /* sqrt(x) (256 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
  80 static gmx_inline __m256
  81 gmx_mm256_sqrt_ps(__m256 x)
  82 {
  83     __m256 mask;
  84     __m256 res;
  85
  86     mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
  87     res  = _mm256_andnot_ps(mask, gmx_mm256_invsqrt_ps(x));
  88
  89     res  = _mm256_mul_ps(x, res);
  90
  91     return res;
  92 }
  93
  94 /* sqrt(x) (128 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
  95 static gmx_inline __m128
  96 gmx_mm_sqrt_ps(__m128 x)
  97 {
  98     __m128 mask;
  99     __m128 res;
 100
 101     mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
 102     res  = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));
 103
 104     res  = _mm_mul_ps(x, res);
 105
 106     return res;
 107 }
 108
 109
 110 /* 1.0/x, 256-bit wide */
 111 static gmx_inline __m256
 112 gmx_mm256_inv_ps(__m256 x)
 113 {
 114     const __m256 two = _mm256_set1_ps(2.0f);
 115
 116     __m256       lu = _mm256_rcp_ps(x);
 117
 118     return _mm256_mul_ps(lu, _mm256_sub_ps(two, _mm256_mul_ps(lu, x)));
 119 }
 120
 121 /* 1.0/x, 128-bit wide */
 122 static gmx_inline __m128
 123 gmx_mm_inv_ps(__m128 x)
 124 {
 125     const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
 126
 127     __m128       lu = _mm_rcp_ps(x);
 128
 129     return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
 130 }
 131
 132
 133 static gmx_inline __m256
 134 gmx_mm256_abs_ps(__m256 x)
 135 {
 136     const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
 137
 138     return _mm256_and_ps(x, signmask);
 139 }
 140
 141 static gmx_inline __m128
 142 gmx_mm_abs_ps(__m128 x)
 143 {
 144     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
 145
 146     return _mm_and_ps(x, signmask);
 147 }
 148
 149
 150 static __m256
 151 gmx_mm256_log_ps(__m256 x)
 152 {
 153     const __m256  expmask    = _mm256_castsi256_ps( _mm256_set1_epi32(0x7F800000) );
 154     const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
 155     const __m256  half       = _mm256_set1_ps(0.5f);
 156     const __m256  one        = _mm256_set1_ps(1.0f);
 157     const __m256  invsq2     = _mm256_set1_ps(1.0f/sqrt(2.0f));
 158     const __m256  corr1      = _mm256_set1_ps(-2.12194440e-4f);
 159     const __m256  corr2      = _mm256_set1_ps(0.693359375f);
 160
 161     const __m256  CA_1        = _mm256_set1_ps(0.070376836292f);
 162     const __m256  CB_0        = _mm256_set1_ps(1.6714950086782716f);
 163     const __m256  CB_1        = _mm256_set1_ps(-2.452088066061482f);
 164     const __m256  CC_0        = _mm256_set1_ps(1.5220770854701728f);
 165     const __m256  CC_1        = _mm256_set1_ps(-1.3422238433233642f);
 166     const __m256  CD_0        = _mm256_set1_ps(1.386218787509749f);
 167     const __m256  CD_1        = _mm256_set1_ps(0.35075468953796346f);
 168     const __m256  CE_0        = _mm256_set1_ps(1.3429983063133937f);
 169     const __m256  CE_1        = _mm256_set1_ps(1.807420826584643f);
 170
 171     __m256        fexp;
 172     __m256i       iexp;
 173     __m128i       iexp128a, iexp128b;
 174     __m256        mask;
 175     __m256i       imask;
 176     __m128i       imask128a, imask128b;
 177     __m256        x2;
 178     __m256        y;
 179     __m256        pA, pB, pC, pD, pE, tB, tC, tD, tE;
 180
 181     /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
 182     fexp  = _mm256_and_ps(x, expmask);
 183     iexp  = _mm256_castps_si256(fexp);
 184
 185     iexp128b = _mm256_extractf128_si256(iexp, 0x1);
 186     iexp128a = _mm256_castsi256_si128(iexp);
 187
 188     iexp128a  = _mm_srli_epi32(iexp128a, 23);
 189     iexp128b  = _mm_srli_epi32(iexp128b, 23);
 190     iexp128a  = _mm_sub_epi32(iexp128a, expbase_m1);
 191     iexp128b  = _mm_sub_epi32(iexp128b, expbase_m1);
 192
 193     x     = _mm256_andnot_ps(expmask, x);
 194     x     = _mm256_or_ps(x, one);
 195     x     = _mm256_mul_ps(x, half);
 196
 197     mask  = _mm256_cmp_ps(x, invsq2, _CMP_LT_OQ);
 198
 199     x     = _mm256_add_ps(x, _mm256_and_ps(mask, x));
 200     x     = _mm256_sub_ps(x, one);
 201
 202     imask = _mm256_castps_si256(mask);
 203
 204     imask128b = _mm256_extractf128_si256(imask, 0x1);
 205     imask128a = _mm256_castsi256_si128(imask);
 206
 207     iexp128a  = _mm_add_epi32(iexp128a, imask128a);
 208     iexp128b  = _mm_add_epi32(iexp128b, imask128b);
 209
 210     iexp  = _mm256_castsi128_si256(iexp128a);
 211     iexp  = _mm256_insertf128_si256(iexp, iexp128b, 0x1);
 212
 213     x2    = _mm256_mul_ps(x, x);
 214
 215     pA    = _mm256_mul_ps(CA_1, x);
 216     pB    = _mm256_mul_ps(CB_1, x);
 217     pC    = _mm256_mul_ps(CC_1, x);
 218     pD    = _mm256_mul_ps(CD_1, x);
 219     pE    = _mm256_mul_ps(CE_1, x);
 220     tB    = _mm256_add_ps(CB_0, x2);
 221     tC    = _mm256_add_ps(CC_0, x2);
 222     tD    = _mm256_add_ps(CD_0, x2);
 223     tE    = _mm256_add_ps(CE_0, x2);
 224     pB    = _mm256_add_ps(pB, tB);
 225     pC    = _mm256_add_ps(pC, tC);
 226     pD    = _mm256_add_ps(pD, tD);
 227     pE    = _mm256_add_ps(pE, tE);
 228
 229     pA    = _mm256_mul_ps(pA, pB);
 230     pC    = _mm256_mul_ps(pC, pD);
 231     pE    = _mm256_mul_ps(pE, x2);
 232     pA    = _mm256_mul_ps(pA, pC);
 233     y     = _mm256_mul_ps(pA, pE);
 234
 235     fexp  = _mm256_cvtepi32_ps(iexp);
 236     y     = _mm256_add_ps(y, _mm256_mul_ps(fexp, corr1));
 237
 238     y     = _mm256_sub_ps(y, _mm256_mul_ps(half, x2));
 239     x2    = _mm256_add_ps(x, y);
 240
 241     x2    = _mm256_add_ps(x2, _mm256_mul_ps(fexp, corr2));
 242
 243     return x2;
 244 }
 245
 246
 247 static __m128
 248 gmx_mm_log_ps(__m128 x)
 249 {
 250     /* Same algorithm as cephes library */
 251     const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
 252     const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
 253     const __m128  half       = _mm_set1_ps(0.5f);
 254     const __m128  one        = _mm_set1_ps(1.0f);
 255     const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
 256     const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
 257     const __m128  corr2      = _mm_set1_ps(0.693359375f);
 258
 259     const __m128  CA_1        = _mm_set1_ps(0.070376836292f);
 260     const __m128  CB_0        = _mm_set1_ps(1.6714950086782716f);
 261     const __m128  CB_1        = _mm_set1_ps(-2.452088066061482f);
 262     const __m128  CC_0        = _mm_set1_ps(1.5220770854701728f);
 263     const __m128  CC_1        = _mm_set1_ps(-1.3422238433233642f);
 264     const __m128  CD_0        = _mm_set1_ps(1.386218787509749f);
 265     const __m128  CD_1        = _mm_set1_ps(0.35075468953796346f);
 266     const __m128  CE_0        = _mm_set1_ps(1.3429983063133937f);
 267     const __m128  CE_1        = _mm_set1_ps(1.807420826584643f);
 268
 269     __m128        fexp;
 270     __m128i       iexp;
 271     __m128        mask;
 272     __m128        x2;
 273     __m128        y;
 274     __m128        pA, pB, pC, pD, pE, tB, tC, tD, tE;
 275
 276     /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
 277     fexp  = _mm_and_ps(x, expmask);
 278     iexp  = gmx_mm_castps_si128(fexp);
 279     iexp  = _mm_srli_epi32(iexp, 23);
 280     iexp  = _mm_sub_epi32(iexp, expbase_m1);
 281
 282     x     = _mm_andnot_ps(expmask, x);
 283     x     = _mm_or_ps(x, one);
 284     x     = _mm_mul_ps(x, half);
 285
 286     mask  = _mm_cmplt_ps(x, invsq2);
 287
 288     x     = _mm_add_ps(x, _mm_and_ps(mask, x));
 289     x     = _mm_sub_ps(x, one);
 290     iexp  = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
 291
 292     x2    = _mm_mul_ps(x, x);
 293
 294     pA    = _mm_mul_ps(CA_1, x);
 295     pB    = _mm_mul_ps(CB_1, x);
 296     pC    = _mm_mul_ps(CC_1, x);
 297     pD    = _mm_mul_ps(CD_1, x);
 298     pE    = _mm_mul_ps(CE_1, x);
 299     tB    = _mm_add_ps(CB_0, x2);
 300     tC    = _mm_add_ps(CC_0, x2);
 301     tD    = _mm_add_ps(CD_0, x2);
 302     tE    = _mm_add_ps(CE_0, x2);
 303     pB    = _mm_add_ps(pB, tB);
 304     pC    = _mm_add_ps(pC, tC);
 305     pD    = _mm_add_ps(pD, tD);
 306     pE    = _mm_add_ps(pE, tE);
 307
 308     pA    = _mm_mul_ps(pA, pB);
 309     pC    = _mm_mul_ps(pC, pD);
 310     pE    = _mm_mul_ps(pE, x2);
 311     pA    = _mm_mul_ps(pA, pC);
 312     y     = _mm_mul_ps(pA, pE);
 313
 314     fexp  = _mm_cvtepi32_ps(iexp);
 315     y     = _mm_add_ps(y, _mm_mul_ps(fexp, corr1));
 316
 317     y     = _mm_sub_ps(y, _mm_mul_ps(half, x2));
 318     x2    = _mm_add_ps(x, y);
 319
 320     x2    = _mm_add_ps(x2, _mm_mul_ps(fexp, corr2));
 321
 322     return x2;
 323 }
 324
 325
 326 /*
 327  * 2^x function, 256-bit wide
 328  *
 329  * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
 330  * [-0.5,0.5]. The coefficiencts of this was derived in Mathematica using the command:
 331  *
 332  * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 333  *
 334  * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 335  * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 336  * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 337  * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 338  * number instead. That would take a few extra cycles and not really help, since something is
 339  * wrong if you are using single precision to work with numbers that cannot really be represented
 340  * in single precision.
 341  *
 342  * The accuracy is at least 23 bits.
 343  */
 344 static __m256
 345 gmx_mm256_exp2_ps(__m256 x)
 346 {
 347     /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
 348     const __m256  arglimit = _mm256_set1_ps(126.0f);
 349
 350     const __m128i expbase  = _mm_set1_epi32(127);
 351     const __m256  CC6      = _mm256_set1_ps(1.535336188319500E-004);
 352     const __m256  CC5      = _mm256_set1_ps(1.339887440266574E-003);
 353     const __m256  CC4      = _mm256_set1_ps(9.618437357674640E-003);
 354     const __m256  CC3      = _mm256_set1_ps(5.550332471162809E-002);
 355     const __m256  CC2      = _mm256_set1_ps(2.402264791363012E-001);
 356     const __m256  CC1      = _mm256_set1_ps(6.931472028550421E-001);
 357     const __m256  CC0      = _mm256_set1_ps(1.0f);
 358
 359     __m256        p0, p1;
 360     __m256        valuemask;
 361     __m256i       iexppart;
 362     __m128i       iexppart128a, iexppart128b;
 363     __m256        fexppart;
 364     __m256        intpart;
 365     __m256        x2;
 366
 367
 368     iexppart  = _mm256_cvtps_epi32(x);
 369     intpart   = _mm256_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
 370
 371     iexppart128b = _mm256_extractf128_si256(iexppart, 0x1);
 372     iexppart128a = _mm256_castsi256_si128(iexppart);
 373
 374     iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a, expbase), 23);
 375     iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b, expbase), 23);
 376
 377     iexppart  = _mm256_castsi128_si256(iexppart128a);
 378     iexppart  = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);
 379     valuemask = _mm256_cmp_ps(arglimit, gmx_mm256_abs_ps(x), _CMP_GE_OQ);
 380     fexppart  = _mm256_and_ps(valuemask, _mm256_castsi256_ps(iexppart));
 381
 382     x         = _mm256_sub_ps(x, intpart);
 383     x2        = _mm256_mul_ps(x, x);
 384
 385     p0        = _mm256_mul_ps(CC6, x2);
 386     p1        = _mm256_mul_ps(CC5, x2);
 387     p0        = _mm256_add_ps(p0, CC4);
 388     p1        = _mm256_add_ps(p1, CC3);
 389     p0        = _mm256_mul_ps(p0, x2);
 390     p1        = _mm256_mul_ps(p1, x2);
 391     p0        = _mm256_add_ps(p0, CC2);
 392     p1        = _mm256_add_ps(p1, CC1);
 393     p0        = _mm256_mul_ps(p0, x2);
 394     p1        = _mm256_mul_ps(p1, x);
 395     p0        = _mm256_add_ps(p0, CC0);
 396     p0        = _mm256_add_ps(p0, p1);
 397     x         = _mm256_mul_ps(p0, fexppart);
 398
 399     return x;
 400 }
 401
 402
 403 /* 2^x, 128 bit wide */
 404 static __m128
 405 gmx_mm_exp2_ps(__m128 x)
 406 {
 407     /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
 408     const __m128  arglimit = _mm_set1_ps(126.0f);
 409
 410     const __m128i expbase  = _mm_set1_epi32(127);
 411     const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
 412     const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
 413     const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
 414     const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
 415     const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
 416     const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
 417     const __m128  CA0      = _mm_set1_ps(1.0f);
 418
 419     __m128        valuemask;
 420     __m128i       iexppart;
 421     __m128        fexppart;
 422     __m128        intpart;
 423     __m128        x2;
 424     __m128        p0, p1;
 425
 426     iexppart  = _mm_cvtps_epi32(x);
 427     intpart   = _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
 428     iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
 429     valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(x));
 430     fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
 431
 432     x         = _mm_sub_ps(x, intpart);
 433     x2        = _mm_mul_ps(x, x);
 434
 435     p0        = _mm_mul_ps(CA6, x2);
 436     p1        = _mm_mul_ps(CA5, x2);
 437     p0        = _mm_add_ps(p0, CA4);
 438     p1        = _mm_add_ps(p1, CA3);
 439     p0        = _mm_mul_ps(p0, x2);
 440     p1        = _mm_mul_ps(p1, x2);
 441     p0        = _mm_add_ps(p0, CA2);
 442     p1        = _mm_add_ps(p1, CA1);
 443     p0        = _mm_mul_ps(p0, x2);
 444     p1        = _mm_mul_ps(p1, x);
 445     p0        = _mm_add_ps(p0, CA0);
 446     p0        = _mm_add_ps(p0, p1);
 447     x         = _mm_mul_ps(p0, fexppart);
 448
 449     return x;
 450 }
 451
 452
 453 /* Exponential function, 256 bit wide. This could be calculated from 2^x as Exp(x)=2^(y),
 454  * where y=log2(e)*x, but there will then be a small rounding error since we lose some
 455  * precision due to the multiplication. This will then be magnified a lot by the exponential.
 456  *
 457  * Instead, we calculate the fractional part directly as a minimax approximation of
 458  * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
 459  * remaining after 2^y, which avoids the precision-loss.
 460  * The final result is correct to within 1 LSB over the entire argument range.
 461  */
 462 static __m256
 463 gmx_mm256_exp_ps(__m256 exparg)
 464 {
 465     const __m256  argscale      = _mm256_set1_ps(1.44269504088896341f);
 466     /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
 467     const __m256  arglimit      = _mm256_set1_ps(126.0f);
 468     const __m128i expbase       = _mm_set1_epi32(127);
 469
 470     const __m256  invargscale0  = _mm256_set1_ps(0.693359375f);
 471     const __m256  invargscale1  = _mm256_set1_ps(-2.12194440e-4f);
 472
 473     const __m256  CE5           = _mm256_set1_ps(1.9875691500e-4f);
 474     const __m256  CE4           = _mm256_set1_ps(1.3981999507e-3f);
 475     const __m256  CE3           = _mm256_set1_ps(8.3334519073e-3f);
 476     const __m256  CE2           = _mm256_set1_ps(4.1665795894e-2f);
 477     const __m256  CE1           = _mm256_set1_ps(1.6666665459e-1f);
 478     const __m256  CE0           = _mm256_set1_ps(5.0000001201e-1f);
 479     const __m256  one           = _mm256_set1_ps(1.0f);
 480
 481     __m256        exparg2, exp2arg;
 482     __m256        pE0, pE1;
 483     __m256        valuemask;
 484     __m256i       iexppart;
 485     __m128i       iexppart128a, iexppart128b;
 486     __m256        fexppart;
 487     __m256        intpart;
 488
 489     exp2arg = _mm256_mul_ps(exparg, argscale);
 490
 491     iexppart  = _mm256_cvtps_epi32(exp2arg);
 492     intpart   = _mm256_round_ps(exp2arg, _MM_FROUND_TO_NEAREST_INT);
 493
 494     iexppart128b = _mm256_extractf128_si256(iexppart, 0x1);
 495     iexppart128a = _mm256_castsi256_si128(iexppart);
 496
 497     iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a, expbase), 23);
 498     iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b, expbase), 23);
 499
 500     iexppart  = _mm256_castsi128_si256(iexppart128a);
 501     iexppart  = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);
 502     valuemask = _mm256_cmp_ps(arglimit, gmx_mm256_abs_ps(exp2arg), _CMP_GE_OQ);
 503     fexppart  = _mm256_and_ps(valuemask, _mm256_castsi256_ps(iexppart));
 504
 505     /* Extended precision arithmetics */
 506     exparg    = _mm256_sub_ps(exparg, _mm256_mul_ps(invargscale0, intpart));
 507     exparg    = _mm256_sub_ps(exparg, _mm256_mul_ps(invargscale1, intpart));
 508
 509     exparg2   = _mm256_mul_ps(exparg, exparg);
 510
 511     pE1       = _mm256_mul_ps(CE5, exparg2);
 512     pE0       = _mm256_mul_ps(CE4, exparg2);
 513     pE1       = _mm256_add_ps(pE1, CE3);
 514     pE0       = _mm256_add_ps(pE0, CE2);
 515     pE1       = _mm256_mul_ps(pE1, exparg2);
 516     pE0       = _mm256_mul_ps(pE0, exparg2);
 517     pE1       = _mm256_add_ps(pE1, CE1);
 518     pE0       = _mm256_add_ps(pE0, CE0);
 519     pE1       = _mm256_mul_ps(pE1, exparg);
 520     pE0       = _mm256_add_ps(pE0, pE1);
 521     pE0       = _mm256_mul_ps(pE0, exparg2);
 522     exparg    = _mm256_add_ps(exparg, one);
 523     exparg    = _mm256_add_ps(exparg, pE0);
 524
 525     exparg    = _mm256_mul_ps(exparg, fexppart);
 526
 527     return exparg;
 528 }
 529
 530
 531 /* exp(), 128 bit wide. */
 532 static __m128
 533 gmx_mm_exp_ps(__m128 x)
 534 {
 535     const __m128  argscale      = _mm_set1_ps(1.44269504088896341f);
 536     /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
 537     const __m128  arglimit      = _mm_set1_ps(126.0f);
 538     const __m128i expbase       = _mm_set1_epi32(127);
 539
 540     const __m128  invargscale0  = _mm_set1_ps(0.693359375f);
 541     const __m128  invargscale1  = _mm_set1_ps(-2.12194440e-4f);
 542
 543     const __m128  CC5           = _mm_set1_ps(1.9875691500e-4f);
 544     const __m128  CC4           = _mm_set1_ps(1.3981999507e-3f);
 545     const __m128  CC3           = _mm_set1_ps(8.3334519073e-3f);
 546     const __m128  CC2           = _mm_set1_ps(4.1665795894e-2f);
 547     const __m128  CC1           = _mm_set1_ps(1.6666665459e-1f);
 548     const __m128  CC0           = _mm_set1_ps(5.0000001201e-1f);
 549     const __m128  one           = _mm_set1_ps(1.0f);
 550
 551     __m128        y, x2;
 552     __m128        p0, p1;
 553     __m128        valuemask;
 554     __m128i       iexppart;
 555     __m128        fexppart;
 556     __m128        intpart;
 557
 558     y = _mm_mul_ps(x, argscale);
 559
 560     iexppart  = _mm_cvtps_epi32(y);
 561     intpart   = _mm_round_ps(y, _MM_FROUND_TO_NEAREST_INT);
 562
 563     iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
 564     valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(y));
 565     fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
 566
 567     /* Extended precision arithmetics */
 568     x         = _mm_sub_ps(x, _mm_mul_ps(invargscale0, intpart));
 569     x         = _mm_sub_ps(x, _mm_mul_ps(invargscale1, intpart));
 570
 571     x2        = _mm_mul_ps(x, x);
 572
 573     p1        = _mm_mul_ps(CC5, x2);
 574     p0        = _mm_mul_ps(CC4, x2);
 575     p1        = _mm_add_ps(p1, CC3);
 576     p0        = _mm_add_ps(p0, CC2);
 577     p1        = _mm_mul_ps(p1, x2);
 578     p0        = _mm_mul_ps(p0, x2);
 579     p1        = _mm_add_ps(p1, CC1);
 580     p0        = _mm_add_ps(p0, CC0);
 581     p1        = _mm_mul_ps(p1, x);
 582     p0        = _mm_add_ps(p0, p1);
 583     p0        = _mm_mul_ps(p0, x2);
 584     x         = _mm_add_ps(x, one);
 585     x         = _mm_add_ps(x, p0);
 586
 587     x         = _mm_mul_ps(x, fexppart);
 588
 589     return x;
 590 }
 591
 592
 593
 594 /* FULL precision erf(), 256-bit wide. Only errors in LSB */
 595 static __m256
 596 gmx_mm256_erf_ps(__m256 x)
 597 {
 598     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 599     const __m256  CA6      = _mm256_set1_ps(7.853861353153693e-5f);
 600     const __m256  CA5      = _mm256_set1_ps(-8.010193625184903e-4f);
 601     const __m256  CA4      = _mm256_set1_ps(5.188327685732524e-3f);
 602     const __m256  CA3      = _mm256_set1_ps(-2.685381193529856e-2f);
 603     const __m256  CA2      = _mm256_set1_ps(1.128358514861418e-1f);
 604     const __m256  CA1      = _mm256_set1_ps(-3.761262582423300e-1f);
 605     const __m256  CA0      = _mm256_set1_ps(1.128379165726710f);
 606     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 607     const __m256  CB9      = _mm256_set1_ps(-0.0018629930017603923f);
 608     const __m256  CB8      = _mm256_set1_ps(0.003909821287598495f);
 609     const __m256  CB7      = _mm256_set1_ps(-0.0052094582210355615f);
 610     const __m256  CB6      = _mm256_set1_ps(0.005685614362160572f);
 611     const __m256  CB5      = _mm256_set1_ps(-0.0025367682853477272f);
 612     const __m256  CB4      = _mm256_set1_ps(-0.010199799682318782f);
 613     const __m256  CB3      = _mm256_set1_ps(0.04369575504816542f);
 614     const __m256  CB2      = _mm256_set1_ps(-0.11884063474674492f);
 615     const __m256  CB1      = _mm256_set1_ps(0.2732120154030589f);
 616     const __m256  CB0      = _mm256_set1_ps(0.42758357702025784f);
 617     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 618     const __m256  CC10     = _mm256_set1_ps(-0.0445555913112064f);
 619     const __m256  CC9      = _mm256_set1_ps(0.21376355144663348f);
 620     const __m256  CC8      = _mm256_set1_ps(-0.3473187200259257f);
 621     const __m256  CC7      = _mm256_set1_ps(0.016690861551248114f);
 622     const __m256  CC6      = _mm256_set1_ps(0.7560973182491192f);
 623     const __m256  CC5      = _mm256_set1_ps(-1.2137903600145787f);
 624     const __m256  CC4      = _mm256_set1_ps(0.8411872321232948f);
 625     const __m256  CC3      = _mm256_set1_ps(-0.08670413896296343f);
 626     const __m256  CC2      = _mm256_set1_ps(-0.27124782687240334f);
 627     const __m256  CC1      = _mm256_set1_ps(-0.0007502488047806069f);
 628     const __m256  CC0      = _mm256_set1_ps(0.5642114853803148f);
 629
 630     /* Coefficients for expansion of exp(x) in [0,0.1] */
 631     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
 632     const __m256  CD2      = _mm256_set1_ps(0.5000066608081202f);
 633     const __m256  CD3      = _mm256_set1_ps(0.1664795422874624f);
 634     const __m256  CD4      = _mm256_set1_ps(0.04379839977652482f);
 635
 636     const __m256  sieve    = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
 637     const __m256  signbit  = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
 638     const __m256  one      = _mm256_set1_ps(1.0f);
 639     const __m256  two      = _mm256_set1_ps(2.0f);
 640
 641     __m256        x2, x4, y;
 642     __m256        z, q, t, t2, w, w2;
 643     __m256        pA0, pA1, pB0, pB1, pC0, pC1;
 644     __m256        expmx2, corr;
 645     __m256        res_erf, res_erfc, res;
 646     __m256        mask;
 647
 648     /* Calculate erf() */
 649     x2     = _mm256_mul_ps(x, x);
 650     x4     = _mm256_mul_ps(x2, x2);
 651
 652     pA0  = _mm256_mul_ps(CA6, x4);
 653     pA1  = _mm256_mul_ps(CA5, x4);
 654     pA0  = _mm256_add_ps(pA0, CA4);
 655     pA1  = _mm256_add_ps(pA1, CA3);
 656     pA0  = _mm256_mul_ps(pA0, x4);
 657     pA1  = _mm256_mul_ps(pA1, x4);
 658     pA0  = _mm256_add_ps(pA0, CA2);
 659     pA1  = _mm256_add_ps(pA1, CA1);
 660     pA0  = _mm256_mul_ps(pA0, x4);
 661     pA1  = _mm256_mul_ps(pA1, x2);
 662     pA0  = _mm256_add_ps(pA0, pA1);
 663     pA0  = _mm256_add_ps(pA0, CA0);
 664
 665     res_erf = _mm256_mul_ps(x, pA0);
 666
 667     /* Calculate erfc */
 668
 669     y       = gmx_mm256_abs_ps(x);
 670     t       = gmx_mm256_inv_ps(y);
 671     w       = _mm256_sub_ps(t, one);
 672     t2      = _mm256_mul_ps(t, t);
 673     w2      = _mm256_mul_ps(w, w);
 674     /*
 675      * We cannot simply calculate exp(-x2) directly in single precision, since
 676      * that will lose a couple of bits of precision due to the multiplication.
 677      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
 678      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
 679      *
 680      * The only drawback with this is that it requires TWO separate exponential
 681      * evaluations, which would be horrible performance-wise. However, the argument
 682      * for the second exp() call is always small, so there we simply use a
 683      * low-order minimax expansion on [0,0.1].
 684      */
 685
 686     z       = _mm256_and_ps(y, sieve);
 687     q       = _mm256_mul_ps( _mm256_sub_ps(z, y), _mm256_add_ps(z, y) );
 688
 689     corr    = _mm256_mul_ps(CD4, q);
 690     corr    = _mm256_add_ps(corr, CD3);
 691     corr    = _mm256_mul_ps(corr, q);
 692     corr    = _mm256_add_ps(corr, CD2);
 693     corr    = _mm256_mul_ps(corr, q);
 694     corr    = _mm256_add_ps(corr, one);
 695     corr    = _mm256_mul_ps(corr, q);
 696     corr    = _mm256_add_ps(corr, one);
 697
 698     expmx2  = gmx_mm256_exp_ps( _mm256_or_ps( signbit, _mm256_mul_ps(z, z) ) );
 699     expmx2  = _mm256_mul_ps(expmx2, corr);
 700
 701     pB1  = _mm256_mul_ps(CB9, w2);
 702     pB0  = _mm256_mul_ps(CB8, w2);
 703     pB1  = _mm256_add_ps(pB1, CB7);
 704     pB0  = _mm256_add_ps(pB0, CB6);
 705     pB1  = _mm256_mul_ps(pB1, w2);
 706     pB0  = _mm256_mul_ps(pB0, w2);
 707     pB1  = _mm256_add_ps(pB1, CB5);
 708     pB0  = _mm256_add_ps(pB0, CB4);
 709     pB1  = _mm256_mul_ps(pB1, w2);
 710     pB0  = _mm256_mul_ps(pB0, w2);
 711     pB1  = _mm256_add_ps(pB1, CB3);
 712     pB0  = _mm256_add_ps(pB0, CB2);
 713     pB1  = _mm256_mul_ps(pB1, w2);
 714     pB0  = _mm256_mul_ps(pB0, w2);
 715     pB1  = _mm256_add_ps(pB1, CB1);
 716     pB1  = _mm256_mul_ps(pB1, w);
 717     pB0  = _mm256_add_ps(pB0, pB1);
 718     pB0  = _mm256_add_ps(pB0, CB0);
 719
 720     pC0  = _mm256_mul_ps(CC10, t2);
 721     pC1  = _mm256_mul_ps(CC9, t2);
 722     pC0  = _mm256_add_ps(pC0, CC8);
 723     pC1  = _mm256_add_ps(pC1, CC7);
 724     pC0  = _mm256_mul_ps(pC0, t2);
 725     pC1  = _mm256_mul_ps(pC1, t2);
 726     pC0  = _mm256_add_ps(pC0, CC6);
 727     pC1  = _mm256_add_ps(pC1, CC5);
 728     pC0  = _mm256_mul_ps(pC0, t2);
 729     pC1  = _mm256_mul_ps(pC1, t2);
 730     pC0  = _mm256_add_ps(pC0, CC4);
 731     pC1  = _mm256_add_ps(pC1, CC3);
 732     pC0  = _mm256_mul_ps(pC0, t2);
 733     pC1  = _mm256_mul_ps(pC1, t2);
 734     pC0  = _mm256_add_ps(pC0, CC2);
 735     pC1  = _mm256_add_ps(pC1, CC1);
 736     pC0  = _mm256_mul_ps(pC0, t2);
 737     pC1  = _mm256_mul_ps(pC1, t);
 738     pC0  = _mm256_add_ps(pC0, pC1);
 739     pC0  = _mm256_add_ps(pC0, CC0);
 740     pC0  = _mm256_mul_ps(pC0, t);
 741
 742     /* SELECT pB0 or pC0 for erfc() */
 743     mask     = _mm256_cmp_ps(two, y, _CMP_LT_OQ);
 744     res_erfc = _mm256_blendv_ps(pB0, pC0, mask);
 745     res_erfc = _mm256_mul_ps(res_erfc, expmx2);
 746
 747     /* erfc(x<0) = 2-erfc(|x|) */
 748     mask     = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
 749     res_erfc = _mm256_blendv_ps(res_erfc, _mm256_sub_ps(two, res_erfc), mask);
 750
 751     /* Select erf() or erfc() */
 752     mask = _mm256_cmp_ps(y, _mm256_set1_ps(0.75f), _CMP_LT_OQ);
 753     res  = _mm256_blendv_ps(_mm256_sub_ps(one, res_erfc), res_erf, mask);
 754
 755     return res;
 756 }
 757
 758
 759 /* erf(), 128 bit wide */
 760 static __m128
 761 gmx_mm_erf_ps(__m128 x)
 762 {
 763     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 764     const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
 765     const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
 766     const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
 767     const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
 768     const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
 769     const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
 770     const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
 771     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 772     const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
 773     const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
 774     const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
 775     const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
 776     const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
 777     const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
 778     const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
 779     const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
 780     const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
 781     const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
 782     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 783     const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
 784     const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
 785     const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
 786     const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
 787     const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
 788     const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
 789     const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
 790     const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
 791     const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
 792     const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
 793     const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
 794
 795     /* Coefficients for expansion of exp(x) in [0,0.1] */
 796     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
 797     const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
 798     const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
 799     const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
 800
 801     const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
 802     const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 803     const __m128  one      = _mm_set1_ps(1.0f);
 804     const __m128  two      = _mm_set1_ps(2.0f);
 805
 806     __m128        x2, x4, y;
 807     __m128        z, q, t, t2, w, w2;
 808     __m128        pA0, pA1, pB0, pB1, pC0, pC1;
 809     __m128        expmx2, corr;
 810     __m128        res_erf, res_erfc, res;
 811     __m128        mask;
 812
 813     /* Calculate erf() */
 814     x2     = _mm_mul_ps(x, x);
 815     x4     = _mm_mul_ps(x2, x2);
 816
 817     pA0  = _mm_mul_ps(CA6, x4);
 818     pA1  = _mm_mul_ps(CA5, x4);
 819     pA0  = _mm_add_ps(pA0, CA4);
 820     pA1  = _mm_add_ps(pA1, CA3);
 821     pA0  = _mm_mul_ps(pA0, x4);
 822     pA1  = _mm_mul_ps(pA1, x4);
 823     pA0  = _mm_add_ps(pA0, CA2);
 824     pA1  = _mm_add_ps(pA1, CA1);
 825     pA0  = _mm_mul_ps(pA0, x4);
 826     pA1  = _mm_mul_ps(pA1, x2);
 827     pA0  = _mm_add_ps(pA0, pA1);
 828     pA0  = _mm_add_ps(pA0, CA0);
 829
 830     res_erf = _mm_mul_ps(x, pA0);
 831
 832     /* Calculate erfc */
 833
 834     y       = gmx_mm_abs_ps(x);
 835     t       = gmx_mm_inv_ps(y);
 836     w       = _mm_sub_ps(t, one);
 837     t2      = _mm_mul_ps(t, t);
 838     w2      = _mm_mul_ps(w, w);
 839     /*
 840      * We cannot simply calculate exp(-x2) directly in single precision, since
 841      * that will lose a couple of bits of precision due to the multiplication.
 842      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
 843      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
 844      *
 845      * The only drawback with this is that it requires TWO separate exponential
 846      * evaluations, which would be horrible performance-wise. However, the argument
 847      * for the second exp() call is always small, so there we simply use a
 848      * low-order minimax expansion on [0,0.1].
 849      */
 850
 851     z       = _mm_and_ps(y, sieve);
 852     q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
 853
 854     corr    = _mm_mul_ps(CD4, q);
 855     corr    = _mm_add_ps(corr, CD3);
 856     corr    = _mm_mul_ps(corr, q);
 857     corr    = _mm_add_ps(corr, CD2);
 858     corr    = _mm_mul_ps(corr, q);
 859     corr    = _mm_add_ps(corr, one);
 860     corr    = _mm_mul_ps(corr, q);
 861     corr    = _mm_add_ps(corr, one);
 862
 863     expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
 864     expmx2  = _mm_mul_ps(expmx2, corr);
 865
 866     pB1  = _mm_mul_ps(CB9, w2);
 867     pB0  = _mm_mul_ps(CB8, w2);
 868     pB1  = _mm_add_ps(pB1, CB7);
 869     pB0  = _mm_add_ps(pB0, CB6);
 870     pB1  = _mm_mul_ps(pB1, w2);
 871     pB0  = _mm_mul_ps(pB0, w2);
 872     pB1  = _mm_add_ps(pB1, CB5);
 873     pB0  = _mm_add_ps(pB0, CB4);
 874     pB1  = _mm_mul_ps(pB1, w2);
 875     pB0  = _mm_mul_ps(pB0, w2);
 876     pB1  = _mm_add_ps(pB1, CB3);
 877     pB0  = _mm_add_ps(pB0, CB2);
 878     pB1  = _mm_mul_ps(pB1, w2);
 879     pB0  = _mm_mul_ps(pB0, w2);
 880     pB1  = _mm_add_ps(pB1, CB1);
 881     pB1  = _mm_mul_ps(pB1, w);
 882     pB0  = _mm_add_ps(pB0, pB1);
 883     pB0  = _mm_add_ps(pB0, CB0);
 884
 885     pC0  = _mm_mul_ps(CC10, t2);
 886     pC1  = _mm_mul_ps(CC9, t2);
 887     pC0  = _mm_add_ps(pC0, CC8);
 888     pC1  = _mm_add_ps(pC1, CC7);
 889     pC0  = _mm_mul_ps(pC0, t2);
 890     pC1  = _mm_mul_ps(pC1, t2);
 891     pC0  = _mm_add_ps(pC0, CC6);
 892     pC1  = _mm_add_ps(pC1, CC5);
 893     pC0  = _mm_mul_ps(pC0, t2);
 894     pC1  = _mm_mul_ps(pC1, t2);
 895     pC0  = _mm_add_ps(pC0, CC4);
 896     pC1  = _mm_add_ps(pC1, CC3);
 897     pC0  = _mm_mul_ps(pC0, t2);
 898     pC1  = _mm_mul_ps(pC1, t2);
 899     pC0  = _mm_add_ps(pC0, CC2);
 900     pC1  = _mm_add_ps(pC1, CC1);
 901     pC0  = _mm_mul_ps(pC0, t2);
 902     pC1  = _mm_mul_ps(pC1, t);
 903     pC0  = _mm_add_ps(pC0, pC1);
 904     pC0  = _mm_add_ps(pC0, CC0);
 905     pC0  = _mm_mul_ps(pC0, t);
 906
 907     /* SELECT pB0 or pC0 for erfc() */
 908     mask     = _mm_cmplt_ps(two, y);
 909     res_erfc = _mm_blendv_ps(pB0, pC0, mask);
 910     res_erfc = _mm_mul_ps(res_erfc, expmx2);
 911
 912     /* erfc(x<0) = 2-erfc(|x|) */
 913     mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
 914     res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
 915
 916     /* Select erf() or erfc() */
 917     mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
 918     res  = _mm_blendv_ps(_mm_sub_ps(one, res_erfc), res_erf, mask);
 919
 920     return res;
 921 }
 922
 923
 924
 925
 926 /* FULL precision erfc(), 256 bit wide. Only errors in LSB */
 927 static __m256
 928 gmx_mm256_erfc_ps(__m256 x)
 929 {
 930     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 931     const __m256  CA6      = _mm256_set1_ps(7.853861353153693e-5f);
 932     const __m256  CA5      = _mm256_set1_ps(-8.010193625184903e-4f);
 933     const __m256  CA4      = _mm256_set1_ps(5.188327685732524e-3f);
 934     const __m256  CA3      = _mm256_set1_ps(-2.685381193529856e-2f);
 935     const __m256  CA2      = _mm256_set1_ps(1.128358514861418e-1f);
 936     const __m256  CA1      = _mm256_set1_ps(-3.761262582423300e-1f);
 937     const __m256  CA0      = _mm256_set1_ps(1.128379165726710f);
 938     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 939     const __m256  CB9      = _mm256_set1_ps(-0.0018629930017603923f);
 940     const __m256  CB8      = _mm256_set1_ps(0.003909821287598495f);
 941     const __m256  CB7      = _mm256_set1_ps(-0.0052094582210355615f);
 942     const __m256  CB6      = _mm256_set1_ps(0.005685614362160572f);
 943     const __m256  CB5      = _mm256_set1_ps(-0.0025367682853477272f);
 944     const __m256  CB4      = _mm256_set1_ps(-0.010199799682318782f);
 945     const __m256  CB3      = _mm256_set1_ps(0.04369575504816542f);
 946     const __m256  CB2      = _mm256_set1_ps(-0.11884063474674492f);
 947     const __m256  CB1      = _mm256_set1_ps(0.2732120154030589f);
 948     const __m256  CB0      = _mm256_set1_ps(0.42758357702025784f);
 949     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 950     const __m256  CC10     = _mm256_set1_ps(-0.0445555913112064f);
 951     const __m256  CC9      = _mm256_set1_ps(0.21376355144663348f);
 952     const __m256  CC8      = _mm256_set1_ps(-0.3473187200259257f);
 953     const __m256  CC7      = _mm256_set1_ps(0.016690861551248114f);
 954     const __m256  CC6      = _mm256_set1_ps(0.7560973182491192f);
 955     const __m256  CC5      = _mm256_set1_ps(-1.2137903600145787f);
 956     const __m256  CC4      = _mm256_set1_ps(0.8411872321232948f);
 957     const __m256  CC3      = _mm256_set1_ps(-0.08670413896296343f);
 958     const __m256  CC2      = _mm256_set1_ps(-0.27124782687240334f);
 959     const __m256  CC1      = _mm256_set1_ps(-0.0007502488047806069f);
 960     const __m256  CC0      = _mm256_set1_ps(0.5642114853803148f);
 961
 962     /* Coefficients for expansion of exp(x) in [0,0.1] */
 963     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
 964     const __m256  CD2      = _mm256_set1_ps(0.5000066608081202f);
 965     const __m256  CD3      = _mm256_set1_ps(0.1664795422874624f);
 966     const __m256  CD4      = _mm256_set1_ps(0.04379839977652482f);
 967
 968     const __m256  sieve    = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
 969     const __m256  signbit  = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
 970     const __m256  one      = _mm256_set1_ps(1.0f);
 971     const __m256  two      = _mm256_set1_ps(2.0f);
 972
 973     __m256        x2, x4, y;
 974     __m256        z, q, t, t2, w, w2;
 975     __m256        pA0, pA1, pB0, pB1, pC0, pC1;
 976     __m256        expmx2, corr;
 977     __m256        res_erf, res_erfc, res;
 978     __m256        mask;
 979
 980     /* Calculate erf() */
 981     x2     = _mm256_mul_ps(x, x);
 982     x4     = _mm256_mul_ps(x2, x2);
 983
 984     pA0  = _mm256_mul_ps(CA6, x4);
 985     pA1  = _mm256_mul_ps(CA5, x4);
 986     pA0  = _mm256_add_ps(pA0, CA4);
 987     pA1  = _mm256_add_ps(pA1, CA3);
 988     pA0  = _mm256_mul_ps(pA0, x4);
 989     pA1  = _mm256_mul_ps(pA1, x4);
 990     pA0  = _mm256_add_ps(pA0, CA2);
 991     pA1  = _mm256_add_ps(pA1, CA1);
 992     pA0  = _mm256_mul_ps(pA0, x4);
 993     pA1  = _mm256_mul_ps(pA1, x2);
 994     pA0  = _mm256_add_ps(pA0, pA1);
 995     pA0  = _mm256_add_ps(pA0, CA0);
 996
 997     res_erf = _mm256_mul_ps(x, pA0);
 998
 999     /* Calculate erfc */
1000     y       = gmx_mm256_abs_ps(x);
1001     t       = gmx_mm256_inv_ps(y);
1002     w       = _mm256_sub_ps(t, one);
1003     t2      = _mm256_mul_ps(t, t);
1004     w2      = _mm256_mul_ps(w, w);
1005     /*
1006      * We cannot simply calculate exp(-x2) directly in single precision, since
1007      * that will lose a couple of bits of precision due to the multiplication.
1008      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
1009      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
1010      *
1011      * The only drawback with this is that it requires TWO separate exponential
1012      * evaluations, which would be horrible performance-wise. However, the argument
1013      * for the second exp() call is always small, so there we simply use a
1014      * low-order minimax expansion on [0,0.1].
1015      */
1016
1017     z       = _mm256_and_ps(y, sieve);
1018     q       = _mm256_mul_ps( _mm256_sub_ps(z, y), _mm256_add_ps(z, y) );
1019
1020     corr    = _mm256_mul_ps(CD4, q);
1021     corr    = _mm256_add_ps(corr, CD3);
1022     corr    = _mm256_mul_ps(corr, q);
1023     corr    = _mm256_add_ps(corr, CD2);
1024     corr    = _mm256_mul_ps(corr, q);
1025     corr    = _mm256_add_ps(corr, one);
1026     corr    = _mm256_mul_ps(corr, q);
1027     corr    = _mm256_add_ps(corr, one);
1028
1029     expmx2  = gmx_mm256_exp_ps( _mm256_or_ps( signbit, _mm256_mul_ps(z, z) ) );
1030     expmx2  = _mm256_mul_ps(expmx2, corr);
1031
1032     pB1  = _mm256_mul_ps(CB9, w2);
1033     pB0  = _mm256_mul_ps(CB8, w2);
1034     pB1  = _mm256_add_ps(pB1, CB7);
1035     pB0  = _mm256_add_ps(pB0, CB6);
1036     pB1  = _mm256_mul_ps(pB1, w2);
1037     pB0  = _mm256_mul_ps(pB0, w2);
1038     pB1  = _mm256_add_ps(pB1, CB5);
1039     pB0  = _mm256_add_ps(pB0, CB4);
1040     pB1  = _mm256_mul_ps(pB1, w2);
1041     pB0  = _mm256_mul_ps(pB0, w2);
1042     pB1  = _mm256_add_ps(pB1, CB3);
1043     pB0  = _mm256_add_ps(pB0, CB2);
1044     pB1  = _mm256_mul_ps(pB1, w2);
1045     pB0  = _mm256_mul_ps(pB0, w2);
1046     pB1  = _mm256_add_ps(pB1, CB1);
1047     pB1  = _mm256_mul_ps(pB1, w);
1048     pB0  = _mm256_add_ps(pB0, pB1);
1049     pB0  = _mm256_add_ps(pB0, CB0);
1050
1051     pC0  = _mm256_mul_ps(CC10, t2);
1052     pC1  = _mm256_mul_ps(CC9, t2);
1053     pC0  = _mm256_add_ps(pC0, CC8);
1054     pC1  = _mm256_add_ps(pC1, CC7);
1055     pC0  = _mm256_mul_ps(pC0, t2);
1056     pC1  = _mm256_mul_ps(pC1, t2);
1057     pC0  = _mm256_add_ps(pC0, CC6);
1058     pC1  = _mm256_add_ps(pC1, CC5);
1059     pC0  = _mm256_mul_ps(pC0, t2);
1060     pC1  = _mm256_mul_ps(pC1, t2);
1061     pC0  = _mm256_add_ps(pC0, CC4);
1062     pC1  = _mm256_add_ps(pC1, CC3);
1063     pC0  = _mm256_mul_ps(pC0, t2);
1064     pC1  = _mm256_mul_ps(pC1, t2);
1065     pC0  = _mm256_add_ps(pC0, CC2);
1066     pC1  = _mm256_add_ps(pC1, CC1);
1067     pC0  = _mm256_mul_ps(pC0, t2);
1068     pC1  = _mm256_mul_ps(pC1, t);
1069     pC0  = _mm256_add_ps(pC0, pC1);
1070     pC0  = _mm256_add_ps(pC0, CC0);
1071     pC0  = _mm256_mul_ps(pC0, t);
1072
1073     /* SELECT pB0 or pC0 for erfc() */
1074     mask     = _mm256_cmp_ps(two, y, _CMP_LT_OQ);
1075     res_erfc = _mm256_blendv_ps(pB0, pC0, mask);
1076     res_erfc = _mm256_mul_ps(res_erfc, expmx2);
1077
1078     /* erfc(x<0) = 2-erfc(|x|) */
1079     mask     = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
1080     res_erfc = _mm256_blendv_ps(res_erfc, _mm256_sub_ps(two, res_erfc), mask);
1081
1082     /* Select erf() or erfc() */
1083     mask = _mm256_cmp_ps(y, _mm256_set1_ps(0.75f), _CMP_LT_OQ);
1084     res  = _mm256_blendv_ps(res_erfc, _mm256_sub_ps(one, res_erf), mask);
1085
1086     return res;
1087 }
1088
1089
1090 /* erfc(), 128 bit wide */
1091 static __m128
1092 gmx_mm_erfc_ps(__m128 x)
1093 {
1094     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
1095     const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
1096     const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
1097     const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
1098     const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
1099     const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
1100     const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
1101     const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
1102     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
1103     const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
1104     const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
1105     const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
1106     const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
1107     const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
1108     const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
1109     const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
1110     const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
1111     const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
1112     const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
1113     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
1114     const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
1115     const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
1116     const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
1117     const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
1118     const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
1119     const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
1120     const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
1121     const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
1122     const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
1123     const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
1124     const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
1125
1126     /* Coefficients for expansion of exp(x) in [0,0.1] */
1127     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
1128     const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
1129     const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
1130     const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
1131
1132     const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
1133     const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1134     const __m128  one      = _mm_set1_ps(1.0f);
1135     const __m128  two      = _mm_set1_ps(2.0f);
1136
1137     __m128        x2, x4, y;
1138     __m128        z, q, t, t2, w, w2;
1139     __m128        pA0, pA1, pB0, pB1, pC0, pC1;
1140     __m128        expmx2, corr;
1141     __m128        res_erf, res_erfc, res;
1142     __m128        mask;
1143
1144     /* Calculate erf() */
1145     x2     = _mm_mul_ps(x, x);
1146     x4     = _mm_mul_ps(x2, x2);
1147
1148     pA0  = _mm_mul_ps(CA6, x4);
1149     pA1  = _mm_mul_ps(CA5, x4);
1150     pA0  = _mm_add_ps(pA0, CA4);
1151     pA1  = _mm_add_ps(pA1, CA3);
1152     pA0  = _mm_mul_ps(pA0, x4);
1153     pA1  = _mm_mul_ps(pA1, x4);
1154     pA0  = _mm_add_ps(pA0, CA2);
1155     pA1  = _mm_add_ps(pA1, CA1);
1156     pA0  = _mm_mul_ps(pA0, x4);
1157     pA1  = _mm_mul_ps(pA1, x2);
1158     pA0  = _mm_add_ps(pA0, pA1);
1159     pA0  = _mm_add_ps(pA0, CA0);
1160
1161     res_erf = _mm_mul_ps(x, pA0);
1162
1163     /* Calculate erfc */
1164     y       = gmx_mm_abs_ps(x);
1165     t       = gmx_mm_inv_ps(y);
1166     w       = _mm_sub_ps(t, one);
1167     t2      = _mm_mul_ps(t, t);
1168     w2      = _mm_mul_ps(w, w);
1169     /*
1170      * We cannot simply calculate exp(-x2) directly in single precision, since
1171      * that will lose a couple of bits of precision due to the multiplication.
1172      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
1173      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
1174      *
1175      * The only drawback with this is that it requires TWO separate exponential
1176      * evaluations, which would be horrible performance-wise. However, the argument
1177      * for the second exp() call is always small, so there we simply use a
1178      * low-order minimax expansion on [0,0.1].
1179      */
1180
1181     z       = _mm_and_ps(y, sieve);
1182     q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
1183
1184     corr    = _mm_mul_ps(CD4, q);
1185     corr    = _mm_add_ps(corr, CD3);
1186     corr    = _mm_mul_ps(corr, q);
1187     corr    = _mm_add_ps(corr, CD2);
1188     corr    = _mm_mul_ps(corr, q);
1189     corr    = _mm_add_ps(corr, one);
1190     corr    = _mm_mul_ps(corr, q);
1191     corr    = _mm_add_ps(corr, one);
1192
1193     expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
1194     expmx2  = _mm_mul_ps(expmx2, corr);
1195
1196     pB1  = _mm_mul_ps(CB9, w2);
1197     pB0  = _mm_mul_ps(CB8, w2);
1198     pB1  = _mm_add_ps(pB1, CB7);
1199     pB0  = _mm_add_ps(pB0, CB6);
1200     pB1  = _mm_mul_ps(pB1, w2);
1201     pB0  = _mm_mul_ps(pB0, w2);
1202     pB1  = _mm_add_ps(pB1, CB5);
1203     pB0  = _mm_add_ps(pB0, CB4);
1204     pB1  = _mm_mul_ps(pB1, w2);
1205     pB0  = _mm_mul_ps(pB0, w2);
1206     pB1  = _mm_add_ps(pB1, CB3);
1207     pB0  = _mm_add_ps(pB0, CB2);
1208     pB1  = _mm_mul_ps(pB1, w2);
1209     pB0  = _mm_mul_ps(pB0, w2);
1210     pB1  = _mm_add_ps(pB1, CB1);
1211     pB1  = _mm_mul_ps(pB1, w);
1212     pB0  = _mm_add_ps(pB0, pB1);
1213     pB0  = _mm_add_ps(pB0, CB0);
1214
1215     pC0  = _mm_mul_ps(CC10, t2);
1216     pC1  = _mm_mul_ps(CC9, t2);
1217     pC0  = _mm_add_ps(pC0, CC8);
1218     pC1  = _mm_add_ps(pC1, CC7);
1219     pC0  = _mm_mul_ps(pC0, t2);
1220     pC1  = _mm_mul_ps(pC1, t2);
1221     pC0  = _mm_add_ps(pC0, CC6);
1222     pC1  = _mm_add_ps(pC1, CC5);
1223     pC0  = _mm_mul_ps(pC0, t2);
1224     pC1  = _mm_mul_ps(pC1, t2);
1225     pC0  = _mm_add_ps(pC0, CC4);
1226     pC1  = _mm_add_ps(pC1, CC3);
1227     pC0  = _mm_mul_ps(pC0, t2);
1228     pC1  = _mm_mul_ps(pC1, t2);
1229     pC0  = _mm_add_ps(pC0, CC2);
1230     pC1  = _mm_add_ps(pC1, CC1);
1231     pC0  = _mm_mul_ps(pC0, t2);
1232     pC1  = _mm_mul_ps(pC1, t);
1233     pC0  = _mm_add_ps(pC0, pC1);
1234     pC0  = _mm_add_ps(pC0, CC0);
1235     pC0  = _mm_mul_ps(pC0, t);
1236
1237     /* SELECT pB0 or pC0 for erfc() */
1238     mask     = _mm_cmplt_ps(two, y);
1239     res_erfc = _mm_blendv_ps(pB0, pC0, mask);
1240     res_erfc = _mm_mul_ps(res_erfc, expmx2);
1241
1242     /* erfc(x<0) = 2-erfc(|x|) */
1243     mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
1244     res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
1245
1246     /* Select erf() or erfc() */
1247     mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
1248     res  = _mm_blendv_ps(res_erfc, _mm_sub_ps(one, res_erf), mask);
1249
1250     return res;
1251 }
1252
1253
1254
1255 /* Calculate the force correction due to PME analytically.
1256  *
1257  * This routine is meant to enable analytical evaluation of the
1258  * direct-space PME electrostatic force to avoid tables.
1259  *
1260  * The direct-space potential should be Erfc(beta*r)/r, but there
1261  * are some problems evaluating that:
1262  *
1263  * First, the error function is difficult (read: expensive) to
1264  * approxmiate accurately for intermediate to large arguments, and
1265  * this happens already in ranges of beta*r that occur in simulations.
1266  * Second, we now try to avoid calculating potentials in Gromacs but
1267  * use forces directly.
1268  *
1269  * We can simply things slight by noting that the PME part is really
1270  * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
1271  *
1272  * V= 1/r - Erf(beta*r)/r
1273  *
1274  * The first term we already have from the inverse square root, so
1275  * that we can leave out of this routine.
1276  *
1277  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1278  * the argument beta*r will be in the range 0.15 to ~4. Use your
1279  * favorite plotting program to realize how well-behaved Erf(z)/z is
1280  * in this range!
1281  *
1282  * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
1283  * However, it turns out it is more efficient to approximate f(z)/z and
1284  * then only use even powers. This is another minor optimization, since
1285  * we actually WANT f(z)/z, because it is going to be multiplied by
1286  * the vector between the two atoms to get the vectorial force. The
1287  * fastest flops are the ones we can avoid calculating!
1288  *
1289  * So, here's how it should be used:
1290  *
1291  * 1. Calculate r^2.
1292  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
1293  * 3. Evaluate this routine with z^2 as the argument.
1294  * 4. The return value is the expression:
1295  *
1296  *
1297  *       2*exp(-z^2)     erf(z)
1298  *       ------------ - --------
1299  *       sqrt(Pi)*z^2      z^3
1300  *
1301  * 5. Multiply the entire expression by beta^3. This will get you
1302  *
1303  *       beta^3*2*exp(-z^2)     beta^3*erf(z)
1304  *       ------------------  - ---------------
1305  *          sqrt(Pi)*z^2            z^3
1306  *
1307  *    or, switching back to r (z=r*beta):
1308  *
1309  *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
1310  *       ----------------------- - -----------
1311  *            sqrt(Pi)*r^2            r^3
1312  *
1313  *
1314  *    With a bit of math exercise you should be able to confirm that
1315  *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
1316  *
1317  * 6. Add the result to 1/r^3, multiply by the product of the charges,
1318  *    and you have your force (divided by r). A final multiplication
1319  *    with the vector connecting the two particles and you have your
1320  *    vectorial force to add to the particles.
1321  *
1322  */
1323 static __m256
1324 gmx_mm256_pmecorrF_ps(__m256 z2)
1325 {
1326     const __m256  FN6      = _mm256_set1_ps(-1.7357322914161492954e-8f);
1327     const __m256  FN5      = _mm256_set1_ps(1.4703624142580877519e-6f);
1328     const __m256  FN4      = _mm256_set1_ps(-0.000053401640219807709149f);
1329     const __m256  FN3      = _mm256_set1_ps(0.0010054721316683106153f);
1330     const __m256  FN2      = _mm256_set1_ps(-0.019278317264888380590f);
1331     const __m256  FN1      = _mm256_set1_ps(0.069670166153766424023f);
1332     const __m256  FN0      = _mm256_set1_ps(-0.75225204789749321333f);
1333
1334     const __m256  FD4      = _mm256_set1_ps(0.0011193462567257629232f);
1335     const __m256  FD3      = _mm256_set1_ps(0.014866955030185295499f);
1336     const __m256  FD2      = _mm256_set1_ps(0.11583842382862377919f);
1337     const __m256  FD1      = _mm256_set1_ps(0.50736591960530292870f);
1338     const __m256  FD0      = _mm256_set1_ps(1.0f);
1339
1340     __m256        z4;
1341     __m256        polyFN0, polyFN1, polyFD0, polyFD1;
1342
1343     z4             = _mm256_mul_ps(z2, z2);
1344
1345     polyFD0        = _mm256_mul_ps(FD4, z4);
1346     polyFD1        = _mm256_mul_ps(FD3, z4);
1347     polyFD0        = _mm256_add_ps(polyFD0, FD2);
1348     polyFD1        = _mm256_add_ps(polyFD1, FD1);
1349     polyFD0        = _mm256_mul_ps(polyFD0, z4);
1350     polyFD1        = _mm256_mul_ps(polyFD1, z2);
1351     polyFD0        = _mm256_add_ps(polyFD0, FD0);
1352     polyFD0        = _mm256_add_ps(polyFD0, polyFD1);
1353
1354     polyFD0        = gmx_mm256_inv_ps(polyFD0);
1355
1356     polyFN0        = _mm256_mul_ps(FN6, z4);
1357     polyFN1        = _mm256_mul_ps(FN5, z4);
1358     polyFN0        = _mm256_add_ps(polyFN0, FN4);
1359     polyFN1        = _mm256_add_ps(polyFN1, FN3);
1360     polyFN0        = _mm256_mul_ps(polyFN0, z4);
1361     polyFN1        = _mm256_mul_ps(polyFN1, z4);
1362     polyFN0        = _mm256_add_ps(polyFN0, FN2);
1363     polyFN1        = _mm256_add_ps(polyFN1, FN1);
1364     polyFN0        = _mm256_mul_ps(polyFN0, z4);
1365     polyFN1        = _mm256_mul_ps(polyFN1, z2);
1366     polyFN0        = _mm256_add_ps(polyFN0, FN0);
1367     polyFN0        = _mm256_add_ps(polyFN0, polyFN1);
1368
1369     return _mm256_mul_ps(polyFN0, polyFD0);
1370 }
1371
1372
1373 static __m128
1374 gmx_mm_pmecorrF_ps(__m128 z2)
1375 {
1376     const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
1377     const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
1378     const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
1379     const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
1380     const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
1381     const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
1382     const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);
1383
1384     const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
1385     const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
1386     const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
1387     const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
1388     const __m128  FD0      = _mm_set1_ps(1.0f);
1389
1390     __m128        z4;
1391     __m128        polyFN0, polyFN1, polyFD0, polyFD1;
1392
1393     z4             = _mm_mul_ps(z2, z2);
1394
1395     polyFD0        = _mm_mul_ps(FD4, z4);
1396     polyFD1        = _mm_mul_ps(FD3, z4);
1397     polyFD0        = _mm_add_ps(polyFD0, FD2);
1398     polyFD1        = _mm_add_ps(polyFD1, FD1);
1399     polyFD0        = _mm_mul_ps(polyFD0, z4);
1400     polyFD1        = _mm_mul_ps(polyFD1, z2);
1401     polyFD0        = _mm_add_ps(polyFD0, FD0);
1402     polyFD0        = _mm_add_ps(polyFD0, polyFD1);
1403
1404     polyFD0        = gmx_mm_inv_ps(polyFD0);
1405
1406     polyFN0        = _mm_mul_ps(FN6, z4);
1407     polyFN1        = _mm_mul_ps(FN5, z4);
1408     polyFN0        = _mm_add_ps(polyFN0, FN4);
1409     polyFN1        = _mm_add_ps(polyFN1, FN3);
1410     polyFN0        = _mm_mul_ps(polyFN0, z4);
1411     polyFN1        = _mm_mul_ps(polyFN1, z4);
1412     polyFN0        = _mm_add_ps(polyFN0, FN2);
1413     polyFN1        = _mm_add_ps(polyFN1, FN1);
1414     polyFN0        = _mm_mul_ps(polyFN0, z4);
1415     polyFN1        = _mm_mul_ps(polyFN1, z2);
1416     polyFN0        = _mm_add_ps(polyFN0, FN0);
1417     polyFN0        = _mm_add_ps(polyFN0, polyFN1);
1418
1419     return _mm_mul_ps(polyFN0, polyFD0);
1420 }
1421
1422
1423
1424 /* Calculate the potential correction due to PME analytically.
1425  *
1426  * See gmx_mm256_pmecorrF_ps() for details about the approximation.
1427  *
1428  * This routine calculates Erf(z)/z, although you should provide z^2
1429  * as the input argument.
1430  *
1431  * Here's how it should be used:
1432  *
1433  * 1. Calculate r^2.
1434  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
1435  * 3. Evaluate this routine with z^2 as the argument.
1436  * 4. The return value is the expression:
1437  *
1438  *
1439  *        erf(z)
1440  *       --------
1441  *          z
1442  *
1443  * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
1444  *
1445  *       erf(r*beta)
1446  *       -----------
1447  *           r
1448  *
1449  * 6. Subtract the result from 1/r, multiply by the product of the charges,
1450  *    and you have your potential.
1451  */
1452 static __m256
1453 gmx_mm256_pmecorrV_ps(__m256 z2)
1454 {
1455     const __m256  VN6      = _mm256_set1_ps(1.9296833005951166339e-8f);
1456     const __m256  VN5      = _mm256_set1_ps(-1.4213390571557850962e-6f);
1457     const __m256  VN4      = _mm256_set1_ps(0.000041603292906656984871f);
1458     const __m256  VN3      = _mm256_set1_ps(-0.00013134036773265025626f);
1459     const __m256  VN2      = _mm256_set1_ps(0.038657983986041781264f);
1460     const __m256  VN1      = _mm256_set1_ps(0.11285044772717598220f);
1461     const __m256  VN0      = _mm256_set1_ps(1.1283802385263030286f);
1462
1463     const __m256  VD3      = _mm256_set1_ps(0.0066752224023576045451f);
1464     const __m256  VD2      = _mm256_set1_ps(0.078647795836373922256f);
1465     const __m256  VD1      = _mm256_set1_ps(0.43336185284710920150f);
1466     const __m256  VD0      = _mm256_set1_ps(1.0f);
1467
1468     __m256        z4;
1469     __m256        polyVN0, polyVN1, polyVD0, polyVD1;
1470
1471     z4             = _mm256_mul_ps(z2, z2);
1472
1473     polyVD1        = _mm256_mul_ps(VD3, z4);
1474     polyVD0        = _mm256_mul_ps(VD2, z4);
1475     polyVD1        = _mm256_add_ps(polyVD1, VD1);
1476     polyVD0        = _mm256_add_ps(polyVD0, VD0);
1477     polyVD1        = _mm256_mul_ps(polyVD1, z2);
1478     polyVD0        = _mm256_add_ps(polyVD0, polyVD1);
1479
1480     polyVD0        = gmx_mm256_inv_ps(polyVD0);
1481
1482     polyVN0        = _mm256_mul_ps(VN6, z4);
1483     polyVN1        = _mm256_mul_ps(VN5, z4);
1484     polyVN0        = _mm256_add_ps(polyVN0, VN4);
1485     polyVN1        = _mm256_add_ps(polyVN1, VN3);
1486     polyVN0        = _mm256_mul_ps(polyVN0, z4);
1487     polyVN1        = _mm256_mul_ps(polyVN1, z4);
1488     polyVN0        = _mm256_add_ps(polyVN0, VN2);
1489     polyVN1        = _mm256_add_ps(polyVN1, VN1);
1490     polyVN0        = _mm256_mul_ps(polyVN0, z4);
1491     polyVN1        = _mm256_mul_ps(polyVN1, z2);
1492     polyVN0        = _mm256_add_ps(polyVN0, VN0);
1493     polyVN0        = _mm256_add_ps(polyVN0, polyVN1);
1494
1495     return _mm256_mul_ps(polyVN0, polyVD0);
1496 }
1497
1498
1499 static __m128
1500 gmx_mm_pmecorrV_ps(__m128 z2)
1501 {
1502     const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
1503     const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
1504     const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
1505     const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
1506     const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
1507     const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
1508     const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);
1509
1510     const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
1511     const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
1512     const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
1513     const __m128  VD0      = _mm_set1_ps(1.0f);
1514
1515     __m128        z4;
1516     __m128        polyVN0, polyVN1, polyVD0, polyVD1;
1517
1518     z4             = _mm_mul_ps(z2, z2);
1519
1520     polyVD1        = _mm_mul_ps(VD3, z4);
1521     polyVD0        = _mm_mul_ps(VD2, z4);
1522     polyVD1        = _mm_add_ps(polyVD1, VD1);
1523     polyVD0        = _mm_add_ps(polyVD0, VD0);
1524     polyVD1        = _mm_mul_ps(polyVD1, z2);
1525     polyVD0        = _mm_add_ps(polyVD0, polyVD1);
1526
1527     polyVD0        = gmx_mm_inv_ps(polyVD0);
1528
1529     polyVN0        = _mm_mul_ps(VN6, z4);
1530     polyVN1        = _mm_mul_ps(VN5, z4);
1531     polyVN0        = _mm_add_ps(polyVN0, VN4);
1532     polyVN1        = _mm_add_ps(polyVN1, VN3);
1533     polyVN0        = _mm_mul_ps(polyVN0, z4);
1534     polyVN1        = _mm_mul_ps(polyVN1, z4);
1535     polyVN0        = _mm_add_ps(polyVN0, VN2);
1536     polyVN1        = _mm_add_ps(polyVN1, VN1);
1537     polyVN0        = _mm_mul_ps(polyVN0, z4);
1538     polyVN1        = _mm_mul_ps(polyVN1, z2);
1539     polyVN0        = _mm_add_ps(polyVN0, VN0);
1540     polyVN0        = _mm_add_ps(polyVN0, polyVN1);
1541
1542     return _mm_mul_ps(polyVN0, polyVD0);
1543 }
1544
1545
1546 static int
1547 gmx_mm256_sincos_ps(__m256  x,
1548                     __m256 *sinval,
1549                     __m256 *cosval)
1550 {
1551     const __m256  two_over_pi = _mm256_set1_ps(2.0f/(float)M_PI);
1552     const __m256  half        = _mm256_set1_ps(0.5f);
1553     const __m256  one         = _mm256_set1_ps(1.0f);
1554     const __m256  zero        = _mm256_setzero_ps();
1555
1556     const __m128i ione       = _mm_set1_epi32(1);
1557
1558     const __m256  mask_one    = _mm256_castsi256_ps(_mm256_set1_epi32(1));
1559     const __m256  mask_two    = _mm256_castsi256_ps(_mm256_set1_epi32(2));
1560     const __m256  mask_three  = _mm256_castsi256_ps(_mm256_set1_epi32(3));
1561
1562     const __m256  CA1         = _mm256_set1_ps(1.5703125f);
1563     const __m256  CA2         = _mm256_set1_ps(4.837512969970703125e-4f);
1564     const __m256  CA3         = _mm256_set1_ps(7.54978995489188216e-8f);
1565
1566     const __m256  CC0         = _mm256_set1_ps(-0.0013602249f);
1567     const __m256  CC1         = _mm256_set1_ps(0.0416566950f);
1568     const __m256  CC2         = _mm256_set1_ps(-0.4999990225f);
1569     const __m256  CS0         = _mm256_set1_ps(-0.0001950727f);
1570     const __m256  CS1         = _mm256_set1_ps(0.0083320758f);
1571     const __m256  CS2         = _mm256_set1_ps(-0.1666665247f);
1572
1573     const __m256  signbit    = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1574
1575     __m256        y, y2;
1576     __m256        z;
1577     __m256i       iz;
1578     __m128i       iz_high, iz_low;
1579     __m256        offset_sin, offset_cos;
1580     __m256        mask_sin, mask_cos;
1581     __m256        tmp1, tmp2;
1582     __m256        tmp_sin, tmp_cos;
1583
1584     y               = _mm256_mul_ps(x, two_over_pi);
1585     y               = _mm256_add_ps(y, _mm256_or_ps(_mm256_and_ps(y, signbit), half));
1586
1587     iz              = _mm256_cvttps_epi32(y);
1588     z               = _mm256_round_ps(y, _MM_FROUND_TO_ZERO);
1589
1590     offset_sin      = _mm256_and_ps(_mm256_castsi256_ps(iz), mask_three);
1591
1592     iz_high         = _mm256_extractf128_si256(iz, 0x1);
1593     iz_low          = _mm256_castsi256_si128(iz);
1594     iz_low          = _mm_add_epi32(iz_low, ione);
1595     iz_high         = _mm_add_epi32(iz_high, ione);
1596     iz              = _mm256_castsi128_si256(iz_low);
1597     iz              = _mm256_insertf128_si256(iz, iz_high, 0x1);
1598     offset_cos      = _mm256_castsi256_ps(iz);
1599
1600     /* Extended precision arithmethic to achieve full precision */
1601     y               = _mm256_mul_ps(z, CA1);
1602     tmp1            = _mm256_mul_ps(z, CA2);
1603     tmp2            = _mm256_mul_ps(z, CA3);
1604     y               = _mm256_sub_ps(x, y);
1605     y               = _mm256_sub_ps(y, tmp1);
1606     y               = _mm256_sub_ps(y, tmp2);
1607
1608     y2              = _mm256_mul_ps(y, y);
1609
1610     tmp1            = _mm256_mul_ps(CC0, y2);
1611     tmp1            = _mm256_add_ps(tmp1, CC1);
1612     tmp2            = _mm256_mul_ps(CS0, y2);
1613     tmp2            = _mm256_add_ps(tmp2, CS1);
1614     tmp1            = _mm256_mul_ps(tmp1, y2);
1615     tmp1            = _mm256_add_ps(tmp1, CC2);
1616     tmp2            = _mm256_mul_ps(tmp2, y2);
1617     tmp2            = _mm256_add_ps(tmp2, CS2);
1618
1619     tmp1            = _mm256_mul_ps(tmp1, y2);
1620     tmp1            = _mm256_add_ps(tmp1, one);
1621
1622     tmp2            = _mm256_mul_ps(tmp2, _mm256_mul_ps(y, y2));
1623     tmp2            = _mm256_add_ps(tmp2, y);
1624
1625 #ifdef __INTEL_COMPILER
1626     /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
1627     mask_sin        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin, mask_one))), zero, _CMP_EQ_OQ);
1628     mask_cos        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos, mask_one))), zero, _CMP_EQ_OQ);
1629 #else
1630     mask_sin        = _mm256_cmp_ps( _mm256_and_ps(offset_sin, mask_one), zero, _CMP_EQ_OQ);
1631     mask_cos        = _mm256_cmp_ps( _mm256_and_ps(offset_cos, mask_one), zero, _CMP_EQ_OQ);
1632 #endif
1633     tmp_sin         = _mm256_blendv_ps(tmp1, tmp2, mask_sin);
1634     tmp_cos         = _mm256_blendv_ps(tmp1, tmp2, mask_cos);
1635
1636     tmp1            = _mm256_xor_ps(signbit, tmp_sin);
1637     tmp2            = _mm256_xor_ps(signbit, tmp_cos);
1638
1639 #ifdef __INTEL_COMPILER
1640     /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
1641     mask_sin        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin, mask_two))), zero, _CMP_EQ_OQ);
1642     mask_cos        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos, mask_two))), zero, _CMP_EQ_OQ);
1643 #else
1644     mask_sin        = _mm256_cmp_ps( _mm256_and_ps(offset_sin, mask_two), zero, _CMP_EQ_OQ);
1645     mask_cos        = _mm256_cmp_ps( _mm256_and_ps(offset_cos, mask_two), zero, _CMP_EQ_OQ);
1646
1647 #endif
1648     *sinval         = _mm256_blendv_ps(tmp1, tmp_sin, mask_sin);
1649     *cosval         = _mm256_blendv_ps(tmp2, tmp_cos, mask_cos);
1650
1651     return 0;
1652 }
1653
1654 static int
1655 gmx_mm_sincos_ps(__m128  x,
1656                  __m128 *sinval,
1657                  __m128 *cosval)
1658 {
1659     const __m128  two_over_pi = _mm_set1_ps(2.0/M_PI);
1660     const __m128  half        = _mm_set1_ps(0.5);
1661     const __m128  one         = _mm_set1_ps(1.0);
1662
1663     const __m128i izero      = _mm_set1_epi32(0);
1664     const __m128i ione       = _mm_set1_epi32(1);
1665     const __m128i itwo       = _mm_set1_epi32(2);
1666     const __m128i ithree     = _mm_set1_epi32(3);
1667     const __m128  signbit    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1668
1669     const __m128  CA1         = _mm_set1_ps(1.5703125f);
1670     const __m128  CA2         = _mm_set1_ps(4.837512969970703125e-4f);
1671     const __m128  CA3         = _mm_set1_ps(7.54978995489188216e-8f);
1672
1673     const __m128  CC0         = _mm_set1_ps(-0.0013602249f);
1674     const __m128  CC1         = _mm_set1_ps(0.0416566950f);
1675     const __m128  CC2         = _mm_set1_ps(-0.4999990225f);
1676     const __m128  CS0         = _mm_set1_ps(-0.0001950727f);
1677     const __m128  CS1         = _mm_set1_ps(0.0083320758f);
1678     const __m128  CS2         = _mm_set1_ps(-0.1666665247f);
1679
1680     __m128        y, y2;
1681     __m128        z;
1682     __m128i       iz;
1683     __m128i       offset_sin, offset_cos;
1684     __m128        tmp1, tmp2;
1685     __m128        mask_sin, mask_cos;
1686     __m128        tmp_sin, tmp_cos;
1687
1688     y          = _mm_mul_ps(x, two_over_pi);
1689     y          = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));
1690
1691     iz         = _mm_cvttps_epi32(y);
1692     z          = _mm_round_ps(y, _MM_FROUND_TO_ZERO);
1693
1694     offset_sin = _mm_and_si128(iz, ithree);
1695     offset_cos = _mm_add_epi32(iz, ione);
1696
1697     /* Extended precision arithmethic to achieve full precision */
1698     y               = _mm_mul_ps(z, CA1);
1699     tmp1            = _mm_mul_ps(z, CA2);
1700     tmp2            = _mm_mul_ps(z, CA3);
1701     y               = _mm_sub_ps(x, y);
1702     y               = _mm_sub_ps(y, tmp1);
1703     y               = _mm_sub_ps(y, tmp2);
1704
1705     y2              = _mm_mul_ps(y, y);
1706
1707     tmp1            = _mm_mul_ps(CC0, y2);
1708     tmp1            = _mm_add_ps(tmp1, CC1);
1709     tmp2            = _mm_mul_ps(CS0, y2);
1710     tmp2            = _mm_add_ps(tmp2, CS1);
1711     tmp1            = _mm_mul_ps(tmp1, y2);
1712     tmp1            = _mm_add_ps(tmp1, CC2);
1713     tmp2            = _mm_mul_ps(tmp2, y2);
1714     tmp2            = _mm_add_ps(tmp2, CS2);
1715
1716     tmp1            = _mm_mul_ps(tmp1, y2);
1717     tmp1            = _mm_add_ps(tmp1, one);
1718
1719     tmp2            = _mm_mul_ps(tmp2, _mm_mul_ps(y, y2));
1720     tmp2            = _mm_add_ps(tmp2, y);
1721
1722     mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
1723     mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));
1724
1725     tmp_sin         = _mm_blendv_ps(tmp1, tmp2, mask_sin);
1726     tmp_cos         = _mm_blendv_ps(tmp1, tmp2, mask_cos);
1727
1728     mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
1729     mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));
1730
1731     tmp1            = _mm_xor_ps(signbit, tmp_sin);
1732     tmp2            = _mm_xor_ps(signbit, tmp_cos);
1733
1734     *sinval         = _mm_blendv_ps(tmp1, tmp_sin, mask_sin);
1735     *cosval         = _mm_blendv_ps(tmp2, tmp_cos, mask_cos);
1736
1737     return 0;
1738 }
1739
1740
1741
1742
1743 /*
1744  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
1745  * will then call the sincos() routine and waste a factor 2 in performance!
1746  */
1747 static __m256
1748 gmx_mm256_sin_ps(__m256 x)
1749 {
1750     __m256 s, c;
1751     gmx_mm256_sincos_ps(x, &s, &c);
1752     return s;
1753 }
1754
1755 static __m128
1756 gmx_mm_sin_ps(__m128 x)
1757 {
1758     __m128 s, c;
1759     gmx_mm_sincos_ps(x, &s, &c);
1760     return s;
1761 }
1762
1763
1764 /*
1765  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
1766  * will then call the sincos() routine and waste a factor 2 in performance!
1767  */
1768 static __m256
1769 gmx_mm256_cos_ps(__m256 x)
1770 {
1771     __m256 s, c;
1772     gmx_mm256_sincos_ps(x, &s, &c);
1773     return c;
1774 }
1775
1776 static __m128
1777 gmx_mm_cos_ps(__m128 x)
1778 {
1779     __m128 s, c;
1780     gmx_mm_sincos_ps(x, &s, &c);
1781     return c;
1782 }
1783
1784
1785 static __m256
1786 gmx_mm256_tan_ps(__m256 x)
1787 {
1788     __m256 sinval, cosval;
1789     __m256 tanval;
1790
1791     gmx_mm256_sincos_ps(x, &sinval, &cosval);
1792
1793     tanval = _mm256_mul_ps(sinval, gmx_mm256_inv_ps(cosval));
1794
1795     return tanval;
1796 }
1797
1798 static __m128
1799 gmx_mm_tan_ps(__m128 x)
1800 {
1801     __m128 sinval, cosval;
1802     __m128 tanval;
1803
1804     gmx_mm_sincos_ps(x, &sinval, &cosval);
1805
1806     tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval));
1807
1808     return tanval;
1809 }
1810
1811
1812 static __m256
1813 gmx_mm256_asin_ps(__m256 x)
1814 {
1815     const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
1816     const __m256 limitlow  = _mm256_set1_ps(1e-4f);
1817     const __m256 half      = _mm256_set1_ps(0.5f);
1818     const __m256 one       = _mm256_set1_ps(1.0f);
1819     const __m256 halfpi    = _mm256_set1_ps((float)M_PI/2.0f);
1820
1821     const __m256 CC5        = _mm256_set1_ps(4.2163199048E-2f);
1822     const __m256 CC4        = _mm256_set1_ps(2.4181311049E-2f);
1823     const __m256 CC3        = _mm256_set1_ps(4.5470025998E-2f);
1824     const __m256 CC2        = _mm256_set1_ps(7.4953002686E-2f);
1825     const __m256 CC1        = _mm256_set1_ps(1.6666752422E-1f);
1826
1827     __m256       sign;
1828     __m256       mask;
1829     __m256       xabs;
1830     __m256       z, z1, z2, q, q1, q2;
1831     __m256       pA, pB;
1832
1833     sign  = _mm256_andnot_ps(signmask, x);
1834     xabs  = _mm256_and_ps(x, signmask);
1835
1836     mask  = _mm256_cmp_ps(xabs, half, _CMP_GT_OQ);
1837
1838     z1    = _mm256_mul_ps(half, _mm256_sub_ps(one, xabs));
1839     q1    = _mm256_mul_ps(z1, gmx_mm256_invsqrt_ps(z1));
1840     q1    = _mm256_andnot_ps(_mm256_cmp_ps(xabs, one, _CMP_EQ_OQ), q1);
1841
1842     q2    = xabs;
1843     z2    = _mm256_mul_ps(q2, q2);
1844
1845     z     = _mm256_blendv_ps(z2, z1, mask);
1846     q     = _mm256_blendv_ps(q2, q1, mask);
1847
1848     z2    = _mm256_mul_ps(z, z);
1849
1850     pA    = _mm256_mul_ps(CC5, z2);
1851     pB    = _mm256_mul_ps(CC4, z2);
1852
1853     pA    = _mm256_add_ps(pA, CC3);
1854     pB    = _mm256_add_ps(pB, CC2);
1855
1856     pA    = _mm256_mul_ps(pA, z2);
1857     pB    = _mm256_mul_ps(pB, z2);
1858
1859     pA    = _mm256_add_ps(pA, CC1);
1860     pA    = _mm256_mul_ps(pA, z);
1861
1862     z     = _mm256_add_ps(pA, pB);
1863     z     = _mm256_mul_ps(z, q);
1864     z     = _mm256_add_ps(z, q);
1865
1866     q2    = _mm256_sub_ps(halfpi, z);
1867     q2    = _mm256_sub_ps(q2, z);
1868
1869     z     = _mm256_blendv_ps(z, q2, mask);
1870
1871     mask  = _mm256_cmp_ps(xabs, limitlow, _CMP_GT_OQ);
1872     z     = _mm256_blendv_ps(xabs, z, mask);
1873
1874     z     = _mm256_xor_ps(z, sign);
1875
1876     return z;
1877 }
1878
1879 static __m128
1880 gmx_mm_asin_ps(__m128 x)
1881 {
1882     /* Same algorithm as cephes library */
1883     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
1884     const __m128 limitlow  = _mm_set1_ps(1e-4f);
1885     const __m128 half      = _mm_set1_ps(0.5f);
1886     const __m128 one       = _mm_set1_ps(1.0f);
1887     const __m128 halfpi    = _mm_set1_ps(M_PI/2.0f);
1888
1889     const __m128 CC5        = _mm_set1_ps(4.2163199048E-2f);
1890     const __m128 CC4        = _mm_set1_ps(2.4181311049E-2f);
1891     const __m128 CC3        = _mm_set1_ps(4.5470025998E-2f);
1892     const __m128 CC2        = _mm_set1_ps(7.4953002686E-2f);
1893     const __m128 CC1        = _mm_set1_ps(1.6666752422E-1f);
1894
1895     __m128       sign;
1896     __m128       mask;
1897     __m128       xabs;
1898     __m128       z, z1, z2, q, q1, q2;
1899     __m128       pA, pB;
1900
1901     sign  = _mm_andnot_ps(signmask, x);
1902     xabs  = _mm_and_ps(x, signmask);
1903
1904     mask  = _mm_cmpgt_ps(xabs, half);
1905
1906     z1    = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
1907     q1    = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
1908     q1    = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one), q1);
1909
1910     q2    = xabs;
1911     z2    = _mm_mul_ps(q2, q2);
1912
1913     z     = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
1914     q     = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );
1915
1916     z2    = _mm_mul_ps(z, z);
1917
1918     pA    = _mm_mul_ps(CC5, z2);
1919     pB    = _mm_mul_ps(CC4, z2);
1920
1921     pA    = _mm_add_ps(pA, CC3);
1922     pB    = _mm_add_ps(pB, CC2);
1923
1924     pA    = _mm_mul_ps(pA, z2);
1925     pB    = _mm_mul_ps(pB, z2);
1926
1927     pA    = _mm_add_ps(pA, CC1);
1928     pA    = _mm_mul_ps(pA, z);
1929
1930     z     = _mm_add_ps(pA, pB);
1931     z     = _mm_mul_ps(z, q);
1932     z     = _mm_add_ps(z, q);
1933
1934     q2    = _mm_sub_ps(halfpi, z);
1935     q2    = _mm_sub_ps(q2, z);
1936
1937     z     = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );
1938
1939     mask  = _mm_cmpgt_ps(xabs, limitlow);
1940     z     = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );
1941
1942     z = _mm_xor_ps(z, sign);
1943
1944     return z;
1945 }
1946
1947
1948 static __m256
1949 gmx_mm256_acos_ps(__m256 x)
1950 {
1951     const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
1952     const __m256 one_ps    = _mm256_set1_ps(1.0f);
1953     const __m256 half_ps   = _mm256_set1_ps(0.5f);
1954     const __m256 pi_ps     = _mm256_set1_ps((float)M_PI);
1955     const __m256 halfpi_ps = _mm256_set1_ps((float)M_PI/2.0f);
1956
1957     __m256       mask1;
1958     __m256       mask2;
1959     __m256       xabs;
1960     __m256       z, z1, z2, z3;
1961
1962     xabs  = _mm256_and_ps(x, signmask);
1963     mask1 = _mm256_cmp_ps(xabs, half_ps, _CMP_GT_OQ);
1964     mask2 = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_GT_OQ);
1965
1966     z     = _mm256_mul_ps(half_ps, _mm256_sub_ps(one_ps, xabs));
1967     z     = _mm256_mul_ps(z, gmx_mm256_invsqrt_ps(z));
1968     z     = _mm256_andnot_ps(_mm256_cmp_ps(xabs, one_ps, _CMP_EQ_OQ), z);
1969
1970     z     = _mm256_blendv_ps(x, z, mask1);
1971     z     = gmx_mm256_asin_ps(z);
1972
1973     z2    = _mm256_add_ps(z, z);
1974     z1    = _mm256_sub_ps(pi_ps, z2);
1975     z3    = _mm256_sub_ps(halfpi_ps, z);
1976
1977     z     = _mm256_blendv_ps(z1, z2, mask2);
1978     z     = _mm256_blendv_ps(z3, z, mask1);
1979
1980     return z;
1981 }
1982
1983 static __m128
1984 gmx_mm_acos_ps(__m128 x)
1985 {
1986     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
1987     const __m128 one_ps    = _mm_set1_ps(1.0f);
1988     const __m128 half_ps   = _mm_set1_ps(0.5f);
1989     const __m128 pi_ps     = _mm_set1_ps(M_PI);
1990     const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
1991
1992     __m128       mask1;
1993     __m128       mask2;
1994     __m128       xabs;
1995     __m128       z, z1, z2, z3;
1996
1997     xabs  = _mm_and_ps(x, signmask);
1998     mask1 = _mm_cmpgt_ps(xabs, half_ps);
1999     mask2 = _mm_cmpgt_ps(x, _mm_setzero_ps());
2000
2001     z     = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
2002     z     = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
2003     z     = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one_ps), z);
2004
2005     z     = _mm_blendv_ps(x, z, mask1);
2006     z     = gmx_mm_asin_ps(z);
2007
2008     z2    = _mm_add_ps(z, z);
2009     z1    = _mm_sub_ps(pi_ps, z2);
2010     z3    = _mm_sub_ps(halfpi_ps, z);
2011
2012     z     = _mm_blendv_ps(z1, z2, mask2);
2013     z     = _mm_blendv_ps(z3, z, mask1);
2014
2015     return z;
2016 }
2017
2018
2019 static __m256
2020 gmx_mm256_atan_ps(__m256 x)
2021 {
2022     const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
2023     const __m256 limit1    = _mm256_set1_ps(0.414213562373095f);
2024     const __m256 limit2    = _mm256_set1_ps(2.414213562373095f);
2025     const __m256 quarterpi = _mm256_set1_ps(0.785398163397448f);
2026     const __m256 halfpi    = _mm256_set1_ps(1.570796326794896f);
2027     const __m256 mone      = _mm256_set1_ps(-1.0f);
2028     const __m256 CC3       = _mm256_set1_ps(-3.33329491539E-1f);
2029     const __m256 CC5       = _mm256_set1_ps(1.99777106478E-1f);
2030     const __m256 CC7       = _mm256_set1_ps(-1.38776856032E-1);
2031     const __m256 CC9       = _mm256_set1_ps(8.05374449538e-2f);
2032
2033     __m256       sign;
2034     __m256       mask1, mask2;
2035     __m256       y, z1, z2;
2036     __m256       x2, x4;
2037     __m256       sum1, sum2;
2038
2039     sign  = _mm256_andnot_ps(signmask, x);
2040     x     = _mm256_and_ps(x, signmask);
2041
2042     mask1 = _mm256_cmp_ps(x, limit1, _CMP_GT_OQ);
2043     mask2 = _mm256_cmp_ps(x, limit2, _CMP_GT_OQ);
2044
2045     z1    = _mm256_mul_ps(_mm256_add_ps(x, mone), gmx_mm256_inv_ps(_mm256_sub_ps(x, mone)));
2046     z2    = _mm256_mul_ps(mone, gmx_mm256_inv_ps(x));
2047
2048     y     = _mm256_and_ps(mask1, quarterpi);
2049     y     = _mm256_blendv_ps(y, halfpi, mask2);
2050
2051     x     = _mm256_blendv_ps(x, z1, mask1);
2052     x     = _mm256_blendv_ps(x, z2, mask2);
2053
2054     x2    = _mm256_mul_ps(x, x);
2055     x4    = _mm256_mul_ps(x2, x2);
2056
2057     sum1  = _mm256_mul_ps(CC9, x4);
2058     sum2  = _mm256_mul_ps(CC7, x4);
2059     sum1  = _mm256_add_ps(sum1, CC5);
2060     sum2  = _mm256_add_ps(sum2, CC3);
2061     sum1  = _mm256_mul_ps(sum1, x4);
2062     sum2  = _mm256_mul_ps(sum2, x2);
2063
2064     sum1  = _mm256_add_ps(sum1, sum2);
2065     sum1  = _mm256_sub_ps(sum1, mone);
2066     sum1  = _mm256_mul_ps(sum1, x);
2067     y     = _mm256_add_ps(y, sum1);
2068
2069     y     = _mm256_xor_ps(y, sign);
2070
2071     return y;
2072 }
2073
2074 static __m128
2075 gmx_mm_atan_ps(__m128 x)
2076 {
2077     /* Same algorithm as cephes library */
2078     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
2079     const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
2080     const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
2081     const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
2082     const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
2083     const __m128 mone      = _mm_set1_ps(-1.0f);
2084     const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
2085     const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
2086     const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1);
2087     const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);
2088
2089     __m128       sign;
2090     __m128       mask1, mask2;
2091     __m128       y, z1, z2;
2092     __m128       x2, x4;
2093     __m128       sum1, sum2;
2094
2095     sign  = _mm_andnot_ps(signmask, x);
2096     x     = _mm_and_ps(x, signmask);
2097
2098     mask1 = _mm_cmpgt_ps(x, limit1);
2099     mask2 = _mm_cmpgt_ps(x, limit2);
2100
2101     z1    = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
2102     z2    = _mm_mul_ps(mone, gmx_mm_inv_ps(x));
2103
2104     y     = _mm_and_ps(mask1, quarterpi);
2105     y     = _mm_blendv_ps(y, halfpi, mask2);
2106
2107     x     = _mm_blendv_ps(x, z1, mask1);
2108     x     = _mm_blendv_ps(x, z2, mask2);
2109
2110     x2    = _mm_mul_ps(x, x);
2111     x4    = _mm_mul_ps(x2, x2);
2112
2113     sum1  = _mm_mul_ps(CC9, x4);
2114     sum2  = _mm_mul_ps(CC7, x4);
2115     sum1  = _mm_add_ps(sum1, CC5);
2116     sum2  = _mm_add_ps(sum2, CC3);
2117     sum1  = _mm_mul_ps(sum1, x4);
2118     sum2  = _mm_mul_ps(sum2, x2);
2119
2120     sum1  = _mm_add_ps(sum1, sum2);
2121     sum1  = _mm_sub_ps(sum1, mone);
2122     sum1  = _mm_mul_ps(sum1, x);
2123     y     = _mm_add_ps(y, sum1);
2124
2125     y     = _mm_xor_ps(y, sign);
2126
2127     return y;
2128 }
2129
2130
2131 static __m256
2132 gmx_mm256_atan2_ps(__m256 y, __m256 x)
2133 {
2134     const __m256 pi          = _mm256_set1_ps( (float) M_PI);
2135     const __m256 minuspi     = _mm256_set1_ps( (float) -M_PI);
2136     const __m256 halfpi      = _mm256_set1_ps( (float) M_PI/2.0f);
2137     const __m256 minushalfpi = _mm256_set1_ps( (float) -M_PI/2.0f);
2138
2139     __m256       z, z1, z3, z4;
2140     __m256       w;
2141     __m256       maskx_lt, maskx_eq;
2142     __m256       masky_lt, masky_eq;
2143     __m256       mask1, mask2, mask3, mask4, maskall;
2144
2145     maskx_lt  = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
2146     masky_lt  = _mm256_cmp_ps(y, _mm256_setzero_ps(), _CMP_LT_OQ);
2147     maskx_eq  = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
2148     masky_eq  = _mm256_cmp_ps(y, _mm256_setzero_ps(), _CMP_EQ_OQ);
2149
2150     z         = _mm256_mul_ps(y, gmx_mm256_inv_ps(x));
2151     z         = gmx_mm256_atan_ps(z);
2152
2153     mask1     = _mm256_and_ps(maskx_eq, masky_lt);
2154     mask2     = _mm256_andnot_ps(maskx_lt, masky_eq);
2155     mask3     = _mm256_andnot_ps( _mm256_or_ps(masky_lt, masky_eq), maskx_eq);
2156     mask4     = _mm256_and_ps(maskx_lt, masky_eq);
2157     maskall   = _mm256_or_ps( _mm256_or_ps(mask1, mask2), _mm256_or_ps(mask3, mask4) );
2158
2159     z         = _mm256_andnot_ps(maskall, z);
2160     z1        = _mm256_and_ps(mask1, minushalfpi);
2161     z3        = _mm256_and_ps(mask3, halfpi);
2162     z4        = _mm256_and_ps(mask4, pi);
2163
2164     z         = _mm256_or_ps( _mm256_or_ps(z, z1), _mm256_or_ps(z3, z4) );
2165
2166     w         = _mm256_blendv_ps(pi, minuspi, masky_lt);
2167     w         = _mm256_and_ps(w, maskx_lt);
2168
2169     w         = _mm256_andnot_ps(maskall, w);
2170
2171     z         = _mm256_add_ps(z, w);
2172
2173     return z;
2174 }
2175
2176 static __m128
2177 gmx_mm_atan2_ps(__m128 y, __m128 x)
2178 {
2179     const __m128 pi          = _mm_set1_ps(M_PI);
2180     const __m128 minuspi     = _mm_set1_ps(-M_PI);
2181     const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
2182     const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
2183
2184     __m128       z, z1, z3, z4;
2185     __m128       w;
2186     __m128       maskx_lt, maskx_eq;
2187     __m128       masky_lt, masky_eq;
2188     __m128       mask1, mask2, mask3, mask4, maskall;
2189
2190     maskx_lt  = _mm_cmplt_ps(x, _mm_setzero_ps());
2191     masky_lt  = _mm_cmplt_ps(y, _mm_setzero_ps());
2192     maskx_eq  = _mm_cmpeq_ps(x, _mm_setzero_ps());
2193     masky_eq  = _mm_cmpeq_ps(y, _mm_setzero_ps());
2194
2195     z         = _mm_mul_ps(y, gmx_mm_inv_ps(x));
2196     z         = gmx_mm_atan_ps(z);
2197
2198     mask1     = _mm_and_ps(maskx_eq, masky_lt);
2199     mask2     = _mm_andnot_ps(maskx_lt, masky_eq);
2200     mask3     = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
2201     mask4     = _mm_and_ps(masky_eq, maskx_lt);
2202
2203     maskall   = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );
2204
2205     z         = _mm_andnot_ps(maskall, z);
2206     z1        = _mm_and_ps(mask1, minushalfpi);
2207     z3        = _mm_and_ps(mask3, halfpi);
2208     z4        = _mm_and_ps(mask4, pi);
2209
2210     z         = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );
2211
2212     mask1     = _mm_andnot_ps(masky_lt, maskx_lt);
2213     mask2     = _mm_and_ps(maskx_lt, masky_lt);
2214
2215     w         = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
2216     w         = _mm_andnot_ps(maskall, w);
2217
2218     z         = _mm_add_ps(z, w);
2219
2220     return z;
2221 }
2222
2223 #endif