src/gromacs/legacyheaders/gmx_math_x86_avx_256_single.h

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  * This file is part of GROMACS.
   5  * Copyright (c) 2012-
   6  *
   7  * Written by the Gromacs development team under coordination of
   8  * David van der Spoel, Berk Hess, and Erik Lindahl.
   9  *
  10  * This library is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2
  13  * of the License, or (at your option) any later version.
  14  *
  15  * To help us fund GROMACS development, we humbly ask that you cite
  16  * the research papers on the package. Check out http://www.gromacs.org
  17  *
  18  * And Hey:
  19  * Gnomes, ROck Monsters And Chili Sauce
  20  */
  21 #ifndef _gmx_math_x86_avx_256_single_h_
  22 #define _gmx_math_x86_avx_256_single_h_
  23
  24 #include <math.h>
  25
  26 #include "gmx_x86_avx_256.h"
  27
  28 #ifndef M_PI
  29 #  define M_PI 3.14159265358979323846264338327950288
  30 #endif
  31
  32
  33
  34 /************************
  35  *                      *
  36  * Simple math routines *
  37  *                      *
  38  ************************/
  39
  40 /* 1.0/sqrt(x), 256-bit wide version */
  41 static gmx_inline __m256
  42 gmx_mm256_invsqrt_ps(__m256 x)
  43 {
  44     const __m256 half  = _mm256_set1_ps(0.5f);
  45     const __m256 three = _mm256_set1_ps(3.0f);
  46
  47     __m256       lu = _mm256_rsqrt_ps(x);
  48
  49     return _mm256_mul_ps(half, _mm256_mul_ps(_mm256_sub_ps(three, _mm256_mul_ps(_mm256_mul_ps(lu, lu), x)), lu));
  50 }
  51
  52 /* 1.0/sqrt(x), 128-bit wide version */
  53 static gmx_inline __m128
  54 gmx_mm_invsqrt_ps(__m128 x)
  55 {
  56     const __m128 half  = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
  57     const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);
  58
  59     __m128       lu = _mm_rsqrt_ps(x);
  60
  61     return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
  62 }
  63
  64
  65 /* sqrt(x) (256 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
  66 static gmx_inline __m256
  67 gmx_mm256_sqrt_ps(__m256 x)
  68 {
  69     __m256 mask;
  70     __m256 res;
  71
  72     mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
  73     res  = _mm256_andnot_ps(mask, gmx_mm256_invsqrt_ps(x));
  74
  75     res  = _mm256_mul_ps(x, res);
  76
  77     return res;
  78 }
  79
  80 /* sqrt(x) (128 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
  81 static gmx_inline __m128
  82 gmx_mm_sqrt_ps(__m128 x)
  83 {
  84     __m128 mask;
  85     __m128 res;
  86
  87     mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
  88     res  = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));
  89
  90     res  = _mm_mul_ps(x, res);
  91
  92     return res;
  93 }
  94
  95
  96 /* 1.0/x, 256-bit wide */
  97 static gmx_inline __m256
  98 gmx_mm256_inv_ps(__m256 x)
  99 {
 100     const __m256 two = _mm256_set1_ps(2.0f);
 101
 102     __m256       lu = _mm256_rcp_ps(x);
 103
 104     return _mm256_mul_ps(lu, _mm256_sub_ps(two, _mm256_mul_ps(lu, x)));
 105 }
 106
 107 /* 1.0/x, 128-bit wide */
 108 static gmx_inline __m128
 109 gmx_mm_inv_ps(__m128 x)
 110 {
 111     const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
 112
 113     __m128       lu = _mm_rcp_ps(x);
 114
 115     return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
 116 }
 117
 118
 119 static gmx_inline __m256
 120 gmx_mm256_abs_ps(__m256 x)
 121 {
 122     const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
 123
 124     return _mm256_and_ps(x, signmask);
 125 }
 126
 127 static gmx_inline __m128
 128 gmx_mm_abs_ps(__m128 x)
 129 {
 130     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
 131
 132     return _mm_and_ps(x, signmask);
 133 }
 134
 135
 136 static __m256
 137 gmx_mm256_log_ps(__m256 x)
 138 {
 139     const __m256  expmask    = _mm256_castsi256_ps( _mm256_set1_epi32(0x7F800000) );
 140     const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
 141     const __m256  half       = _mm256_set1_ps(0.5f);
 142     const __m256  one        = _mm256_set1_ps(1.0f);
 143     const __m256  invsq2     = _mm256_set1_ps(1.0f/sqrt(2.0f));
 144     const __m256  corr1      = _mm256_set1_ps(-2.12194440e-4f);
 145     const __m256  corr2      = _mm256_set1_ps(0.693359375f);
 146
 147     const __m256  CA_1        = _mm256_set1_ps(0.070376836292f);
 148     const __m256  CB_0        = _mm256_set1_ps(1.6714950086782716f);
 149     const __m256  CB_1        = _mm256_set1_ps(-2.452088066061482f);
 150     const __m256  CC_0        = _mm256_set1_ps(1.5220770854701728f);
 151     const __m256  CC_1        = _mm256_set1_ps(-1.3422238433233642f);
 152     const __m256  CD_0        = _mm256_set1_ps(1.386218787509749f);
 153     const __m256  CD_1        = _mm256_set1_ps(0.35075468953796346f);
 154     const __m256  CE_0        = _mm256_set1_ps(1.3429983063133937f);
 155     const __m256  CE_1        = _mm256_set1_ps(1.807420826584643f);
 156
 157     __m256        fexp;
 158     __m256i       iexp;
 159     __m128i       iexp128a, iexp128b;
 160     __m256        mask;
 161     __m256i       imask;
 162     __m128i       imask128a, imask128b;
 163     __m256        x2;
 164     __m256        y;
 165     __m256        pA, pB, pC, pD, pE, tB, tC, tD, tE;
 166
 167     /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
 168     fexp  = _mm256_and_ps(x, expmask);
 169     iexp  = _mm256_castps_si256(fexp);
 170
 171     iexp128b = _mm256_extractf128_si256(iexp, 0x1);
 172     iexp128a = _mm256_castsi256_si128(iexp);
 173
 174     iexp128a  = _mm_srli_epi32(iexp128a, 23);
 175     iexp128b  = _mm_srli_epi32(iexp128b, 23);
 176     iexp128a  = _mm_sub_epi32(iexp128a, expbase_m1);
 177     iexp128b  = _mm_sub_epi32(iexp128b, expbase_m1);
 178
 179     x     = _mm256_andnot_ps(expmask, x);
 180     x     = _mm256_or_ps(x, one);
 181     x     = _mm256_mul_ps(x, half);
 182
 183     mask  = _mm256_cmp_ps(x, invsq2, _CMP_LT_OQ);
 184
 185     x     = _mm256_add_ps(x, _mm256_and_ps(mask, x));
 186     x     = _mm256_sub_ps(x, one);
 187
 188     imask = _mm256_castps_si256(mask);
 189
 190     imask128b = _mm256_extractf128_si256(imask, 0x1);
 191     imask128a = _mm256_castsi256_si128(imask);
 192
 193     iexp128a  = _mm_add_epi32(iexp128a, imask128a);
 194     iexp128b  = _mm_add_epi32(iexp128b, imask128b);
 195
 196     iexp  = _mm256_castsi128_si256(iexp128a);
 197     iexp  = _mm256_insertf128_si256(iexp, iexp128b, 0x1);
 198
 199     x2    = _mm256_mul_ps(x, x);
 200
 201     pA    = _mm256_mul_ps(CA_1, x);
 202     pB    = _mm256_mul_ps(CB_1, x);
 203     pC    = _mm256_mul_ps(CC_1, x);
 204     pD    = _mm256_mul_ps(CD_1, x);
 205     pE    = _mm256_mul_ps(CE_1, x);
 206     tB    = _mm256_add_ps(CB_0, x2);
 207     tC    = _mm256_add_ps(CC_0, x2);
 208     tD    = _mm256_add_ps(CD_0, x2);
 209     tE    = _mm256_add_ps(CE_0, x2);
 210     pB    = _mm256_add_ps(pB, tB);
 211     pC    = _mm256_add_ps(pC, tC);
 212     pD    = _mm256_add_ps(pD, tD);
 213     pE    = _mm256_add_ps(pE, tE);
 214
 215     pA    = _mm256_mul_ps(pA, pB);
 216     pC    = _mm256_mul_ps(pC, pD);
 217     pE    = _mm256_mul_ps(pE, x2);
 218     pA    = _mm256_mul_ps(pA, pC);
 219     y     = _mm256_mul_ps(pA, pE);
 220
 221     fexp  = _mm256_cvtepi32_ps(iexp);
 222     y     = _mm256_add_ps(y, _mm256_mul_ps(fexp, corr1));
 223
 224     y     = _mm256_sub_ps(y, _mm256_mul_ps(half, x2));
 225     x2    = _mm256_add_ps(x, y);
 226
 227     x2    = _mm256_add_ps(x2, _mm256_mul_ps(fexp, corr2));
 228
 229     return x2;
 230 }
 231
 232
 233 static __m128
 234 gmx_mm_log_ps(__m128 x)
 235 {
 236     /* Same algorithm as cephes library */
 237     const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
 238     const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
 239     const __m128  half       = _mm_set1_ps(0.5f);
 240     const __m128  one        = _mm_set1_ps(1.0f);
 241     const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
 242     const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
 243     const __m128  corr2      = _mm_set1_ps(0.693359375f);
 244
 245     const __m128  CA_1        = _mm_set1_ps(0.070376836292f);
 246     const __m128  CB_0        = _mm_set1_ps(1.6714950086782716f);
 247     const __m128  CB_1        = _mm_set1_ps(-2.452088066061482f);
 248     const __m128  CC_0        = _mm_set1_ps(1.5220770854701728f);
 249     const __m128  CC_1        = _mm_set1_ps(-1.3422238433233642f);
 250     const __m128  CD_0        = _mm_set1_ps(1.386218787509749f);
 251     const __m128  CD_1        = _mm_set1_ps(0.35075468953796346f);
 252     const __m128  CE_0        = _mm_set1_ps(1.3429983063133937f);
 253     const __m128  CE_1        = _mm_set1_ps(1.807420826584643f);
 254
 255     __m128        fexp;
 256     __m128i       iexp;
 257     __m128        mask;
 258     __m128        x2;
 259     __m128        y;
 260     __m128        pA, pB, pC, pD, pE, tB, tC, tD, tE;
 261
 262     /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
 263     fexp  = _mm_and_ps(x, expmask);
 264     iexp  = gmx_mm_castps_si128(fexp);
 265     iexp  = _mm_srli_epi32(iexp, 23);
 266     iexp  = _mm_sub_epi32(iexp, expbase_m1);
 267
 268     x     = _mm_andnot_ps(expmask, x);
 269     x     = _mm_or_ps(x, one);
 270     x     = _mm_mul_ps(x, half);
 271
 272     mask  = _mm_cmplt_ps(x, invsq2);
 273
 274     x     = _mm_add_ps(x, _mm_and_ps(mask, x));
 275     x     = _mm_sub_ps(x, one);
 276     iexp  = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
 277
 278     x2    = _mm_mul_ps(x, x);
 279
 280     pA    = _mm_mul_ps(CA_1, x);
 281     pB    = _mm_mul_ps(CB_1, x);
 282     pC    = _mm_mul_ps(CC_1, x);
 283     pD    = _mm_mul_ps(CD_1, x);
 284     pE    = _mm_mul_ps(CE_1, x);
 285     tB    = _mm_add_ps(CB_0, x2);
 286     tC    = _mm_add_ps(CC_0, x2);
 287     tD    = _mm_add_ps(CD_0, x2);
 288     tE    = _mm_add_ps(CE_0, x2);
 289     pB    = _mm_add_ps(pB, tB);
 290     pC    = _mm_add_ps(pC, tC);
 291     pD    = _mm_add_ps(pD, tD);
 292     pE    = _mm_add_ps(pE, tE);
 293
 294     pA    = _mm_mul_ps(pA, pB);
 295     pC    = _mm_mul_ps(pC, pD);
 296     pE    = _mm_mul_ps(pE, x2);
 297     pA    = _mm_mul_ps(pA, pC);
 298     y     = _mm_mul_ps(pA, pE);
 299
 300     fexp  = _mm_cvtepi32_ps(iexp);
 301     y     = _mm_add_ps(y, _mm_mul_ps(fexp, corr1));
 302
 303     y     = _mm_sub_ps(y, _mm_mul_ps(half, x2));
 304     x2    = _mm_add_ps(x, y);
 305
 306     x2    = _mm_add_ps(x2, _mm_mul_ps(fexp, corr2));
 307
 308     return x2;
 309 }
 310
 311
 312 /*
 313  * 2^x function, 256-bit wide
 314  *
 315  * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
 316  * [-0.5,0.5]. The coefficiencts of this was derived in Mathematica using the command:
 317  *
 318  * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
 319  *
 320  * The largest-magnitude exponent we can represent in IEEE single-precision binary format
 321  * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
 322  * result to zero if the argument falls outside this range. For small numbers this is just fine, but
 323  * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
 324  * number instead. That would take a few extra cycles and not really help, since something is
 325  * wrong if you are using single precision to work with numbers that cannot really be represented
 326  * in single precision.
 327  *
 328  * The accuracy is at least 23 bits.
 329  */
 330 static __m256
 331 gmx_mm256_exp2_ps(__m256 x)
 332 {
 333     /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
 334     const __m256  arglimit = _mm256_set1_ps(126.0f);
 335
 336     const __m128i expbase  = _mm_set1_epi32(127);
 337     const __m256  CC6      = _mm256_set1_ps(1.535336188319500E-004);
 338     const __m256  CC5      = _mm256_set1_ps(1.339887440266574E-003);
 339     const __m256  CC4      = _mm256_set1_ps(9.618437357674640E-003);
 340     const __m256  CC3      = _mm256_set1_ps(5.550332471162809E-002);
 341     const __m256  CC2      = _mm256_set1_ps(2.402264791363012E-001);
 342     const __m256  CC1      = _mm256_set1_ps(6.931472028550421E-001);
 343     const __m256  CC0      = _mm256_set1_ps(1.0f);
 344
 345     __m256        p0, p1;
 346     __m256        valuemask;
 347     __m256i       iexppart;
 348     __m128i       iexppart128a, iexppart128b;
 349     __m256        fexppart;
 350     __m256        intpart;
 351     __m256        x2;
 352
 353
 354     iexppart  = _mm256_cvtps_epi32(x);
 355     intpart   = _mm256_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
 356
 357     iexppart128b = _mm256_extractf128_si256(iexppart, 0x1);
 358     iexppart128a = _mm256_castsi256_si128(iexppart);
 359
 360     iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a, expbase), 23);
 361     iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b, expbase), 23);
 362
 363     iexppart  = _mm256_castsi128_si256(iexppart128a);
 364     iexppart  = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);
 365     valuemask = _mm256_cmp_ps(arglimit, gmx_mm256_abs_ps(x), _CMP_GE_OQ);
 366     fexppart  = _mm256_and_ps(valuemask, _mm256_castsi256_ps(iexppart));
 367
 368     x         = _mm256_sub_ps(x, intpart);
 369     x2        = _mm256_mul_ps(x, x);
 370
 371     p0        = _mm256_mul_ps(CC6, x2);
 372     p1        = _mm256_mul_ps(CC5, x2);
 373     p0        = _mm256_add_ps(p0, CC4);
 374     p1        = _mm256_add_ps(p1, CC3);
 375     p0        = _mm256_mul_ps(p0, x2);
 376     p1        = _mm256_mul_ps(p1, x2);
 377     p0        = _mm256_add_ps(p0, CC2);
 378     p1        = _mm256_add_ps(p1, CC1);
 379     p0        = _mm256_mul_ps(p0, x2);
 380     p1        = _mm256_mul_ps(p1, x);
 381     p0        = _mm256_add_ps(p0, CC0);
 382     p0        = _mm256_add_ps(p0, p1);
 383     x         = _mm256_mul_ps(p0, fexppart);
 384
 385     return x;
 386 }
 387
 388
 389 /* 2^x, 128 bit wide */
 390 static __m128
 391 gmx_mm_exp2_ps(__m128 x)
 392 {
 393     /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
 394     const __m128  arglimit = _mm_set1_ps(126.0f);
 395
 396     const __m128i expbase  = _mm_set1_epi32(127);
 397     const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
 398     const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
 399     const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
 400     const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
 401     const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
 402     const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
 403     const __m128  CA0      = _mm_set1_ps(1.0f);
 404
 405     __m128        valuemask;
 406     __m128i       iexppart;
 407     __m128        fexppart;
 408     __m128        intpart;
 409     __m128        x2;
 410     __m128        p0, p1;
 411
 412     iexppart  = _mm_cvtps_epi32(x);
 413     intpart   = _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
 414     iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
 415     valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(x));
 416     fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
 417
 418     x         = _mm_sub_ps(x, intpart);
 419     x2        = _mm_mul_ps(x, x);
 420
 421     p0        = _mm_mul_ps(CA6, x2);
 422     p1        = _mm_mul_ps(CA5, x2);
 423     p0        = _mm_add_ps(p0, CA4);
 424     p1        = _mm_add_ps(p1, CA3);
 425     p0        = _mm_mul_ps(p0, x2);
 426     p1        = _mm_mul_ps(p1, x2);
 427     p0        = _mm_add_ps(p0, CA2);
 428     p1        = _mm_add_ps(p1, CA1);
 429     p0        = _mm_mul_ps(p0, x2);
 430     p1        = _mm_mul_ps(p1, x);
 431     p0        = _mm_add_ps(p0, CA0);
 432     p0        = _mm_add_ps(p0, p1);
 433     x         = _mm_mul_ps(p0, fexppart);
 434
 435     return x;
 436 }
 437
 438
 439 /* Exponential function, 256 bit wide. This could be calculated from 2^x as Exp(x)=2^(y),
 440  * where y=log2(e)*x, but there will then be a small rounding error since we lose some
 441  * precision due to the multiplication. This will then be magnified a lot by the exponential.
 442  *
 443  * Instead, we calculate the fractional part directly as a minimax approximation of
 444  * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
 445  * remaining after 2^y, which avoids the precision-loss.
 446  * The final result is correct to within 1 LSB over the entire argument range.
 447  */
 448 static __m256
 449 gmx_mm256_exp_ps(__m256 exparg)
 450 {
 451     const __m256  argscale      = _mm256_set1_ps(1.44269504088896341f);
 452     /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
 453     const __m256  arglimit      = _mm256_set1_ps(126.0f);
 454     const __m128i expbase       = _mm_set1_epi32(127);
 455
 456     const __m256  invargscale0  = _mm256_set1_ps(0.693359375f);
 457     const __m256  invargscale1  = _mm256_set1_ps(-2.12194440e-4f);
 458
 459     const __m256  CE5           = _mm256_set1_ps(1.9875691500e-4f);
 460     const __m256  CE4           = _mm256_set1_ps(1.3981999507e-3f);
 461     const __m256  CE3           = _mm256_set1_ps(8.3334519073e-3f);
 462     const __m256  CE2           = _mm256_set1_ps(4.1665795894e-2f);
 463     const __m256  CE1           = _mm256_set1_ps(1.6666665459e-1f);
 464     const __m256  CE0           = _mm256_set1_ps(5.0000001201e-1f);
 465     const __m256  one           = _mm256_set1_ps(1.0f);
 466
 467     __m256        exparg2, exp2arg;
 468     __m256        pE0, pE1;
 469     __m256        valuemask;
 470     __m256i       iexppart;
 471     __m128i       iexppart128a, iexppart128b;
 472     __m256        fexppart;
 473     __m256        intpart;
 474
 475     exp2arg = _mm256_mul_ps(exparg, argscale);
 476
 477     iexppart  = _mm256_cvtps_epi32(exp2arg);
 478     intpart   = _mm256_round_ps(exp2arg, _MM_FROUND_TO_NEAREST_INT);
 479
 480     iexppart128b = _mm256_extractf128_si256(iexppart, 0x1);
 481     iexppart128a = _mm256_castsi256_si128(iexppart);
 482
 483     iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a, expbase), 23);
 484     iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b, expbase), 23);
 485
 486     iexppart  = _mm256_castsi128_si256(iexppart128a);
 487     iexppart  = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);
 488     valuemask = _mm256_cmp_ps(arglimit, gmx_mm256_abs_ps(exp2arg), _CMP_GE_OQ);
 489     fexppart  = _mm256_and_ps(valuemask, _mm256_castsi256_ps(iexppart));
 490
 491     /* Extended precision arithmetics */
 492     exparg    = _mm256_sub_ps(exparg, _mm256_mul_ps(invargscale0, intpart));
 493     exparg    = _mm256_sub_ps(exparg, _mm256_mul_ps(invargscale1, intpart));
 494
 495     exparg2   = _mm256_mul_ps(exparg, exparg);
 496
 497     pE1       = _mm256_mul_ps(CE5, exparg2);
 498     pE0       = _mm256_mul_ps(CE4, exparg2);
 499     pE1       = _mm256_add_ps(pE1, CE3);
 500     pE0       = _mm256_add_ps(pE0, CE2);
 501     pE1       = _mm256_mul_ps(pE1, exparg2);
 502     pE0       = _mm256_mul_ps(pE0, exparg2);
 503     pE1       = _mm256_add_ps(pE1, CE1);
 504     pE0       = _mm256_add_ps(pE0, CE0);
 505     pE1       = _mm256_mul_ps(pE1, exparg);
 506     pE0       = _mm256_add_ps(pE0, pE1);
 507     pE0       = _mm256_mul_ps(pE0, exparg2);
 508     exparg    = _mm256_add_ps(exparg, one);
 509     exparg    = _mm256_add_ps(exparg, pE0);
 510
 511     exparg    = _mm256_mul_ps(exparg, fexppart);
 512
 513     return exparg;
 514 }
 515
 516
 517 /* exp(), 128 bit wide. */
 518 static __m128
 519 gmx_mm_exp_ps(__m128 x)
 520 {
 521     const __m128  argscale      = _mm_set1_ps(1.44269504088896341f);
 522     /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
 523     const __m128  arglimit      = _mm_set1_ps(126.0f);
 524     const __m128i expbase       = _mm_set1_epi32(127);
 525
 526     const __m128  invargscale0  = _mm_set1_ps(0.693359375f);
 527     const __m128  invargscale1  = _mm_set1_ps(-2.12194440e-4f);
 528
 529     const __m128  CC5           = _mm_set1_ps(1.9875691500e-4f);
 530     const __m128  CC4           = _mm_set1_ps(1.3981999507e-3f);
 531     const __m128  CC3           = _mm_set1_ps(8.3334519073e-3f);
 532     const __m128  CC2           = _mm_set1_ps(4.1665795894e-2f);
 533     const __m128  CC1           = _mm_set1_ps(1.6666665459e-1f);
 534     const __m128  CC0           = _mm_set1_ps(5.0000001201e-1f);
 535     const __m128  one           = _mm_set1_ps(1.0f);
 536
 537     __m128        y, x2;
 538     __m128        p0, p1;
 539     __m128        valuemask;
 540     __m128i       iexppart;
 541     __m128        fexppart;
 542     __m128        intpart;
 543
 544     y = _mm_mul_ps(x, argscale);
 545
 546     iexppart  = _mm_cvtps_epi32(y);
 547     intpart   = _mm_round_ps(y, _MM_FROUND_TO_NEAREST_INT);
 548
 549     iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
 550     valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(y));
 551     fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
 552
 553     /* Extended precision arithmetics */
 554     x         = _mm_sub_ps(x, _mm_mul_ps(invargscale0, intpart));
 555     x         = _mm_sub_ps(x, _mm_mul_ps(invargscale1, intpart));
 556
 557     x2        = _mm_mul_ps(x, x);
 558
 559     p1        = _mm_mul_ps(CC5, x2);
 560     p0        = _mm_mul_ps(CC4, x2);
 561     p1        = _mm_add_ps(p1, CC3);
 562     p0        = _mm_add_ps(p0, CC2);
 563     p1        = _mm_mul_ps(p1, x2);
 564     p0        = _mm_mul_ps(p0, x2);
 565     p1        = _mm_add_ps(p1, CC1);
 566     p0        = _mm_add_ps(p0, CC0);
 567     p1        = _mm_mul_ps(p1, x);
 568     p0        = _mm_add_ps(p0, p1);
 569     p0        = _mm_mul_ps(p0, x2);
 570     x         = _mm_add_ps(x, one);
 571     x         = _mm_add_ps(x, p0);
 572
 573     x         = _mm_mul_ps(x, fexppart);
 574
 575     return x;
 576 }
 577
 578
 579
 580 /* FULL precision erf(), 256-bit wide. Only errors in LSB */
 581 static __m256
 582 gmx_mm256_erf_ps(__m256 x)
 583 {
 584     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 585     const __m256  CA6      = _mm256_set1_ps(7.853861353153693e-5f);
 586     const __m256  CA5      = _mm256_set1_ps(-8.010193625184903e-4f);
 587     const __m256  CA4      = _mm256_set1_ps(5.188327685732524e-3f);
 588     const __m256  CA3      = _mm256_set1_ps(-2.685381193529856e-2f);
 589     const __m256  CA2      = _mm256_set1_ps(1.128358514861418e-1f);
 590     const __m256  CA1      = _mm256_set1_ps(-3.761262582423300e-1f);
 591     const __m256  CA0      = _mm256_set1_ps(1.128379165726710f);
 592     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 593     const __m256  CB9      = _mm256_set1_ps(-0.0018629930017603923f);
 594     const __m256  CB8      = _mm256_set1_ps(0.003909821287598495f);
 595     const __m256  CB7      = _mm256_set1_ps(-0.0052094582210355615f);
 596     const __m256  CB6      = _mm256_set1_ps(0.005685614362160572f);
 597     const __m256  CB5      = _mm256_set1_ps(-0.0025367682853477272f);
 598     const __m256  CB4      = _mm256_set1_ps(-0.010199799682318782f);
 599     const __m256  CB3      = _mm256_set1_ps(0.04369575504816542f);
 600     const __m256  CB2      = _mm256_set1_ps(-0.11884063474674492f);
 601     const __m256  CB1      = _mm256_set1_ps(0.2732120154030589f);
 602     const __m256  CB0      = _mm256_set1_ps(0.42758357702025784f);
 603     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 604     const __m256  CC10     = _mm256_set1_ps(-0.0445555913112064f);
 605     const __m256  CC9      = _mm256_set1_ps(0.21376355144663348f);
 606     const __m256  CC8      = _mm256_set1_ps(-0.3473187200259257f);
 607     const __m256  CC7      = _mm256_set1_ps(0.016690861551248114f);
 608     const __m256  CC6      = _mm256_set1_ps(0.7560973182491192f);
 609     const __m256  CC5      = _mm256_set1_ps(-1.2137903600145787f);
 610     const __m256  CC4      = _mm256_set1_ps(0.8411872321232948f);
 611     const __m256  CC3      = _mm256_set1_ps(-0.08670413896296343f);
 612     const __m256  CC2      = _mm256_set1_ps(-0.27124782687240334f);
 613     const __m256  CC1      = _mm256_set1_ps(-0.0007502488047806069f);
 614     const __m256  CC0      = _mm256_set1_ps(0.5642114853803148f);
 615
 616     /* Coefficients for expansion of exp(x) in [0,0.1] */
 617     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
 618     const __m256  CD2      = _mm256_set1_ps(0.5000066608081202f);
 619     const __m256  CD3      = _mm256_set1_ps(0.1664795422874624f);
 620     const __m256  CD4      = _mm256_set1_ps(0.04379839977652482f);
 621
 622     const __m256  sieve    = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
 623     const __m256  signbit  = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
 624     const __m256  one      = _mm256_set1_ps(1.0f);
 625     const __m256  two      = _mm256_set1_ps(2.0f);
 626
 627     __m256        x2, x4, y;
 628     __m256        z, q, t, t2, w, w2;
 629     __m256        pA0, pA1, pB0, pB1, pC0, pC1;
 630     __m256        expmx2, corr;
 631     __m256        res_erf, res_erfc, res;
 632     __m256        mask;
 633
 634     /* Calculate erf() */
 635     x2     = _mm256_mul_ps(x, x);
 636     x4     = _mm256_mul_ps(x2, x2);
 637
 638     pA0  = _mm256_mul_ps(CA6, x4);
 639     pA1  = _mm256_mul_ps(CA5, x4);
 640     pA0  = _mm256_add_ps(pA0, CA4);
 641     pA1  = _mm256_add_ps(pA1, CA3);
 642     pA0  = _mm256_mul_ps(pA0, x4);
 643     pA1  = _mm256_mul_ps(pA1, x4);
 644     pA0  = _mm256_add_ps(pA0, CA2);
 645     pA1  = _mm256_add_ps(pA1, CA1);
 646     pA0  = _mm256_mul_ps(pA0, x4);
 647     pA1  = _mm256_mul_ps(pA1, x2);
 648     pA0  = _mm256_add_ps(pA0, pA1);
 649     pA0  = _mm256_add_ps(pA0, CA0);
 650
 651     res_erf = _mm256_mul_ps(x, pA0);
 652
 653     /* Calculate erfc */
 654
 655     y       = gmx_mm256_abs_ps(x);
 656     t       = gmx_mm256_inv_ps(y);
 657     w       = _mm256_sub_ps(t, one);
 658     t2      = _mm256_mul_ps(t, t);
 659     w2      = _mm256_mul_ps(w, w);
 660     /*
 661      * We cannot simply calculate exp(-x2) directly in single precision, since
 662      * that will lose a couple of bits of precision due to the multiplication.
 663      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
 664      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
 665      *
 666      * The only drawback with this is that it requires TWO separate exponential
 667      * evaluations, which would be horrible performance-wise. However, the argument
 668      * for the second exp() call is always small, so there we simply use a
 669      * low-order minimax expansion on [0,0.1].
 670      */
 671
 672     z       = _mm256_and_ps(y, sieve);
 673     q       = _mm256_mul_ps( _mm256_sub_ps(z, y), _mm256_add_ps(z, y) );
 674
 675     corr    = _mm256_mul_ps(CD4, q);
 676     corr    = _mm256_add_ps(corr, CD3);
 677     corr    = _mm256_mul_ps(corr, q);
 678     corr    = _mm256_add_ps(corr, CD2);
 679     corr    = _mm256_mul_ps(corr, q);
 680     corr    = _mm256_add_ps(corr, one);
 681     corr    = _mm256_mul_ps(corr, q);
 682     corr    = _mm256_add_ps(corr, one);
 683
 684     expmx2  = gmx_mm256_exp_ps( _mm256_or_ps( signbit, _mm256_mul_ps(z, z) ) );
 685     expmx2  = _mm256_mul_ps(expmx2, corr);
 686
 687     pB1  = _mm256_mul_ps(CB9, w2);
 688     pB0  = _mm256_mul_ps(CB8, w2);
 689     pB1  = _mm256_add_ps(pB1, CB7);
 690     pB0  = _mm256_add_ps(pB0, CB6);
 691     pB1  = _mm256_mul_ps(pB1, w2);
 692     pB0  = _mm256_mul_ps(pB0, w2);
 693     pB1  = _mm256_add_ps(pB1, CB5);
 694     pB0  = _mm256_add_ps(pB0, CB4);
 695     pB1  = _mm256_mul_ps(pB1, w2);
 696     pB0  = _mm256_mul_ps(pB0, w2);
 697     pB1  = _mm256_add_ps(pB1, CB3);
 698     pB0  = _mm256_add_ps(pB0, CB2);
 699     pB1  = _mm256_mul_ps(pB1, w2);
 700     pB0  = _mm256_mul_ps(pB0, w2);
 701     pB1  = _mm256_add_ps(pB1, CB1);
 702     pB1  = _mm256_mul_ps(pB1, w);
 703     pB0  = _mm256_add_ps(pB0, pB1);
 704     pB0  = _mm256_add_ps(pB0, CB0);
 705
 706     pC0  = _mm256_mul_ps(CC10, t2);
 707     pC1  = _mm256_mul_ps(CC9, t2);
 708     pC0  = _mm256_add_ps(pC0, CC8);
 709     pC1  = _mm256_add_ps(pC1, CC7);
 710     pC0  = _mm256_mul_ps(pC0, t2);
 711     pC1  = _mm256_mul_ps(pC1, t2);
 712     pC0  = _mm256_add_ps(pC0, CC6);
 713     pC1  = _mm256_add_ps(pC1, CC5);
 714     pC0  = _mm256_mul_ps(pC0, t2);
 715     pC1  = _mm256_mul_ps(pC1, t2);
 716     pC0  = _mm256_add_ps(pC0, CC4);
 717     pC1  = _mm256_add_ps(pC1, CC3);
 718     pC0  = _mm256_mul_ps(pC0, t2);
 719     pC1  = _mm256_mul_ps(pC1, t2);
 720     pC0  = _mm256_add_ps(pC0, CC2);
 721     pC1  = _mm256_add_ps(pC1, CC1);
 722     pC0  = _mm256_mul_ps(pC0, t2);
 723     pC1  = _mm256_mul_ps(pC1, t);
 724     pC0  = _mm256_add_ps(pC0, pC1);
 725     pC0  = _mm256_add_ps(pC0, CC0);
 726     pC0  = _mm256_mul_ps(pC0, t);
 727
 728     /* SELECT pB0 or pC0 for erfc() */
 729     mask     = _mm256_cmp_ps(two, y, _CMP_LT_OQ);
 730     res_erfc = _mm256_blendv_ps(pB0, pC0, mask);
 731     res_erfc = _mm256_mul_ps(res_erfc, expmx2);
 732
 733     /* erfc(x<0) = 2-erfc(|x|) */
 734     mask     = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
 735     res_erfc = _mm256_blendv_ps(res_erfc, _mm256_sub_ps(two, res_erfc), mask);
 736
 737     /* Select erf() or erfc() */
 738     mask = _mm256_cmp_ps(y, _mm256_set1_ps(0.75f), _CMP_LT_OQ);
 739     res  = _mm256_blendv_ps(_mm256_sub_ps(one, res_erfc), res_erf, mask);
 740
 741     return res;
 742 }
 743
 744
 745 /* erf(), 128 bit wide */
 746 static __m128
 747 gmx_mm_erf_ps(__m128 x)
 748 {
 749     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 750     const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
 751     const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
 752     const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
 753     const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
 754     const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
 755     const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
 756     const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
 757     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 758     const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
 759     const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
 760     const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
 761     const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
 762     const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
 763     const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
 764     const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
 765     const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
 766     const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
 767     const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
 768     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 769     const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
 770     const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
 771     const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
 772     const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
 773     const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
 774     const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
 775     const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
 776     const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
 777     const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
 778     const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
 779     const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
 780
 781     /* Coefficients for expansion of exp(x) in [0,0.1] */
 782     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
 783     const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
 784     const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
 785     const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
 786
 787     const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
 788     const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 789     const __m128  one      = _mm_set1_ps(1.0f);
 790     const __m128  two      = _mm_set1_ps(2.0f);
 791
 792     __m128        x2, x4, y;
 793     __m128        z, q, t, t2, w, w2;
 794     __m128        pA0, pA1, pB0, pB1, pC0, pC1;
 795     __m128        expmx2, corr;
 796     __m128        res_erf, res_erfc, res;
 797     __m128        mask;
 798
 799     /* Calculate erf() */
 800     x2     = _mm_mul_ps(x, x);
 801     x4     = _mm_mul_ps(x2, x2);
 802
 803     pA0  = _mm_mul_ps(CA6, x4);
 804     pA1  = _mm_mul_ps(CA5, x4);
 805     pA0  = _mm_add_ps(pA0, CA4);
 806     pA1  = _mm_add_ps(pA1, CA3);
 807     pA0  = _mm_mul_ps(pA0, x4);
 808     pA1  = _mm_mul_ps(pA1, x4);
 809     pA0  = _mm_add_ps(pA0, CA2);
 810     pA1  = _mm_add_ps(pA1, CA1);
 811     pA0  = _mm_mul_ps(pA0, x4);
 812     pA1  = _mm_mul_ps(pA1, x2);
 813     pA0  = _mm_add_ps(pA0, pA1);
 814     pA0  = _mm_add_ps(pA0, CA0);
 815
 816     res_erf = _mm_mul_ps(x, pA0);
 817
 818     /* Calculate erfc */
 819
 820     y       = gmx_mm_abs_ps(x);
 821     t       = gmx_mm_inv_ps(y);
 822     w       = _mm_sub_ps(t, one);
 823     t2      = _mm_mul_ps(t, t);
 824     w2      = _mm_mul_ps(w, w);
 825     /*
 826      * We cannot simply calculate exp(-x2) directly in single precision, since
 827      * that will lose a couple of bits of precision due to the multiplication.
 828      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
 829      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
 830      *
 831      * The only drawback with this is that it requires TWO separate exponential
 832      * evaluations, which would be horrible performance-wise. However, the argument
 833      * for the second exp() call is always small, so there we simply use a
 834      * low-order minimax expansion on [0,0.1].
 835      */
 836
 837     z       = _mm_and_ps(y, sieve);
 838     q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
 839
 840     corr    = _mm_mul_ps(CD4, q);
 841     corr    = _mm_add_ps(corr, CD3);
 842     corr    = _mm_mul_ps(corr, q);
 843     corr    = _mm_add_ps(corr, CD2);
 844     corr    = _mm_mul_ps(corr, q);
 845     corr    = _mm_add_ps(corr, one);
 846     corr    = _mm_mul_ps(corr, q);
 847     corr    = _mm_add_ps(corr, one);
 848
 849     expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
 850     expmx2  = _mm_mul_ps(expmx2, corr);
 851
 852     pB1  = _mm_mul_ps(CB9, w2);
 853     pB0  = _mm_mul_ps(CB8, w2);
 854     pB1  = _mm_add_ps(pB1, CB7);
 855     pB0  = _mm_add_ps(pB0, CB6);
 856     pB1  = _mm_mul_ps(pB1, w2);
 857     pB0  = _mm_mul_ps(pB0, w2);
 858     pB1  = _mm_add_ps(pB1, CB5);
 859     pB0  = _mm_add_ps(pB0, CB4);
 860     pB1  = _mm_mul_ps(pB1, w2);
 861     pB0  = _mm_mul_ps(pB0, w2);
 862     pB1  = _mm_add_ps(pB1, CB3);
 863     pB0  = _mm_add_ps(pB0, CB2);
 864     pB1  = _mm_mul_ps(pB1, w2);
 865     pB0  = _mm_mul_ps(pB0, w2);
 866     pB1  = _mm_add_ps(pB1, CB1);
 867     pB1  = _mm_mul_ps(pB1, w);
 868     pB0  = _mm_add_ps(pB0, pB1);
 869     pB0  = _mm_add_ps(pB0, CB0);
 870
 871     pC0  = _mm_mul_ps(CC10, t2);
 872     pC1  = _mm_mul_ps(CC9, t2);
 873     pC0  = _mm_add_ps(pC0, CC8);
 874     pC1  = _mm_add_ps(pC1, CC7);
 875     pC0  = _mm_mul_ps(pC0, t2);
 876     pC1  = _mm_mul_ps(pC1, t2);
 877     pC0  = _mm_add_ps(pC0, CC6);
 878     pC1  = _mm_add_ps(pC1, CC5);
 879     pC0  = _mm_mul_ps(pC0, t2);
 880     pC1  = _mm_mul_ps(pC1, t2);
 881     pC0  = _mm_add_ps(pC0, CC4);
 882     pC1  = _mm_add_ps(pC1, CC3);
 883     pC0  = _mm_mul_ps(pC0, t2);
 884     pC1  = _mm_mul_ps(pC1, t2);
 885     pC0  = _mm_add_ps(pC0, CC2);
 886     pC1  = _mm_add_ps(pC1, CC1);
 887     pC0  = _mm_mul_ps(pC0, t2);
 888     pC1  = _mm_mul_ps(pC1, t);
 889     pC0  = _mm_add_ps(pC0, pC1);
 890     pC0  = _mm_add_ps(pC0, CC0);
 891     pC0  = _mm_mul_ps(pC0, t);
 892
 893     /* SELECT pB0 or pC0 for erfc() */
 894     mask     = _mm_cmplt_ps(two, y);
 895     res_erfc = _mm_blendv_ps(pB0, pC0, mask);
 896     res_erfc = _mm_mul_ps(res_erfc, expmx2);
 897
 898     /* erfc(x<0) = 2-erfc(|x|) */
 899     mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
 900     res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
 901
 902     /* Select erf() or erfc() */
 903     mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
 904     res  = _mm_blendv_ps(_mm_sub_ps(one, res_erfc), res_erf, mask);
 905
 906     return res;
 907 }
 908
 909
 910
 911
 912 /* FULL precision erfc(), 256 bit wide. Only errors in LSB */
 913 static __m256
 914 gmx_mm256_erfc_ps(__m256 x)
 915 {
 916     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 917     const __m256  CA6      = _mm256_set1_ps(7.853861353153693e-5f);
 918     const __m256  CA5      = _mm256_set1_ps(-8.010193625184903e-4f);
 919     const __m256  CA4      = _mm256_set1_ps(5.188327685732524e-3f);
 920     const __m256  CA3      = _mm256_set1_ps(-2.685381193529856e-2f);
 921     const __m256  CA2      = _mm256_set1_ps(1.128358514861418e-1f);
 922     const __m256  CA1      = _mm256_set1_ps(-3.761262582423300e-1f);
 923     const __m256  CA0      = _mm256_set1_ps(1.128379165726710f);
 924     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 925     const __m256  CB9      = _mm256_set1_ps(-0.0018629930017603923f);
 926     const __m256  CB8      = _mm256_set1_ps(0.003909821287598495f);
 927     const __m256  CB7      = _mm256_set1_ps(-0.0052094582210355615f);
 928     const __m256  CB6      = _mm256_set1_ps(0.005685614362160572f);
 929     const __m256  CB5      = _mm256_set1_ps(-0.0025367682853477272f);
 930     const __m256  CB4      = _mm256_set1_ps(-0.010199799682318782f);
 931     const __m256  CB3      = _mm256_set1_ps(0.04369575504816542f);
 932     const __m256  CB2      = _mm256_set1_ps(-0.11884063474674492f);
 933     const __m256  CB1      = _mm256_set1_ps(0.2732120154030589f);
 934     const __m256  CB0      = _mm256_set1_ps(0.42758357702025784f);
 935     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 936     const __m256  CC10     = _mm256_set1_ps(-0.0445555913112064f);
 937     const __m256  CC9      = _mm256_set1_ps(0.21376355144663348f);
 938     const __m256  CC8      = _mm256_set1_ps(-0.3473187200259257f);
 939     const __m256  CC7      = _mm256_set1_ps(0.016690861551248114f);
 940     const __m256  CC6      = _mm256_set1_ps(0.7560973182491192f);
 941     const __m256  CC5      = _mm256_set1_ps(-1.2137903600145787f);
 942     const __m256  CC4      = _mm256_set1_ps(0.8411872321232948f);
 943     const __m256  CC3      = _mm256_set1_ps(-0.08670413896296343f);
 944     const __m256  CC2      = _mm256_set1_ps(-0.27124782687240334f);
 945     const __m256  CC1      = _mm256_set1_ps(-0.0007502488047806069f);
 946     const __m256  CC0      = _mm256_set1_ps(0.5642114853803148f);
 947
 948     /* Coefficients for expansion of exp(x) in [0,0.1] */
 949     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
 950     const __m256  CD2      = _mm256_set1_ps(0.5000066608081202f);
 951     const __m256  CD3      = _mm256_set1_ps(0.1664795422874624f);
 952     const __m256  CD4      = _mm256_set1_ps(0.04379839977652482f);
 953
 954     const __m256  sieve    = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
 955     const __m256  signbit  = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
 956     const __m256  one      = _mm256_set1_ps(1.0f);
 957     const __m256  two      = _mm256_set1_ps(2.0f);
 958
 959     __m256        x2, x4, y;
 960     __m256        z, q, t, t2, w, w2;
 961     __m256        pA0, pA1, pB0, pB1, pC0, pC1;
 962     __m256        expmx2, corr;
 963     __m256        res_erf, res_erfc, res;
 964     __m256        mask;
 965
 966     /* Calculate erf() */
 967     x2     = _mm256_mul_ps(x, x);
 968     x4     = _mm256_mul_ps(x2, x2);
 969
 970     pA0  = _mm256_mul_ps(CA6, x4);
 971     pA1  = _mm256_mul_ps(CA5, x4);
 972     pA0  = _mm256_add_ps(pA0, CA4);
 973     pA1  = _mm256_add_ps(pA1, CA3);
 974     pA0  = _mm256_mul_ps(pA0, x4);
 975     pA1  = _mm256_mul_ps(pA1, x4);
 976     pA0  = _mm256_add_ps(pA0, CA2);
 977     pA1  = _mm256_add_ps(pA1, CA1);
 978     pA0  = _mm256_mul_ps(pA0, x4);
 979     pA1  = _mm256_mul_ps(pA1, x2);
 980     pA0  = _mm256_add_ps(pA0, pA1);
 981     pA0  = _mm256_add_ps(pA0, CA0);
 982
 983     res_erf = _mm256_mul_ps(x, pA0);
 984
 985     /* Calculate erfc */
 986     y       = gmx_mm256_abs_ps(x);
 987     t       = gmx_mm256_inv_ps(y);
 988     w       = _mm256_sub_ps(t, one);
 989     t2      = _mm256_mul_ps(t, t);
 990     w2      = _mm256_mul_ps(w, w);
 991     /*
 992      * We cannot simply calculate exp(-x2) directly in single precision, since
 993      * that will lose a couple of bits of precision due to the multiplication.
 994      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
 995      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
 996      *
 997      * The only drawback with this is that it requires TWO separate exponential
 998      * evaluations, which would be horrible performance-wise. However, the argument
 999      * for the second exp() call is always small, so there we simply use a
1000      * low-order minimax expansion on [0,0.1].
1001      */
1002
1003     z       = _mm256_and_ps(y, sieve);
1004     q       = _mm256_mul_ps( _mm256_sub_ps(z, y), _mm256_add_ps(z, y) );
1005
1006     corr    = _mm256_mul_ps(CD4, q);
1007     corr    = _mm256_add_ps(corr, CD3);
1008     corr    = _mm256_mul_ps(corr, q);
1009     corr    = _mm256_add_ps(corr, CD2);
1010     corr    = _mm256_mul_ps(corr, q);
1011     corr    = _mm256_add_ps(corr, one);
1012     corr    = _mm256_mul_ps(corr, q);
1013     corr    = _mm256_add_ps(corr, one);
1014
1015     expmx2  = gmx_mm256_exp_ps( _mm256_or_ps( signbit, _mm256_mul_ps(z, z) ) );
1016     expmx2  = _mm256_mul_ps(expmx2, corr);
1017
1018     pB1  = _mm256_mul_ps(CB9, w2);
1019     pB0  = _mm256_mul_ps(CB8, w2);
1020     pB1  = _mm256_add_ps(pB1, CB7);
1021     pB0  = _mm256_add_ps(pB0, CB6);
1022     pB1  = _mm256_mul_ps(pB1, w2);
1023     pB0  = _mm256_mul_ps(pB0, w2);
1024     pB1  = _mm256_add_ps(pB1, CB5);
1025     pB0  = _mm256_add_ps(pB0, CB4);
1026     pB1  = _mm256_mul_ps(pB1, w2);
1027     pB0  = _mm256_mul_ps(pB0, w2);
1028     pB1  = _mm256_add_ps(pB1, CB3);
1029     pB0  = _mm256_add_ps(pB0, CB2);
1030     pB1  = _mm256_mul_ps(pB1, w2);
1031     pB0  = _mm256_mul_ps(pB0, w2);
1032     pB1  = _mm256_add_ps(pB1, CB1);
1033     pB1  = _mm256_mul_ps(pB1, w);
1034     pB0  = _mm256_add_ps(pB0, pB1);
1035     pB0  = _mm256_add_ps(pB0, CB0);
1036
1037     pC0  = _mm256_mul_ps(CC10, t2);
1038     pC1  = _mm256_mul_ps(CC9, t2);
1039     pC0  = _mm256_add_ps(pC0, CC8);
1040     pC1  = _mm256_add_ps(pC1, CC7);
1041     pC0  = _mm256_mul_ps(pC0, t2);
1042     pC1  = _mm256_mul_ps(pC1, t2);
1043     pC0  = _mm256_add_ps(pC0, CC6);
1044     pC1  = _mm256_add_ps(pC1, CC5);
1045     pC0  = _mm256_mul_ps(pC0, t2);
1046     pC1  = _mm256_mul_ps(pC1, t2);
1047     pC0  = _mm256_add_ps(pC0, CC4);
1048     pC1  = _mm256_add_ps(pC1, CC3);
1049     pC0  = _mm256_mul_ps(pC0, t2);
1050     pC1  = _mm256_mul_ps(pC1, t2);
1051     pC0  = _mm256_add_ps(pC0, CC2);
1052     pC1  = _mm256_add_ps(pC1, CC1);
1053     pC0  = _mm256_mul_ps(pC0, t2);
1054     pC1  = _mm256_mul_ps(pC1, t);
1055     pC0  = _mm256_add_ps(pC0, pC1);
1056     pC0  = _mm256_add_ps(pC0, CC0);
1057     pC0  = _mm256_mul_ps(pC0, t);
1058
1059     /* SELECT pB0 or pC0 for erfc() */
1060     mask     = _mm256_cmp_ps(two, y, _CMP_LT_OQ);
1061     res_erfc = _mm256_blendv_ps(pB0, pC0, mask);
1062     res_erfc = _mm256_mul_ps(res_erfc, expmx2);
1063
1064     /* erfc(x<0) = 2-erfc(|x|) */
1065     mask     = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
1066     res_erfc = _mm256_blendv_ps(res_erfc, _mm256_sub_ps(two, res_erfc), mask);
1067
1068     /* Select erf() or erfc() */
1069     mask = _mm256_cmp_ps(y, _mm256_set1_ps(0.75f), _CMP_LT_OQ);
1070     res  = _mm256_blendv_ps(res_erfc, _mm256_sub_ps(one, res_erf), mask);
1071
1072     return res;
1073 }
1074
1075
1076 /* erfc(), 128 bit wide */
1077 static __m128
1078 gmx_mm_erfc_ps(__m128 x)
1079 {
1080     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
1081     const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
1082     const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
1083     const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
1084     const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
1085     const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
1086     const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
1087     const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
1088     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
1089     const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
1090     const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
1091     const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
1092     const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
1093     const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
1094     const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
1095     const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
1096     const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
1097     const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
1098     const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
1099     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
1100     const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
1101     const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
1102     const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
1103     const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
1104     const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
1105     const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
1106     const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
1107     const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
1108     const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
1109     const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
1110     const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
1111
1112     /* Coefficients for expansion of exp(x) in [0,0.1] */
1113     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
1114     const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
1115     const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
1116     const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
1117
1118     const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
1119     const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1120     const __m128  one      = _mm_set1_ps(1.0f);
1121     const __m128  two      = _mm_set1_ps(2.0f);
1122
1123     __m128        x2, x4, y;
1124     __m128        z, q, t, t2, w, w2;
1125     __m128        pA0, pA1, pB0, pB1, pC0, pC1;
1126     __m128        expmx2, corr;
1127     __m128        res_erf, res_erfc, res;
1128     __m128        mask;
1129
1130     /* Calculate erf() */
1131     x2     = _mm_mul_ps(x, x);
1132     x4     = _mm_mul_ps(x2, x2);
1133
1134     pA0  = _mm_mul_ps(CA6, x4);
1135     pA1  = _mm_mul_ps(CA5, x4);
1136     pA0  = _mm_add_ps(pA0, CA4);
1137     pA1  = _mm_add_ps(pA1, CA3);
1138     pA0  = _mm_mul_ps(pA0, x4);
1139     pA1  = _mm_mul_ps(pA1, x4);
1140     pA0  = _mm_add_ps(pA0, CA2);
1141     pA1  = _mm_add_ps(pA1, CA1);
1142     pA0  = _mm_mul_ps(pA0, x4);
1143     pA1  = _mm_mul_ps(pA1, x2);
1144     pA0  = _mm_add_ps(pA0, pA1);
1145     pA0  = _mm_add_ps(pA0, CA0);
1146
1147     res_erf = _mm_mul_ps(x, pA0);
1148
1149     /* Calculate erfc */
1150     y       = gmx_mm_abs_ps(x);
1151     t       = gmx_mm_inv_ps(y);
1152     w       = _mm_sub_ps(t, one);
1153     t2      = _mm_mul_ps(t, t);
1154     w2      = _mm_mul_ps(w, w);
1155     /*
1156      * We cannot simply calculate exp(-x2) directly in single precision, since
1157      * that will lose a couple of bits of precision due to the multiplication.
1158      * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
1159      * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
1160      *
1161      * The only drawback with this is that it requires TWO separate exponential
1162      * evaluations, which would be horrible performance-wise. However, the argument
1163      * for the second exp() call is always small, so there we simply use a
1164      * low-order minimax expansion on [0,0.1].
1165      */
1166
1167     z       = _mm_and_ps(y, sieve);
1168     q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
1169
1170     corr    = _mm_mul_ps(CD4, q);
1171     corr    = _mm_add_ps(corr, CD3);
1172     corr    = _mm_mul_ps(corr, q);
1173     corr    = _mm_add_ps(corr, CD2);
1174     corr    = _mm_mul_ps(corr, q);
1175     corr    = _mm_add_ps(corr, one);
1176     corr    = _mm_mul_ps(corr, q);
1177     corr    = _mm_add_ps(corr, one);
1178
1179     expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
1180     expmx2  = _mm_mul_ps(expmx2, corr);
1181
1182     pB1  = _mm_mul_ps(CB9, w2);
1183     pB0  = _mm_mul_ps(CB8, w2);
1184     pB1  = _mm_add_ps(pB1, CB7);
1185     pB0  = _mm_add_ps(pB0, CB6);
1186     pB1  = _mm_mul_ps(pB1, w2);
1187     pB0  = _mm_mul_ps(pB0, w2);
1188     pB1  = _mm_add_ps(pB1, CB5);
1189     pB0  = _mm_add_ps(pB0, CB4);
1190     pB1  = _mm_mul_ps(pB1, w2);
1191     pB0  = _mm_mul_ps(pB0, w2);
1192     pB1  = _mm_add_ps(pB1, CB3);
1193     pB0  = _mm_add_ps(pB0, CB2);
1194     pB1  = _mm_mul_ps(pB1, w2);
1195     pB0  = _mm_mul_ps(pB0, w2);
1196     pB1  = _mm_add_ps(pB1, CB1);
1197     pB1  = _mm_mul_ps(pB1, w);
1198     pB0  = _mm_add_ps(pB0, pB1);
1199     pB0  = _mm_add_ps(pB0, CB0);
1200
1201     pC0  = _mm_mul_ps(CC10, t2);
1202     pC1  = _mm_mul_ps(CC9, t2);
1203     pC0  = _mm_add_ps(pC0, CC8);
1204     pC1  = _mm_add_ps(pC1, CC7);
1205     pC0  = _mm_mul_ps(pC0, t2);
1206     pC1  = _mm_mul_ps(pC1, t2);
1207     pC0  = _mm_add_ps(pC0, CC6);
1208     pC1  = _mm_add_ps(pC1, CC5);
1209     pC0  = _mm_mul_ps(pC0, t2);
1210     pC1  = _mm_mul_ps(pC1, t2);
1211     pC0  = _mm_add_ps(pC0, CC4);
1212     pC1  = _mm_add_ps(pC1, CC3);
1213     pC0  = _mm_mul_ps(pC0, t2);
1214     pC1  = _mm_mul_ps(pC1, t2);
1215     pC0  = _mm_add_ps(pC0, CC2);
1216     pC1  = _mm_add_ps(pC1, CC1);
1217     pC0  = _mm_mul_ps(pC0, t2);
1218     pC1  = _mm_mul_ps(pC1, t);
1219     pC0  = _mm_add_ps(pC0, pC1);
1220     pC0  = _mm_add_ps(pC0, CC0);
1221     pC0  = _mm_mul_ps(pC0, t);
1222
1223     /* SELECT pB0 or pC0 for erfc() */
1224     mask     = _mm_cmplt_ps(two, y);
1225     res_erfc = _mm_blendv_ps(pB0, pC0, mask);
1226     res_erfc = _mm_mul_ps(res_erfc, expmx2);
1227
1228     /* erfc(x<0) = 2-erfc(|x|) */
1229     mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
1230     res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
1231
1232     /* Select erf() or erfc() */
1233     mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
1234     res  = _mm_blendv_ps(res_erfc, _mm_sub_ps(one, res_erf), mask);
1235
1236     return res;
1237 }
1238
1239
1240
1241 /* Calculate the force correction due to PME analytically.
1242  *
1243  * This routine is meant to enable analytical evaluation of the
1244  * direct-space PME electrostatic force to avoid tables.
1245  *
1246  * The direct-space potential should be Erfc(beta*r)/r, but there
1247  * are some problems evaluating that:
1248  *
1249  * First, the error function is difficult (read: expensive) to
1250  * approxmiate accurately for intermediate to large arguments, and
1251  * this happens already in ranges of beta*r that occur in simulations.
1252  * Second, we now try to avoid calculating potentials in Gromacs but
1253  * use forces directly.
1254  *
1255  * We can simply things slight by noting that the PME part is really
1256  * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
1257  *
1258  * V= 1/r - Erf(beta*r)/r
1259  *
1260  * The first term we already have from the inverse square root, so
1261  * that we can leave out of this routine.
1262  *
1263  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1264  * the argument beta*r will be in the range 0.15 to ~4. Use your
1265  * favorite plotting program to realize how well-behaved Erf(z)/z is
1266  * in this range!
1267  *
1268  * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
1269  * However, it turns out it is more efficient to approximate f(z)/z and
1270  * then only use even powers. This is another minor optimization, since
1271  * we actually WANT f(z)/z, because it is going to be multiplied by
1272  * the vector between the two atoms to get the vectorial force. The
1273  * fastest flops are the ones we can avoid calculating!
1274  *
1275  * So, here's how it should be used:
1276  *
1277  * 1. Calculate r^2.
1278  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
1279  * 3. Evaluate this routine with z^2 as the argument.
1280  * 4. The return value is the expression:
1281  *
1282  *
1283  *       2*exp(-z^2)     erf(z)
1284  *       ------------ - --------
1285  *       sqrt(Pi)*z^2      z^3
1286  *
1287  * 5. Multiply the entire expression by beta^3. This will get you
1288  *
1289  *       beta^3*2*exp(-z^2)     beta^3*erf(z)
1290  *       ------------------  - ---------------
1291  *          sqrt(Pi)*z^2            z^3
1292  *
1293  *    or, switching back to r (z=r*beta):
1294  *
1295  *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
1296  *       ----------------------- - -----------
1297  *            sqrt(Pi)*r^2            r^3
1298  *
1299  *
1300  *    With a bit of math exercise you should be able to confirm that
1301  *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
1302  *
1303  * 6. Add the result to 1/r^3, multiply by the product of the charges,
1304  *    and you have your force (divided by r). A final multiplication
1305  *    with the vector connecting the two particles and you have your
1306  *    vectorial force to add to the particles.
1307  *
1308  */
1309 static __m256
1310 gmx_mm256_pmecorrF_ps(__m256 z2)
1311 {
1312     const __m256  FN6      = _mm256_set1_ps(-1.7357322914161492954e-8f);
1313     const __m256  FN5      = _mm256_set1_ps(1.4703624142580877519e-6f);
1314     const __m256  FN4      = _mm256_set1_ps(-0.000053401640219807709149f);
1315     const __m256  FN3      = _mm256_set1_ps(0.0010054721316683106153f);
1316     const __m256  FN2      = _mm256_set1_ps(-0.019278317264888380590f);
1317     const __m256  FN1      = _mm256_set1_ps(0.069670166153766424023f);
1318     const __m256  FN0      = _mm256_set1_ps(-0.75225204789749321333f);
1319
1320     const __m256  FD4      = _mm256_set1_ps(0.0011193462567257629232f);
1321     const __m256  FD3      = _mm256_set1_ps(0.014866955030185295499f);
1322     const __m256  FD2      = _mm256_set1_ps(0.11583842382862377919f);
1323     const __m256  FD1      = _mm256_set1_ps(0.50736591960530292870f);
1324     const __m256  FD0      = _mm256_set1_ps(1.0f);
1325
1326     __m256        z4;
1327     __m256        polyFN0, polyFN1, polyFD0, polyFD1;
1328
1329     z4             = _mm256_mul_ps(z2, z2);
1330
1331     polyFD0        = _mm256_mul_ps(FD4, z4);
1332     polyFD1        = _mm256_mul_ps(FD3, z4);
1333     polyFD0        = _mm256_add_ps(polyFD0, FD2);
1334     polyFD1        = _mm256_add_ps(polyFD1, FD1);
1335     polyFD0        = _mm256_mul_ps(polyFD0, z4);
1336     polyFD1        = _mm256_mul_ps(polyFD1, z2);
1337     polyFD0        = _mm256_add_ps(polyFD0, FD0);
1338     polyFD0        = _mm256_add_ps(polyFD0, polyFD1);
1339
1340     polyFD0        = gmx_mm256_inv_ps(polyFD0);
1341
1342     polyFN0        = _mm256_mul_ps(FN6, z4);
1343     polyFN1        = _mm256_mul_ps(FN5, z4);
1344     polyFN0        = _mm256_add_ps(polyFN0, FN4);
1345     polyFN1        = _mm256_add_ps(polyFN1, FN3);
1346     polyFN0        = _mm256_mul_ps(polyFN0, z4);
1347     polyFN1        = _mm256_mul_ps(polyFN1, z4);
1348     polyFN0        = _mm256_add_ps(polyFN0, FN2);
1349     polyFN1        = _mm256_add_ps(polyFN1, FN1);
1350     polyFN0        = _mm256_mul_ps(polyFN0, z4);
1351     polyFN1        = _mm256_mul_ps(polyFN1, z2);
1352     polyFN0        = _mm256_add_ps(polyFN0, FN0);
1353     polyFN0        = _mm256_add_ps(polyFN0, polyFN1);
1354
1355     return _mm256_mul_ps(polyFN0, polyFD0);
1356 }
1357
1358
1359 static __m128
1360 gmx_mm_pmecorrF_ps(__m128 z2)
1361 {
1362     const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
1363     const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
1364     const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
1365     const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
1366     const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
1367     const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
1368     const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);
1369
1370     const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
1371     const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
1372     const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
1373     const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
1374     const __m128  FD0      = _mm_set1_ps(1.0f);
1375
1376     __m128        z4;
1377     __m128        polyFN0, polyFN1, polyFD0, polyFD1;
1378
1379     z4             = _mm_mul_ps(z2, z2);
1380
1381     polyFD0        = _mm_mul_ps(FD4, z4);
1382     polyFD1        = _mm_mul_ps(FD3, z4);
1383     polyFD0        = _mm_add_ps(polyFD0, FD2);
1384     polyFD1        = _mm_add_ps(polyFD1, FD1);
1385     polyFD0        = _mm_mul_ps(polyFD0, z4);
1386     polyFD1        = _mm_mul_ps(polyFD1, z2);
1387     polyFD0        = _mm_add_ps(polyFD0, FD0);
1388     polyFD0        = _mm_add_ps(polyFD0, polyFD1);
1389
1390     polyFD0        = gmx_mm_inv_ps(polyFD0);
1391
1392     polyFN0        = _mm_mul_ps(FN6, z4);
1393     polyFN1        = _mm_mul_ps(FN5, z4);
1394     polyFN0        = _mm_add_ps(polyFN0, FN4);
1395     polyFN1        = _mm_add_ps(polyFN1, FN3);
1396     polyFN0        = _mm_mul_ps(polyFN0, z4);
1397     polyFN1        = _mm_mul_ps(polyFN1, z4);
1398     polyFN0        = _mm_add_ps(polyFN0, FN2);
1399     polyFN1        = _mm_add_ps(polyFN1, FN1);
1400     polyFN0        = _mm_mul_ps(polyFN0, z4);
1401     polyFN1        = _mm_mul_ps(polyFN1, z2);
1402     polyFN0        = _mm_add_ps(polyFN0, FN0);
1403     polyFN0        = _mm_add_ps(polyFN0, polyFN1);
1404
1405     return _mm_mul_ps(polyFN0, polyFD0);
1406 }
1407
1408
1409
1410 /* Calculate the potential correction due to PME analytically.
1411  *
1412  * See gmx_mm256_pmecorrF_ps() for details about the approximation.
1413  *
1414  * This routine calculates Erf(z)/z, although you should provide z^2
1415  * as the input argument.
1416  *
1417  * Here's how it should be used:
1418  *
1419  * 1. Calculate r^2.
1420  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
1421  * 3. Evaluate this routine with z^2 as the argument.
1422  * 4. The return value is the expression:
1423  *
1424  *
1425  *        erf(z)
1426  *       --------
1427  *          z
1428  *
1429  * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
1430  *
1431  *       erf(r*beta)
1432  *       -----------
1433  *           r
1434  *
1435  * 6. Subtract the result from 1/r, multiply by the product of the charges,
1436  *    and you have your potential.
1437  */
1438 static __m256
1439 gmx_mm256_pmecorrV_ps(__m256 z2)
1440 {
1441     const __m256  VN6      = _mm256_set1_ps(1.9296833005951166339e-8f);
1442     const __m256  VN5      = _mm256_set1_ps(-1.4213390571557850962e-6f);
1443     const __m256  VN4      = _mm256_set1_ps(0.000041603292906656984871f);
1444     const __m256  VN3      = _mm256_set1_ps(-0.00013134036773265025626f);
1445     const __m256  VN2      = _mm256_set1_ps(0.038657983986041781264f);
1446     const __m256  VN1      = _mm256_set1_ps(0.11285044772717598220f);
1447     const __m256  VN0      = _mm256_set1_ps(1.1283802385263030286f);
1448
1449     const __m256  VD3      = _mm256_set1_ps(0.0066752224023576045451f);
1450     const __m256  VD2      = _mm256_set1_ps(0.078647795836373922256f);
1451     const __m256  VD1      = _mm256_set1_ps(0.43336185284710920150f);
1452     const __m256  VD0      = _mm256_set1_ps(1.0f);
1453
1454     __m256        z4;
1455     __m256        polyVN0, polyVN1, polyVD0, polyVD1;
1456
1457     z4             = _mm256_mul_ps(z2, z2);
1458
1459     polyVD1        = _mm256_mul_ps(VD3, z4);
1460     polyVD0        = _mm256_mul_ps(VD2, z4);
1461     polyVD1        = _mm256_add_ps(polyVD1, VD1);
1462     polyVD0        = _mm256_add_ps(polyVD0, VD0);
1463     polyVD1        = _mm256_mul_ps(polyVD1, z2);
1464     polyVD0        = _mm256_add_ps(polyVD0, polyVD1);
1465
1466     polyVD0        = gmx_mm256_inv_ps(polyVD0);
1467
1468     polyVN0        = _mm256_mul_ps(VN6, z4);
1469     polyVN1        = _mm256_mul_ps(VN5, z4);
1470     polyVN0        = _mm256_add_ps(polyVN0, VN4);
1471     polyVN1        = _mm256_add_ps(polyVN1, VN3);
1472     polyVN0        = _mm256_mul_ps(polyVN0, z4);
1473     polyVN1        = _mm256_mul_ps(polyVN1, z4);
1474     polyVN0        = _mm256_add_ps(polyVN0, VN2);
1475     polyVN1        = _mm256_add_ps(polyVN1, VN1);
1476     polyVN0        = _mm256_mul_ps(polyVN0, z4);
1477     polyVN1        = _mm256_mul_ps(polyVN1, z2);
1478     polyVN0        = _mm256_add_ps(polyVN0, VN0);
1479     polyVN0        = _mm256_add_ps(polyVN0, polyVN1);
1480
1481     return _mm256_mul_ps(polyVN0, polyVD0);
1482 }
1483
1484
1485 static __m128
1486 gmx_mm_pmecorrV_ps(__m128 z2)
1487 {
1488     const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
1489     const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
1490     const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
1491     const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
1492     const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
1493     const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
1494     const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);
1495
1496     const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
1497     const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
1498     const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
1499     const __m128  VD0      = _mm_set1_ps(1.0f);
1500
1501     __m128        z4;
1502     __m128        polyVN0, polyVN1, polyVD0, polyVD1;
1503
1504     z4             = _mm_mul_ps(z2, z2);
1505
1506     polyVD1        = _mm_mul_ps(VD3, z4);
1507     polyVD0        = _mm_mul_ps(VD2, z4);
1508     polyVD1        = _mm_add_ps(polyVD1, VD1);
1509     polyVD0        = _mm_add_ps(polyVD0, VD0);
1510     polyVD1        = _mm_mul_ps(polyVD1, z2);
1511     polyVD0        = _mm_add_ps(polyVD0, polyVD1);
1512
1513     polyVD0        = gmx_mm_inv_ps(polyVD0);
1514
1515     polyVN0        = _mm_mul_ps(VN6, z4);
1516     polyVN1        = _mm_mul_ps(VN5, z4);
1517     polyVN0        = _mm_add_ps(polyVN0, VN4);
1518     polyVN1        = _mm_add_ps(polyVN1, VN3);
1519     polyVN0        = _mm_mul_ps(polyVN0, z4);
1520     polyVN1        = _mm_mul_ps(polyVN1, z4);
1521     polyVN0        = _mm_add_ps(polyVN0, VN2);
1522     polyVN1        = _mm_add_ps(polyVN1, VN1);
1523     polyVN0        = _mm_mul_ps(polyVN0, z4);
1524     polyVN1        = _mm_mul_ps(polyVN1, z2);
1525     polyVN0        = _mm_add_ps(polyVN0, VN0);
1526     polyVN0        = _mm_add_ps(polyVN0, polyVN1);
1527
1528     return _mm_mul_ps(polyVN0, polyVD0);
1529 }
1530
1531
1532 static int
1533 gmx_mm256_sincos_ps(__m256  x,
1534                     __m256 *sinval,
1535                     __m256 *cosval)
1536 {
1537     const __m256  two_over_pi = _mm256_set1_ps(2.0f/(float)M_PI);
1538     const __m256  half        = _mm256_set1_ps(0.5f);
1539     const __m256  one         = _mm256_set1_ps(1.0f);
1540     const __m256  zero        = _mm256_setzero_ps();
1541
1542     const __m128i ione       = _mm_set1_epi32(1);
1543
1544     const __m256  mask_one    = _mm256_castsi256_ps(_mm256_set1_epi32(1));
1545     const __m256  mask_two    = _mm256_castsi256_ps(_mm256_set1_epi32(2));
1546     const __m256  mask_three  = _mm256_castsi256_ps(_mm256_set1_epi32(3));
1547
1548     const __m256  CA1         = _mm256_set1_ps(1.5703125f);
1549     const __m256  CA2         = _mm256_set1_ps(4.837512969970703125e-4f);
1550     const __m256  CA3         = _mm256_set1_ps(7.54978995489188216e-8f);
1551
1552     const __m256  CC0         = _mm256_set1_ps(-0.0013602249f);
1553     const __m256  CC1         = _mm256_set1_ps(0.0416566950f);
1554     const __m256  CC2         = _mm256_set1_ps(-0.4999990225f);
1555     const __m256  CS0         = _mm256_set1_ps(-0.0001950727f);
1556     const __m256  CS1         = _mm256_set1_ps(0.0083320758f);
1557     const __m256  CS2         = _mm256_set1_ps(-0.1666665247f);
1558
1559     const __m256  signbit    = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1560
1561     __m256        y, y2;
1562     __m256        z;
1563     __m256i       iz;
1564     __m128i       iz_high, iz_low;
1565     __m256        offset_sin, offset_cos;
1566     __m256        mask_sin, mask_cos;
1567     __m256        tmp1, tmp2;
1568     __m256        tmp_sin, tmp_cos;
1569
1570     y               = _mm256_mul_ps(x, two_over_pi);
1571     y               = _mm256_add_ps(y, _mm256_or_ps(_mm256_and_ps(y, signbit), half));
1572
1573     iz              = _mm256_cvttps_epi32(y);
1574     z               = _mm256_round_ps(y, _MM_FROUND_TO_ZERO);
1575
1576     offset_sin      = _mm256_and_ps(_mm256_castsi256_ps(iz), mask_three);
1577
1578     iz_high         = _mm256_extractf128_si256(iz, 0x1);
1579     iz_low          = _mm256_castsi256_si128(iz);
1580     iz_low          = _mm_add_epi32(iz_low, ione);
1581     iz_high         = _mm_add_epi32(iz_high, ione);
1582     iz              = _mm256_castsi128_si256(iz_low);
1583     iz              = _mm256_insertf128_si256(iz, iz_high, 0x1);
1584     offset_cos      = _mm256_castsi256_ps(iz);
1585
1586     /* Extended precision arithmethic to achieve full precision */
1587     y               = _mm256_mul_ps(z, CA1);
1588     tmp1            = _mm256_mul_ps(z, CA2);
1589     tmp2            = _mm256_mul_ps(z, CA3);
1590     y               = _mm256_sub_ps(x, y);
1591     y               = _mm256_sub_ps(y, tmp1);
1592     y               = _mm256_sub_ps(y, tmp2);
1593
1594     y2              = _mm256_mul_ps(y, y);
1595
1596     tmp1            = _mm256_mul_ps(CC0, y2);
1597     tmp1            = _mm256_add_ps(tmp1, CC1);
1598     tmp2            = _mm256_mul_ps(CS0, y2);
1599     tmp2            = _mm256_add_ps(tmp2, CS1);
1600     tmp1            = _mm256_mul_ps(tmp1, y2);
1601     tmp1            = _mm256_add_ps(tmp1, CC2);
1602     tmp2            = _mm256_mul_ps(tmp2, y2);
1603     tmp2            = _mm256_add_ps(tmp2, CS2);
1604
1605     tmp1            = _mm256_mul_ps(tmp1, y2);
1606     tmp1            = _mm256_add_ps(tmp1, one);
1607
1608     tmp2            = _mm256_mul_ps(tmp2, _mm256_mul_ps(y, y2));
1609     tmp2            = _mm256_add_ps(tmp2, y);
1610
1611 #ifdef __INTEL_COMPILER
1612     /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
1613     mask_sin        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin, mask_one))), zero, _CMP_EQ_OQ);
1614     mask_cos        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos, mask_one))), zero, _CMP_EQ_OQ);
1615 #else
1616     mask_sin        = _mm256_cmp_ps( _mm256_and_ps(offset_sin, mask_one), zero, _CMP_EQ_OQ);
1617     mask_cos        = _mm256_cmp_ps( _mm256_and_ps(offset_cos, mask_one), zero, _CMP_EQ_OQ);
1618 #endif
1619     tmp_sin         = _mm256_blendv_ps(tmp1, tmp2, mask_sin);
1620     tmp_cos         = _mm256_blendv_ps(tmp1, tmp2, mask_cos);
1621
1622     tmp1            = _mm256_xor_ps(signbit, tmp_sin);
1623     tmp2            = _mm256_xor_ps(signbit, tmp_cos);
1624
1625 #ifdef __INTEL_COMPILER
1626     /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
1627     mask_sin        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin, mask_two))), zero, _CMP_EQ_OQ);
1628     mask_cos        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos, mask_two))), zero, _CMP_EQ_OQ);
1629 #else
1630     mask_sin        = _mm256_cmp_ps( _mm256_and_ps(offset_sin, mask_two), zero, _CMP_EQ_OQ);
1631     mask_cos        = _mm256_cmp_ps( _mm256_and_ps(offset_cos, mask_two), zero, _CMP_EQ_OQ);
1632
1633 #endif
1634     *sinval         = _mm256_blendv_ps(tmp1, tmp_sin, mask_sin);
1635     *cosval         = _mm256_blendv_ps(tmp2, tmp_cos, mask_cos);
1636
1637     return 0;
1638 }
1639
1640 static int
1641 gmx_mm_sincos_ps(__m128  x,
1642                  __m128 *sinval,
1643                  __m128 *cosval)
1644 {
1645     const __m128  two_over_pi = _mm_set1_ps(2.0/M_PI);
1646     const __m128  half        = _mm_set1_ps(0.5);
1647     const __m128  one         = _mm_set1_ps(1.0);
1648
1649     const __m128i izero      = _mm_set1_epi32(0);
1650     const __m128i ione       = _mm_set1_epi32(1);
1651     const __m128i itwo       = _mm_set1_epi32(2);
1652     const __m128i ithree     = _mm_set1_epi32(3);
1653     const __m128  signbit    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1654
1655     const __m128  CA1         = _mm_set1_ps(1.5703125f);
1656     const __m128  CA2         = _mm_set1_ps(4.837512969970703125e-4f);
1657     const __m128  CA3         = _mm_set1_ps(7.54978995489188216e-8f);
1658
1659     const __m128  CC0         = _mm_set1_ps(-0.0013602249f);
1660     const __m128  CC1         = _mm_set1_ps(0.0416566950f);
1661     const __m128  CC2         = _mm_set1_ps(-0.4999990225f);
1662     const __m128  CS0         = _mm_set1_ps(-0.0001950727f);
1663     const __m128  CS1         = _mm_set1_ps(0.0083320758f);
1664     const __m128  CS2         = _mm_set1_ps(-0.1666665247f);
1665
1666     __m128        y, y2;
1667     __m128        z;
1668     __m128i       iz;
1669     __m128i       offset_sin, offset_cos;
1670     __m128        tmp1, tmp2;
1671     __m128        mask_sin, mask_cos;
1672     __m128        tmp_sin, tmp_cos;
1673
1674     y          = _mm_mul_ps(x, two_over_pi);
1675     y          = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));
1676
1677     iz         = _mm_cvttps_epi32(y);
1678     z          = _mm_round_ps(y, _MM_FROUND_TO_ZERO);
1679
1680     offset_sin = _mm_and_si128(iz, ithree);
1681     offset_cos = _mm_add_epi32(iz, ione);
1682
1683     /* Extended precision arithmethic to achieve full precision */
1684     y               = _mm_mul_ps(z, CA1);
1685     tmp1            = _mm_mul_ps(z, CA2);
1686     tmp2            = _mm_mul_ps(z, CA3);
1687     y               = _mm_sub_ps(x, y);
1688     y               = _mm_sub_ps(y, tmp1);
1689     y               = _mm_sub_ps(y, tmp2);
1690
1691     y2              = _mm_mul_ps(y, y);
1692
1693     tmp1            = _mm_mul_ps(CC0, y2);
1694     tmp1            = _mm_add_ps(tmp1, CC1);
1695     tmp2            = _mm_mul_ps(CS0, y2);
1696     tmp2            = _mm_add_ps(tmp2, CS1);
1697     tmp1            = _mm_mul_ps(tmp1, y2);
1698     tmp1            = _mm_add_ps(tmp1, CC2);
1699     tmp2            = _mm_mul_ps(tmp2, y2);
1700     tmp2            = _mm_add_ps(tmp2, CS2);
1701
1702     tmp1            = _mm_mul_ps(tmp1, y2);
1703     tmp1            = _mm_add_ps(tmp1, one);
1704
1705     tmp2            = _mm_mul_ps(tmp2, _mm_mul_ps(y, y2));
1706     tmp2            = _mm_add_ps(tmp2, y);
1707
1708     mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
1709     mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));
1710
1711     tmp_sin         = _mm_blendv_ps(tmp1, tmp2, mask_sin);
1712     tmp_cos         = _mm_blendv_ps(tmp1, tmp2, mask_cos);
1713
1714     mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
1715     mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));
1716
1717     tmp1            = _mm_xor_ps(signbit, tmp_sin);
1718     tmp2            = _mm_xor_ps(signbit, tmp_cos);
1719
1720     *sinval         = _mm_blendv_ps(tmp1, tmp_sin, mask_sin);
1721     *cosval         = _mm_blendv_ps(tmp2, tmp_cos, mask_cos);
1722
1723     return 0;
1724 }
1725
1726
1727
1728
1729 /*
1730  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
1731  * will then call the sincos() routine and waste a factor 2 in performance!
1732  */
1733 static __m256
1734 gmx_mm256_sin_ps(__m256 x)
1735 {
1736     __m256 s, c;
1737     gmx_mm256_sincos_ps(x, &s, &c);
1738     return s;
1739 }
1740
1741 static __m128
1742 gmx_mm_sin_ps(__m128 x)
1743 {
1744     __m128 s, c;
1745     gmx_mm_sincos_ps(x, &s, &c);
1746     return s;
1747 }
1748
1749
1750 /*
1751  * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
1752  * will then call the sincos() routine and waste a factor 2 in performance!
1753  */
1754 static __m256
1755 gmx_mm256_cos_ps(__m256 x)
1756 {
1757     __m256 s, c;
1758     gmx_mm256_sincos_ps(x, &s, &c);
1759     return c;
1760 }
1761
1762 static __m128
1763 gmx_mm_cos_ps(__m128 x)
1764 {
1765     __m128 s, c;
1766     gmx_mm_sincos_ps(x, &s, &c);
1767     return c;
1768 }
1769
1770
1771 static __m256
1772 gmx_mm256_tan_ps(__m256 x)
1773 {
1774     __m256 sinval, cosval;
1775     __m256 tanval;
1776
1777     gmx_mm256_sincos_ps(x, &sinval, &cosval);
1778
1779     tanval = _mm256_mul_ps(sinval, gmx_mm256_inv_ps(cosval));
1780
1781     return tanval;
1782 }
1783
1784 static __m128
1785 gmx_mm_tan_ps(__m128 x)
1786 {
1787     __m128 sinval, cosval;
1788     __m128 tanval;
1789
1790     gmx_mm_sincos_ps(x, &sinval, &cosval);
1791
1792     tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval));
1793
1794     return tanval;
1795 }
1796
1797
1798 static __m256
1799 gmx_mm256_asin_ps(__m256 x)
1800 {
1801     const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
1802     const __m256 limitlow  = _mm256_set1_ps(1e-4f);
1803     const __m256 half      = _mm256_set1_ps(0.5f);
1804     const __m256 one       = _mm256_set1_ps(1.0f);
1805     const __m256 halfpi    = _mm256_set1_ps((float)M_PI/2.0f);
1806
1807     const __m256 CC5        = _mm256_set1_ps(4.2163199048E-2f);
1808     const __m256 CC4        = _mm256_set1_ps(2.4181311049E-2f);
1809     const __m256 CC3        = _mm256_set1_ps(4.5470025998E-2f);
1810     const __m256 CC2        = _mm256_set1_ps(7.4953002686E-2f);
1811     const __m256 CC1        = _mm256_set1_ps(1.6666752422E-1f);
1812
1813     __m256       sign;
1814     __m256       mask;
1815     __m256       xabs;
1816     __m256       z, z1, z2, q, q1, q2;
1817     __m256       pA, pB;
1818
1819     sign  = _mm256_andnot_ps(signmask, x);
1820     xabs  = _mm256_and_ps(x, signmask);
1821
1822     mask  = _mm256_cmp_ps(xabs, half, _CMP_GT_OQ);
1823
1824     z1    = _mm256_mul_ps(half, _mm256_sub_ps(one, xabs));
1825     q1    = _mm256_mul_ps(z1, gmx_mm256_invsqrt_ps(z1));
1826     q1    = _mm256_andnot_ps(_mm256_cmp_ps(xabs, one, _CMP_EQ_OQ), q1);
1827
1828     q2    = xabs;
1829     z2    = _mm256_mul_ps(q2, q2);
1830
1831     z     = _mm256_blendv_ps(z2, z1, mask);
1832     q     = _mm256_blendv_ps(q2, q1, mask);
1833
1834     z2    = _mm256_mul_ps(z, z);
1835
1836     pA    = _mm256_mul_ps(CC5, z2);
1837     pB    = _mm256_mul_ps(CC4, z2);
1838
1839     pA    = _mm256_add_ps(pA, CC3);
1840     pB    = _mm256_add_ps(pB, CC2);
1841
1842     pA    = _mm256_mul_ps(pA, z2);
1843     pB    = _mm256_mul_ps(pB, z2);
1844
1845     pA    = _mm256_add_ps(pA, CC1);
1846     pA    = _mm256_mul_ps(pA, z);
1847
1848     z     = _mm256_add_ps(pA, pB);
1849     z     = _mm256_mul_ps(z, q);
1850     z     = _mm256_add_ps(z, q);
1851
1852     q2    = _mm256_sub_ps(halfpi, z);
1853     q2    = _mm256_sub_ps(q2, z);
1854
1855     z     = _mm256_blendv_ps(z, q2, mask);
1856
1857     mask  = _mm256_cmp_ps(xabs, limitlow, _CMP_GT_OQ);
1858     z     = _mm256_blendv_ps(xabs, z, mask);
1859
1860     z     = _mm256_xor_ps(z, sign);
1861
1862     return z;
1863 }
1864
1865 static __m128
1866 gmx_mm_asin_ps(__m128 x)
1867 {
1868     /* Same algorithm as cephes library */
1869     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
1870     const __m128 limitlow  = _mm_set1_ps(1e-4f);
1871     const __m128 half      = _mm_set1_ps(0.5f);
1872     const __m128 one       = _mm_set1_ps(1.0f);
1873     const __m128 halfpi    = _mm_set1_ps(M_PI/2.0f);
1874
1875     const __m128 CC5        = _mm_set1_ps(4.2163199048E-2f);
1876     const __m128 CC4        = _mm_set1_ps(2.4181311049E-2f);
1877     const __m128 CC3        = _mm_set1_ps(4.5470025998E-2f);
1878     const __m128 CC2        = _mm_set1_ps(7.4953002686E-2f);
1879     const __m128 CC1        = _mm_set1_ps(1.6666752422E-1f);
1880
1881     __m128       sign;
1882     __m128       mask;
1883     __m128       xabs;
1884     __m128       z, z1, z2, q, q1, q2;
1885     __m128       pA, pB;
1886
1887     sign  = _mm_andnot_ps(signmask, x);
1888     xabs  = _mm_and_ps(x, signmask);
1889
1890     mask  = _mm_cmpgt_ps(xabs, half);
1891
1892     z1    = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
1893     q1    = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
1894     q1    = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one), q1);
1895
1896     q2    = xabs;
1897     z2    = _mm_mul_ps(q2, q2);
1898
1899     z     = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
1900     q     = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );
1901
1902     z2    = _mm_mul_ps(z, z);
1903
1904     pA    = _mm_mul_ps(CC5, z2);
1905     pB    = _mm_mul_ps(CC4, z2);
1906
1907     pA    = _mm_add_ps(pA, CC3);
1908     pB    = _mm_add_ps(pB, CC2);
1909
1910     pA    = _mm_mul_ps(pA, z2);
1911     pB    = _mm_mul_ps(pB, z2);
1912
1913     pA    = _mm_add_ps(pA, CC1);
1914     pA    = _mm_mul_ps(pA, z);
1915
1916     z     = _mm_add_ps(pA, pB);
1917     z     = _mm_mul_ps(z, q);
1918     z     = _mm_add_ps(z, q);
1919
1920     q2    = _mm_sub_ps(halfpi, z);
1921     q2    = _mm_sub_ps(q2, z);
1922
1923     z     = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );
1924
1925     mask  = _mm_cmpgt_ps(xabs, limitlow);
1926     z     = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );
1927
1928     z = _mm_xor_ps(z, sign);
1929
1930     return z;
1931 }
1932
1933
1934 static __m256
1935 gmx_mm256_acos_ps(__m256 x)
1936 {
1937     const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
1938     const __m256 one_ps    = _mm256_set1_ps(1.0f);
1939     const __m256 half_ps   = _mm256_set1_ps(0.5f);
1940     const __m256 pi_ps     = _mm256_set1_ps((float)M_PI);
1941     const __m256 halfpi_ps = _mm256_set1_ps((float)M_PI/2.0f);
1942
1943     __m256       mask1;
1944     __m256       mask2;
1945     __m256       xabs;
1946     __m256       z, z1, z2, z3;
1947
1948     xabs  = _mm256_and_ps(x, signmask);
1949     mask1 = _mm256_cmp_ps(xabs, half_ps, _CMP_GT_OQ);
1950     mask2 = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_GT_OQ);
1951
1952     z     = _mm256_mul_ps(half_ps, _mm256_sub_ps(one_ps, xabs));
1953     z     = _mm256_mul_ps(z, gmx_mm256_invsqrt_ps(z));
1954     z     = _mm256_andnot_ps(_mm256_cmp_ps(xabs, one_ps, _CMP_EQ_OQ), z);
1955
1956     z     = _mm256_blendv_ps(x, z, mask1);
1957     z     = gmx_mm256_asin_ps(z);
1958
1959     z2    = _mm256_add_ps(z, z);
1960     z1    = _mm256_sub_ps(pi_ps, z2);
1961     z3    = _mm256_sub_ps(halfpi_ps, z);
1962
1963     z     = _mm256_blendv_ps(z1, z2, mask2);
1964     z     = _mm256_blendv_ps(z3, z, mask1);
1965
1966     return z;
1967 }
1968
1969 static __m128
1970 gmx_mm_acos_ps(__m128 x)
1971 {
1972     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
1973     const __m128 one_ps    = _mm_set1_ps(1.0f);
1974     const __m128 half_ps   = _mm_set1_ps(0.5f);
1975     const __m128 pi_ps     = _mm_set1_ps(M_PI);
1976     const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
1977
1978     __m128       mask1;
1979     __m128       mask2;
1980     __m128       xabs;
1981     __m128       z, z1, z2, z3;
1982
1983     xabs  = _mm_and_ps(x, signmask);
1984     mask1 = _mm_cmpgt_ps(xabs, half_ps);
1985     mask2 = _mm_cmpgt_ps(x, _mm_setzero_ps());
1986
1987     z     = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
1988     z     = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
1989     z     = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one_ps), z);
1990
1991     z     = _mm_blendv_ps(x, z, mask1);
1992     z     = gmx_mm_asin_ps(z);
1993
1994     z2    = _mm_add_ps(z, z);
1995     z1    = _mm_sub_ps(pi_ps, z2);
1996     z3    = _mm_sub_ps(halfpi_ps, z);
1997
1998     z     = _mm_blendv_ps(z1, z2, mask2);
1999     z     = _mm_blendv_ps(z3, z, mask1);
2000
2001     return z;
2002 }
2003
2004
2005 static __m256
2006 gmx_mm256_atan_ps(__m256 x)
2007 {
2008     const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
2009     const __m256 limit1    = _mm256_set1_ps(0.414213562373095f);
2010     const __m256 limit2    = _mm256_set1_ps(2.414213562373095f);
2011     const __m256 quarterpi = _mm256_set1_ps(0.785398163397448f);
2012     const __m256 halfpi    = _mm256_set1_ps(1.570796326794896f);
2013     const __m256 mone      = _mm256_set1_ps(-1.0f);
2014     const __m256 CC3       = _mm256_set1_ps(-3.33329491539E-1f);
2015     const __m256 CC5       = _mm256_set1_ps(1.99777106478E-1f);
2016     const __m256 CC7       = _mm256_set1_ps(-1.38776856032E-1);
2017     const __m256 CC9       = _mm256_set1_ps(8.05374449538e-2f);
2018
2019     __m256       sign;
2020     __m256       mask1, mask2;
2021     __m256       y, z1, z2;
2022     __m256       x2, x4;
2023     __m256       sum1, sum2;
2024
2025     sign  = _mm256_andnot_ps(signmask, x);
2026     x     = _mm256_and_ps(x, signmask);
2027
2028     mask1 = _mm256_cmp_ps(x, limit1, _CMP_GT_OQ);
2029     mask2 = _mm256_cmp_ps(x, limit2, _CMP_GT_OQ);
2030
2031     z1    = _mm256_mul_ps(_mm256_add_ps(x, mone), gmx_mm256_inv_ps(_mm256_sub_ps(x, mone)));
2032     z2    = _mm256_mul_ps(mone, gmx_mm256_inv_ps(x));
2033
2034     y     = _mm256_and_ps(mask1, quarterpi);
2035     y     = _mm256_blendv_ps(y, halfpi, mask2);
2036
2037     x     = _mm256_blendv_ps(x, z1, mask1);
2038     x     = _mm256_blendv_ps(x, z2, mask2);
2039
2040     x2    = _mm256_mul_ps(x, x);
2041     x4    = _mm256_mul_ps(x2, x2);
2042
2043     sum1  = _mm256_mul_ps(CC9, x4);
2044     sum2  = _mm256_mul_ps(CC7, x4);
2045     sum1  = _mm256_add_ps(sum1, CC5);
2046     sum2  = _mm256_add_ps(sum2, CC3);
2047     sum1  = _mm256_mul_ps(sum1, x4);
2048     sum2  = _mm256_mul_ps(sum2, x2);
2049
2050     sum1  = _mm256_add_ps(sum1, sum2);
2051     sum1  = _mm256_sub_ps(sum1, mone);
2052     sum1  = _mm256_mul_ps(sum1, x);
2053     y     = _mm256_add_ps(y, sum1);
2054
2055     y     = _mm256_xor_ps(y, sign);
2056
2057     return y;
2058 }
2059
2060 static __m128
2061 gmx_mm_atan_ps(__m128 x)
2062 {
2063     /* Same algorithm as cephes library */
2064     const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
2065     const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
2066     const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
2067     const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
2068     const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
2069     const __m128 mone      = _mm_set1_ps(-1.0f);
2070     const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
2071     const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
2072     const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1);
2073     const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);
2074
2075     __m128       sign;
2076     __m128       mask1, mask2;
2077     __m128       y, z1, z2;
2078     __m128       x2, x4;
2079     __m128       sum1, sum2;
2080
2081     sign  = _mm_andnot_ps(signmask, x);
2082     x     = _mm_and_ps(x, signmask);
2083
2084     mask1 = _mm_cmpgt_ps(x, limit1);
2085     mask2 = _mm_cmpgt_ps(x, limit2);
2086
2087     z1    = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
2088     z2    = _mm_mul_ps(mone, gmx_mm_inv_ps(x));
2089
2090     y     = _mm_and_ps(mask1, quarterpi);
2091     y     = _mm_blendv_ps(y, halfpi, mask2);
2092
2093     x     = _mm_blendv_ps(x, z1, mask1);
2094     x     = _mm_blendv_ps(x, z2, mask2);
2095
2096     x2    = _mm_mul_ps(x, x);
2097     x4    = _mm_mul_ps(x2, x2);
2098
2099     sum1  = _mm_mul_ps(CC9, x4);
2100     sum2  = _mm_mul_ps(CC7, x4);
2101     sum1  = _mm_add_ps(sum1, CC5);
2102     sum2  = _mm_add_ps(sum2, CC3);
2103     sum1  = _mm_mul_ps(sum1, x4);
2104     sum2  = _mm_mul_ps(sum2, x2);
2105
2106     sum1  = _mm_add_ps(sum1, sum2);
2107     sum1  = _mm_sub_ps(sum1, mone);
2108     sum1  = _mm_mul_ps(sum1, x);
2109     y     = _mm_add_ps(y, sum1);
2110
2111     y     = _mm_xor_ps(y, sign);
2112
2113     return y;
2114 }
2115
2116
2117 static __m256
2118 gmx_mm256_atan2_ps(__m256 y, __m256 x)
2119 {
2120     const __m256 pi          = _mm256_set1_ps( (float) M_PI);
2121     const __m256 minuspi     = _mm256_set1_ps( (float) -M_PI);
2122     const __m256 halfpi      = _mm256_set1_ps( (float) M_PI/2.0f);
2123     const __m256 minushalfpi = _mm256_set1_ps( (float) -M_PI/2.0f);
2124
2125     __m256       z, z1, z3, z4;
2126     __m256       w;
2127     __m256       maskx_lt, maskx_eq;
2128     __m256       masky_lt, masky_eq;
2129     __m256       mask1, mask2, mask3, mask4, maskall;
2130
2131     maskx_lt  = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
2132     masky_lt  = _mm256_cmp_ps(y, _mm256_setzero_ps(), _CMP_LT_OQ);
2133     maskx_eq  = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
2134     masky_eq  = _mm256_cmp_ps(y, _mm256_setzero_ps(), _CMP_EQ_OQ);
2135
2136     z         = _mm256_mul_ps(y, gmx_mm256_inv_ps(x));
2137     z         = gmx_mm256_atan_ps(z);
2138
2139     mask1     = _mm256_and_ps(maskx_eq, masky_lt);
2140     mask2     = _mm256_andnot_ps(maskx_lt, masky_eq);
2141     mask3     = _mm256_andnot_ps( _mm256_or_ps(masky_lt, masky_eq), maskx_eq);
2142     mask4     = _mm256_and_ps(maskx_lt, masky_eq);
2143     maskall   = _mm256_or_ps( _mm256_or_ps(mask1, mask2), _mm256_or_ps(mask3, mask4) );
2144
2145     z         = _mm256_andnot_ps(maskall, z);
2146     z1        = _mm256_and_ps(mask1, minushalfpi);
2147     z3        = _mm256_and_ps(mask3, halfpi);
2148     z4        = _mm256_and_ps(mask4, pi);
2149
2150     z         = _mm256_or_ps( _mm256_or_ps(z, z1), _mm256_or_ps(z3, z4) );
2151
2152     w         = _mm256_blendv_ps(pi, minuspi, masky_lt);
2153     w         = _mm256_and_ps(w, maskx_lt);
2154
2155     w         = _mm256_andnot_ps(maskall, w);
2156
2157     z         = _mm256_add_ps(z, w);
2158
2159     return z;
2160 }
2161
2162 static __m128
2163 gmx_mm_atan2_ps(__m128 y, __m128 x)
2164 {
2165     const __m128 pi          = _mm_set1_ps(M_PI);
2166     const __m128 minuspi     = _mm_set1_ps(-M_PI);
2167     const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
2168     const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
2169
2170     __m128       z, z1, z3, z4;
2171     __m128       w;
2172     __m128       maskx_lt, maskx_eq;
2173     __m128       masky_lt, masky_eq;
2174     __m128       mask1, mask2, mask3, mask4, maskall;
2175
2176     maskx_lt  = _mm_cmplt_ps(x, _mm_setzero_ps());
2177     masky_lt  = _mm_cmplt_ps(y, _mm_setzero_ps());
2178     maskx_eq  = _mm_cmpeq_ps(x, _mm_setzero_ps());
2179     masky_eq  = _mm_cmpeq_ps(y, _mm_setzero_ps());
2180
2181     z         = _mm_mul_ps(y, gmx_mm_inv_ps(x));
2182     z         = gmx_mm_atan_ps(z);
2183
2184     mask1     = _mm_and_ps(maskx_eq, masky_lt);
2185     mask2     = _mm_andnot_ps(maskx_lt, masky_eq);
2186     mask3     = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
2187     mask4     = _mm_and_ps(masky_eq, maskx_lt);
2188
2189     maskall   = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );
2190
2191     z         = _mm_andnot_ps(maskall, z);
2192     z1        = _mm_and_ps(mask1, minushalfpi);
2193     z3        = _mm_and_ps(mask3, halfpi);
2194     z4        = _mm_and_ps(mask4, pi);
2195
2196     z         = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );
2197
2198     mask1     = _mm_andnot_ps(masky_lt, maskx_lt);
2199     mask2     = _mm_and_ps(maskx_lt, masky_lt);
2200
2201     w         = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
2202     w         = _mm_andnot_ps(maskall, w);
2203
2204     z         = _mm_add_ps(z, w);
2205
2206     return z;
2207 }
2208
2209 #endif /* _gmx_math_x86_avx_256_single_h_ */