src/gromacs/simd/simd_math.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef GMX_SIMD_SIMD_MATH_H
  36 #define GMX_SIMD_SIMD_MATH_H
  37
  38 /*! \libinternal \file
  39  *
  40  * \brief Math functions for SIMD datatypes.
  41  *
  42  * \attention This file is generic for all SIMD architectures, so you cannot
  43  * assume that any of the optional SIMD features (as defined in simd.h) are
  44  * present. In particular, this means you cannot assume support for integers,
  45  * logical operations (neither on floating-point nor integer values), shifts,
  46  * and the architecture might only have SIMD for either float or double.
  47  * Second, to keep this file clean and general, any additions to this file
  48  * must work for all possible SIMD architectures in both single and double
  49  * precision (if they support it), and you cannot make any assumptions about
  50  * SIMD width.
  51  *
  52  * \author Erik Lindahl <erik.lindahl@scilifelab.se>
  53  *
  54  * \inlibraryapi
  55  * \ingroup module_simd
  56  */
  57
  58 #include "config.h"
  59
  60 #include <cmath>
  61
  62 #include <limits>
  63
  64 #include "gromacs/math/utilities.h"
  65 #include "gromacs/simd/simd.h"
  66 #include "gromacs/utility/basedefinitions.h"
  67 #include "gromacs/utility/real.h"
  68
  69 namespace gmx
  70 {
  71
  72 #if GMX_SIMD
  73
  74 /*! \cond libapi */
  75 /*! \addtogroup module_simd */
  76 /*! \{ */
  77
  78 /*! \name Implementation accuracy settings
  79  *  \{
  80  */
  81
  82 /*! \} */
  83
  84 #    if GMX_SIMD_HAVE_FLOAT
  85
  86 /*! \name Single precision SIMD math functions
  87  *
  88  *  \note In most cases you should use the real-precision functions instead.
  89  *  \{
  90  */
  91
  92 /****************************************
  93  * SINGLE PRECISION SIMD MATH FUNCTIONS *
  94  ****************************************/
  95
  96 #        if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_FLOAT
  97 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
  98  *
  99  * \param x Values to set sign for
 100  * \param y Values used to set sign
 101  * \return  Magnitude of x, sign of y
 102  */
 103 static inline SimdFloat gmx_simdcall copysign(SimdFloat x, SimdFloat y)
 104 {
 105 #            if GMX_SIMD_HAVE_LOGICAL
 106     return abs(x) | (SimdFloat(GMX_FLOAT_NEGZERO) & y);
 107 #            else
 108     return blend(abs(x), -abs(x), y < setZero());
 109 #            endif
 110 }
 111 #        endif
 112
 113 #        if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_FLOAT
 114 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD float.
 115  *
 116  * This is a low-level routine that should only be used by SIMD math routine
 117  * that evaluates the inverse square root.
 118  *
 119  *  \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
 120  *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
 121  *  \return   An improved approximation with roughly twice as many bits of accuracy.
 122  */
 123 static inline SimdFloat gmx_simdcall rsqrtIter(SimdFloat lu, SimdFloat x)
 124 {
 125     SimdFloat tmp1 = x * lu;
 126     SimdFloat tmp2 = SimdFloat(-0.5F) * lu;
 127     tmp1           = fma(tmp1, lu, SimdFloat(-3.0F));
 128     return tmp1 * tmp2;
 129 }
 130 #        endif
 131
 132 /*! \brief Calculate 1/sqrt(x) for SIMD float.
 133  *
 134  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
 135  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
 136  *           For the single precision implementation this is obviously always
 137  *           true for positive values, but for double precision it adds an
 138  *           extra restriction since the first lookup step might have to be
 139  *           performed in single precision on some architectures. Note that the
 140  *           responsibility for checking falls on you - this routine does not
 141  *           check arguments.
 142  *
 143  *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
 144  */
 145 static inline SimdFloat gmx_simdcall invsqrt(SimdFloat x)
 146 {
 147     SimdFloat lu = rsqrt(x);
 148 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
 149     lu = rsqrtIter(lu, x);
 150 #        endif
 151 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 152     lu = rsqrtIter(lu, x);
 153 #        endif
 154 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 155     lu = rsqrtIter(lu, x);
 156 #        endif
 157     return lu;
 158 }
 159
 160 /*! \brief Calculate 1/sqrt(x) for two SIMD floats.
 161  *
 162  * \param x0  First set of arguments, x0 must be in single range (see below).
 163  * \param x1  Second set of arguments, x1 must be in single range (see below).
 164  * \param[out] out0  Result 1/sqrt(x0)
 165  * \param[out] out1  Result 1/sqrt(x1)
 166  *
 167  *  In particular for double precision we can sometimes calculate square root
 168  *  pairs slightly faster by using single precision until the very last step.
 169  *
 170  * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
 171  *       GMX_FLOAT_MAX, i.e. within the range of single precision.
 172  *       For the single precision implementation this is obviously always
 173  *       true for positive values, but for double precision it adds an
 174  *       extra restriction since the first lookup step might have to be
 175  *       performed in single precision on some architectures. Note that the
 176  *       responsibility for checking falls on you - this routine does not
 177  *       check arguments.
 178  */
 179 static inline void gmx_simdcall invsqrtPair(SimdFloat x0, SimdFloat x1, SimdFloat* out0, SimdFloat* out1)
 180 {
 181     *out0 = invsqrt(x0);
 182     *out1 = invsqrt(x1);
 183 }
 184
 185 #        if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_FLOAT
 186 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD float.
 187  *
 188  * This is a low-level routine that should only be used by SIMD math routine
 189  * that evaluates the reciprocal.
 190  *
 191  *  \param lu Approximation of 1/x, typically obtained from lookup.
 192  *  \param x  The reference (starting) value x for which we want 1/x.
 193  *  \return   An improved approximation with roughly twice as many bits of accuracy.
 194  */
 195 static inline SimdFloat gmx_simdcall rcpIter(SimdFloat lu, SimdFloat x)
 196 {
 197     return lu * fnma(lu, x, SimdFloat(2.0F));
 198 }
 199 #        endif
 200
 201 /*! \brief Calculate 1/x for SIMD float.
 202  *
 203  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
 204  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
 205  *           For the single precision implementation this is obviously always
 206  *           true for positive values, but for double precision it adds an
 207  *           extra restriction since the first lookup step might have to be
 208  *           performed in single precision on some architectures. Note that the
 209  *           responsibility for checking falls on you - this routine does not
 210  *           check arguments.
 211  *
 212  *  \return 1/x. Result is undefined if your argument was invalid.
 213  */
 214 static inline SimdFloat gmx_simdcall inv(SimdFloat x)
 215 {
 216     SimdFloat lu = rcp(x);
 217 #        if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
 218     lu = rcpIter(lu, x);
 219 #        endif
 220 #        if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 221     lu = rcpIter(lu, x);
 222 #        endif
 223 #        if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 224     lu = rcpIter(lu, x);
 225 #        endif
 226     return lu;
 227 }
 228
 229 /*! \brief Division for SIMD floats
 230  *
 231  * \param nom    Nominator
 232  * \param denom  Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
 233  *               For single precision this is equivalent to a nonzero argument,
 234  *               but in double precision it adds an extra restriction since
 235  *               the first lookup step might have to be performed in single
 236  *               precision on some architectures. Note that the responsibility
 237  *               for checking falls on you - this routine does not check arguments.
 238  *
 239  * \return nom/denom
 240  *
 241  * \note This function does not use any masking to avoid problems with
 242  *       zero values in the denominator.
 243  */
 244 static inline SimdFloat gmx_simdcall operator/(SimdFloat nom, SimdFloat denom)
 245 {
 246     return nom * inv(denom);
 247 }
 248
 249 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD float.
 250  *
 251  *  This routine only evaluates 1/sqrt(x) for elements for which mask is true.
 252  *  Illegal values in the masked-out elements will not lead to
 253  *  floating-point exceptions.
 254  *
 255  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
 256  *           GMX_FLOAT_MAX for masked-in entries.
 257  *           See \ref invsqrt for the discussion about argument restrictions.
 258  *  \param m Mask
 259  *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
 260  *          entry was not masked, and 0.0 for masked-out entries.
 261  */
 262 static inline SimdFloat maskzInvsqrt(SimdFloat x, SimdFBool m)
 263 {
 264     SimdFloat lu = maskzRsqrt(x, m);
 265 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
 266     lu = rsqrtIter(lu, x);
 267 #        endif
 268 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 269     lu = rsqrtIter(lu, x);
 270 #        endif
 271 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 272     lu = rsqrtIter(lu, x);
 273 #        endif
 274     return lu;
 275 }
 276
 277 /*! \brief Calculate 1/x for SIMD float, masked version.
 278  *
 279  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
 280  *           GMX_FLOAT_MAX for masked-in entries.
 281  *           See \ref invsqrt for the discussion about argument restrictions.
 282  *  \param m Mask
 283  *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
 284  */
 285 static inline SimdFloat gmx_simdcall maskzInv(SimdFloat x, SimdFBool m)
 286 {
 287     SimdFloat lu = maskzRcp(x, m);
 288 #        if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
 289     lu = rcpIter(lu, x);
 290 #        endif
 291 #        if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 292     lu = rcpIter(lu, x);
 293 #        endif
 294 #        if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 295     lu = rcpIter(lu, x);
 296 #        endif
 297     return lu;
 298 }
 299
 300 /*! \brief Calculate sqrt(x) for SIMD floats
 301  *
 302  *  \tparam opt By default, this function checks if the input value is 0.0
 303  *              and masks this to return the correct result. If you are certain
 304  *              your argument will never be zero, and you know you need to save
 305  *              every single cycle you can, you can alternatively call the
 306  *              function as sqrt<MathOptimization::Unsafe>(x).
 307  *
 308  *  \param  x   Argument that must be in range 0 <=x <= GMX_FLOAT_MAX, since the
 309  *              lookup step often has to be implemented in single precision.
 310  *              Arguments smaller than GMX_FLOAT_MIN will always lead to a zero
 311  *              result, even in double precision. If you are using the unsafe
 312  *              math optimization parameter, the argument must be in the range
 313  *              GMX_FLOAT_MIN <= x <= GMX_FLOAT_MAX.
 314  *
 315  *  \return sqrt(x). The result is undefined if the input value does not fall
 316  *          in the allowed range specified for the argument.
 317  */
 318 template<MathOptimization opt = MathOptimization::Safe>
 319 static inline SimdFloat gmx_simdcall sqrt(SimdFloat x)
 320 {
 321     if (opt == MathOptimization::Safe)
 322     {
 323         SimdFloat res = maskzInvsqrt(x, setZero() < x);
 324         return res * x;
 325     }
 326     else
 327     {
 328         return x * invsqrt(x);
 329     }
 330 }
 331
 332 /*! \brief Cube root for SIMD floats
 333  *
 334  * \param x      Argument to calculate cube root of. Can be negative or zero,
 335  *               but NaN or Inf values are not supported. Denormal values will
 336  *               be treated as 0.0.
 337  * \return       Cube root of x.
 338  */
 339 static inline SimdFloat gmx_simdcall cbrt(SimdFloat x)
 340 {
 341     const SimdFloat signBit(GMX_FLOAT_NEGZERO);
 342     const SimdFloat minFloat(std::numeric_limits<float>::min());
 343     // Bias is 128-1 = 127, which is not divisible by 3. Since the largest-magnitude
 344     // negative exponent from frexp() is -126, we can subtract one more unit to get 126
 345     // as offset, which is divisible by 3 (result 42). To avoid clang warnings about fragile integer
 346     // division mixed with FP, we let the divided value (42) be the original constant.
 347     const std::int32_t offsetDiv3(42);
 348     const SimdFloat    c2(-0.191502161678719066F);
 349     const SimdFloat    c1(0.697570460207922770F);
 350     const SimdFloat    c0(0.492659620528969547F);
 351     const SimdFloat    one(1.0F);
 352     const SimdFloat    two(2.0F);
 353     const SimdFloat    three(3.0F);
 354     const SimdFloat    oneThird(1.0F / 3.0F);
 355     const SimdFloat    cbrt2(1.2599210498948731648F);
 356     const SimdFloat    sqrCbrt2(1.5874010519681994748F);
 357
 358     // To calculate cbrt(x) we first take the absolute value of x but save the sign,
 359     // since cbrt(-x) = -cbrt(x). Then we only need to consider positive values for
 360     // the main step.
 361     // A number x is represented in IEEE754 as fraction*2^e. We rewrite this as
 362     // x=fraction*2^(3*n)*2^m, where e=3*n+m, and m is a remainder.
 363     // The cube root can the be evaluated by calculating the cube root of the fraction
 364     // limited to the mantissa range, multiplied by 2^mod (which is either 1, +/-2^(1/3) or
 365     // +/-2^(2/3), and then we load this into a new IEEE754 fp number with the exponent 2^n, where
 366     // n is the integer part of the original exponent divided by 3.
 367
 368     SimdFloat xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
 369     SimdFloat xAbs     = andNot(signBit, x);   // select everthing but the sign bit => abs(x)
 370     SimdFBool xIsNonZero = (minFloat <= xAbs); // treat denormals as 0
 371
 372     SimdFInt32 exponent;
 373     SimdFloat  y = frexp(xAbs, &exponent);
 374     // For the mantissa (y) we will use a limited-range approximation of cbrt(y),
 375     // by first using a polynomial and then evaluating
 376     // Transform y to z = c2*y^2 + c1*y + c0, then w = z^3, and finally
 377     // evaluate the quotient q = z * (w + 2 * y) / (2 * w + y).
 378     SimdFloat z        = fma(fma(y, c2, c1), y, c0);
 379     SimdFloat w        = z * z * z;
 380     SimdFloat nom      = z * fma(two, y, w);
 381     SimdFloat invDenom = inv(fma(two, w, y));
 382
 383     // Handle the exponent. In principle there are beautiful ways to do this with custom 16-bit
 384     // division converted to multiplication... but we can't do that since our SIMD layer cannot
 385     // assume the presence of integer shift operations!
 386     // However, when I first worked with the integer algorithm I still came up with a neat
 387     // optimization, so I'll describe the full algorithm here in case we ever want to use it
 388     // in the future:
 389     //
 390     // Our dividend is signed, which is a complication, but let's consider the unsigned case
 391     // first: Division by 3 corresponds to multiplication by 1010101... Since we also know
 392     // our dividend is less than 16 bits (exponent range) we can accomplish this by
 393     // multiplying with 21845 (which is almost 2^16/3 - 21845.333 would be exact) and then
 394     // right-shifting by 16 bits to divide out the 2^16 part.
 395     // If we add 1 to the dividend to handle the extra 0.333, the integer result will be correct.
 396     // To handle the signed exponent one alternative would be to take absolute values, saving
 397     // signs, etc - but that gets a bit complicated with 2-complement integers.
 398     // Instead, we remember that we don't really want the exact division per se - what we're
 399     // really after is only rewriting e = 3*n+m. That will actually be *easier* to handle if
 400     // we require that m must be positive (fewer cases to handle) instead of having n as the
 401     // strict e/3.
 402     // To handle this we start by adding 127 to the exponent. This value corresponds to the
 403     // exponent bias, minus 1 because frexp() has a different standard for the value it returns,
 404     // but then we add 1 back to handle the extra 0.333 in 21845. So, we have offsetExp = e+127
 405     // and then multiply by 21845 to get a division result offsetExpDiv3.
 406     // A (signed) value for n is then recovered by subtracting 42 (bias-1)/3 from k.
 407     // To calculate a strict remainder we should evaluate offsetExp - 3*offsetExpDiv3 - 1, where
 408     // the extra 1 corrects for the value we added to the exponent to get correct division.
 409     // This remainder would have the value 0,1, or 2, but since we only use it to select
 410     // other numbers we can skip the last step and just handle the cases as 1,2 or 3 instead.
 411     //
 412     // OK; end of long detour. Here's how we actually do it in our implementation by using
 413     // floating-point for the exponent instead to avoid needing integer shifts:
 414     //
 415     // 1) Convert the exponent (obtained from frexp) to a float
 416     // 2) Calculate offsetExp = exp + offset. Note that we should not add the extra 1 here since we
 417     //    do floating-point division instead of our integer hack, so it's the exponent bias-1, or
 418     //    the largest exponent minus 2.
 419     // 3) Divide the float by 3 by multiplying with 1/3
 420     // 4) Truncate it to an integer to get the division result. This is potentially dangerous in
 421     //    combination with floating-point, because many integers cannot be represented exactly in
 422     //    floating point, and if we are just epsilon below the result might be truncated to a lower
 423     //    integer. I have not observed this on x86, but to have a safety margin we can add a small
 424     //    fraction - since we already know the fraction part should be either 0, 0.333..., or 0.666...
 425     //    We can even save this extra floating-point addition by adding a small fraction (0.1) when
 426     //    we introduce the exponent offset - that will correspond to a safety margin of 0.1/3, which is plenty.
 427     // 5) Get the remainder part by subtracting the truncated floating-point part.
 428     //    Here too we will have a plain division, so the remainder is a strict modulus
 429     //    and will have the values 0, 1 or 2.
 430     //
 431     // Before worrying about the few wasted cycles due to longer fp latency, this has the
 432     // additional advantage that we don't use a single integer operation, so the algorithm
 433     // will work just A-OK on all SIMD implementations, which avoids diverging code paths.
 434
 435     // The  0.1 here is the safety margin due to  truncation described in item 4 in the comments above.
 436     SimdFloat offsetExp = cvtI2R(exponent) + SimdFloat(static_cast<float>(3 * offsetDiv3) + 0.1);
 437
 438     SimdFloat offsetExpDiv3 =
 439             trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
 440
 441     SimdFInt32 expDiv3 = cvtR2I(offsetExpDiv3 - SimdFloat(static_cast<float>(offsetDiv3)));
 442
 443     SimdFloat remainder = offsetExp - offsetExpDiv3 * three;
 444
 445     // If remainder is 0 we should just have the factor 1.0,
 446     // so first pick 1.0 if it is below 0.5, and 2^(1/3) if it's above 0.5 (i.e., 1 or 2)
 447     SimdFloat factor = blend(one, cbrt2, SimdFloat(0.5) < remainder);
 448     // Second, we overwrite with 2^(2/3) if rem>1.5 (i.e., 2)
 449     factor = blend(factor, sqrCbrt2, SimdFloat(1.5) < remainder);
 450
 451     // Assemble the non-signed fraction, and add the sign back by xor
 452     SimdFloat fraction = (nom * invDenom * factor) ^ xSignBit;
 453     // Load to IEEE754 number, and set result to 0.0 if x was 0.0 or denormal
 454     SimdFloat result = selectByMask(ldexp(fraction, expDiv3), xIsNonZero);
 455
 456     return result;
 457 }
 458
 459 /*! \brief Inverse cube root for SIMD floats
 460  *
 461  * \param x      Argument to calculate cube root of. Can be positive or
 462  *               negative, but the magnitude cannot be lower than
 463  *               the smallest normal number.
 464  * \return       Cube root of x. Undefined for values that don't
 465  *               fulfill the restriction of abs(x) > minFloat.
 466  */
 467 static inline SimdFloat gmx_simdcall invcbrt(SimdFloat x)
 468 {
 469     const SimdFloat signBit(GMX_FLOAT_NEGZERO);
 470     const SimdFloat minFloat(std::numeric_limits<float>::min());
 471     // Bias is 128-1 = 127, which is not divisible by 3. Since the largest-magnitude
 472     // negative exponent from frexp() is -126, we can subtract one more unit to get 126
 473     // as offset, which is divisible by 3 (result 42). To avoid clang warnings about fragile integer
 474     // division mixed with FP, we let the divided value (42) be the original constant.
 475     const std::int32_t offsetDiv3(42);
 476     const SimdFloat    c2(-0.191502161678719066F);
 477     const SimdFloat    c1(0.697570460207922770F);
 478     const SimdFloat    c0(0.492659620528969547F);
 479     const SimdFloat    one(1.0F);
 480     const SimdFloat    two(2.0F);
 481     const SimdFloat    three(3.0F);
 482     const SimdFloat    oneThird(1.0F / 3.0F);
 483     const SimdFloat    invCbrt2(1.0F / 1.2599210498948731648F);
 484     const SimdFloat    invSqrCbrt2(1.0F / 1.5874010519681994748F);
 485
 486     // We use pretty much exactly the same implementation as for cbrt(x),
 487     // but to compute the inverse we swap the nominator/denominator
 488     // in the quotient, and also swap the sign of the exponent parts.
 489
 490     SimdFloat xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
 491     SimdFloat xAbs     = andNot(signBit, x); // select everthing but the sign bit => abs(x)
 492
 493     SimdFInt32 exponent;
 494     SimdFloat  y = frexp(xAbs, &exponent);
 495     // For the mantissa (y) we will use a limited-range approximation of cbrt(y),
 496     // by first using a polynomial and then evaluating
 497     // Transform y to z = c2*y^2 + c1*y + c0, then w = z^3, and finally
 498     // evaluate the quotient q = z * (w + 2 * y) / (2 * w + y).
 499     SimdFloat z        = fma(fma(y, c2, c1), y, c0);
 500     SimdFloat w        = z * z * z;
 501     SimdFloat nom      = fma(two, w, y);
 502     SimdFloat invDenom = inv(z * fma(two, y, w));
 503
 504     // The  0.1 here is the safety margin due to  truncation described in item 4 in the comments above.
 505     SimdFloat offsetExp = cvtI2R(exponent) + SimdFloat(static_cast<float>(3 * offsetDiv3) + 0.1);
 506     SimdFloat offsetExpDiv3 =
 507             trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
 508
 509     // We should swap the sign here, so we change order of the terms in the subtraction
 510     SimdFInt32 expDiv3 = cvtR2I(SimdFloat(static_cast<float>(offsetDiv3)) - offsetExpDiv3);
 511
 512     // Swap sign here too, so remainder is either 0, -1 or -2
 513     SimdFloat remainder = offsetExpDiv3 * three - offsetExp;
 514
 515     // If remainder is 0 we should just have the factor 1.0,
 516     // so first pick 1.0 if it is above -0.5, and 2^(-1/3) if it's below -0.5 (i.e., -1 or -2)
 517     SimdFloat factor = blend(one, invCbrt2, remainder < SimdFloat(-0.5));
 518     // Second, we overwrite with 2^(-2/3) if rem<-1.5 (i.e., -2)
 519     factor = blend(factor, invSqrCbrt2, remainder < SimdFloat(-1.5));
 520
 521     // Assemble the non-signed fraction, and add the sign back by xor
 522     SimdFloat fraction = (nom * invDenom * factor) ^ xSignBit;
 523     // Load to IEEE754 number, and set result to 0.0 if x was 0.0 or denormal
 524     SimdFloat result = ldexp(fraction, expDiv3);
 525
 526     return result;
 527 }
 528
 529 /*! \brief SIMD float log2(x). This is the base-2 logarithm.
 530  *
 531  * \param x Argument, should be >0.
 532  * \result The base-2 logarithm of x. Undefined if argument is invalid.
 533  */
 534 static inline SimdFloat gmx_simdcall log2(SimdFloat x)
 535 {
 536     // This implementation computes log2 by
 537     // 1) Extracting the exponent and adding it to...
 538     // 2) A 9th-order minimax approximation using only odd
 539     //    terms of (x-1)/(x+1), where x is the mantissa.
 540
 541 #        if GMX_SIMD_HAVE_NATIVE_LOG_FLOAT
 542     // Just rescale if native log2() is not present, but log() is.
 543     return log(x) * SimdFloat(std::log2(std::exp(1.0)));
 544 #        else
 545     const SimdFloat one(1.0F);
 546     const SimdFloat two(2.0F);
 547     const SimdFloat invsqrt2(1.0F / std::sqrt(2.0F));
 548     const SimdFloat CL9(0.342149508897807708152F);
 549     const SimdFloat CL7(0.411570606888219447939F);
 550     const SimdFloat CL5(0.577085979152320294183F);
 551     const SimdFloat CL3(0.961796550607099898222F);
 552     const SimdFloat CL1(2.885390081777926774009F);
 553     SimdFloat       fExp, x2, p;
 554     SimdFBool       m;
 555     SimdFInt32      iExp;
 556
 557     x    = frexp(x, &iExp);
 558     fExp = cvtI2R(iExp);
 559
 560     m = x < invsqrt2;
 561     // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
 562     fExp = fExp - selectByMask(one, m);
 563     x    = x * blend(one, two, m);
 564
 565     x  = (x - one) * inv(x + one);
 566     x2 = x * x;
 567
 568     p = fma(CL9, x2, CL7);
 569     p = fma(p, x2, CL5);
 570     p = fma(p, x2, CL3);
 571     p = fma(p, x2, CL1);
 572     p = fma(p, x, fExp);
 573
 574     return p;
 575 #        endif
 576 }
 577
 578 #        if !GMX_SIMD_HAVE_NATIVE_LOG_FLOAT
 579 /*! \brief SIMD float log(x). This is the natural logarithm.
 580  *
 581  * \param x Argument, should be >0.
 582  * \result The natural logarithm of x. Undefined if argument is invalid.
 583  */
 584 static inline SimdFloat gmx_simdcall log(SimdFloat x)
 585 {
 586     const SimdFloat one(1.0F);
 587     const SimdFloat two(2.0F);
 588     const SimdFloat invsqrt2(1.0F / std::sqrt(2.0F));
 589     const SimdFloat corr(0.693147180559945286226764F);
 590     const SimdFloat CL9(0.2371599674224853515625F);
 591     const SimdFloat CL7(0.285279005765914916992188F);
 592     const SimdFloat CL5(0.400005519390106201171875F);
 593     const SimdFloat CL3(0.666666567325592041015625F);
 594     const SimdFloat CL1(2.0F);
 595     SimdFloat       fExp, x2, p;
 596     SimdFBool       m;
 597     SimdFInt32      iExp;
 598
 599     x    = frexp(x, &iExp);
 600     fExp = cvtI2R(iExp);
 601
 602     m = x < invsqrt2;
 603     // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
 604     fExp = fExp - selectByMask(one, m);
 605     x    = x * blend(one, two, m);
 606
 607     x  = (x - one) * inv(x + one);
 608     x2 = x * x;
 609
 610     p = fma(CL9, x2, CL7);
 611     p = fma(p, x2, CL5);
 612     p = fma(p, x2, CL3);
 613     p = fma(p, x2, CL1);
 614     p = fma(p, x, corr * fExp);
 615
 616     return p;
 617 }
 618 #        endif
 619
 620 #        if !GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT
 621 /*! \brief SIMD float 2^x
 622  *
 623  * \tparam opt If this is changed from the default (safe) into the unsafe
 624  *             option, input values that would otherwise lead to zero-clamped
 625  *             results are not allowed and will lead to undefined results.
 626  *
 627  * \param x Argument. For the default (safe) function version this can be
 628  *          arbitrarily small value, but the routine might clamp the result to
 629  *          zero for arguments that would produce subnormal IEEE754-2008 results.
 630  *          This corresponds to inputs below -126 in single or -1022 in double,
 631  *          and it might overflow for arguments reaching 127 (single) or
 632  *          1023 (double). If you enable the unsafe math optimization,
 633  *          very small arguments will not necessarily be zero-clamped, but
 634  *          can produce undefined results.
 635  *
 636  * \result 2^x. The result is undefined for very large arguments that cause
 637  *          internal floating-point overflow. If unsafe optimizations are enabled,
 638  *          this is also true for very small values.
 639  *
 640  * \note    The definition range of this function is just-so-slightly smaller
 641  *          than the allowed IEEE exponents for many architectures. This is due
 642  *          to the implementation, which will hopefully improve in the future.
 643  *
 644  * \warning You cannot rely on this implementation returning inf for arguments
 645  *          that cause overflow. If you have some very large
 646  *          values and need to rely on getting a valid numerical output,
 647  *          take the minimum of your variable and the largest valid argument
 648  *          before calling this routine.
 649  */
 650 template<MathOptimization opt = MathOptimization::Safe>
 651 static inline SimdFloat gmx_simdcall exp2(SimdFloat x)
 652 {
 653     const SimdFloat CC6(0.0001534581200287996416911311F);
 654     const SimdFloat CC5(0.001339993121934088894618990F);
 655     const SimdFloat CC4(0.009618488957115180159497841F);
 656     const SimdFloat CC3(0.05550328776964726865751735F);
 657     const SimdFloat CC2(0.2402264689063408646490722F);
 658     const SimdFloat CC1(0.6931472057372680777553816F);
 659     const SimdFloat one(1.0F);
 660
 661     SimdFloat intpart;
 662     SimdFloat fexppart;
 663     SimdFloat p;
 664
 665     // Large negative values are valid arguments to exp2(), so there are two
 666     // things we need to account for:
 667     // 1. When the exponents reaches -127, the (biased) exponent field will be
 668     //    zero and we can no longer multiply with it. There are special IEEE
 669     //    formats to handle this range, but for now we have to accept that
 670     //    we cannot handle those arguments. If input value becomes even more
 671     //    negative, it will start to loop and we would end up with invalid
 672     //    exponents. Thus, we need to limit or mask this.
 673     // 2. For VERY large negative values, we will have problems that the
 674     //    subtraction to get the fractional part loses accuracy, and then we
 675     //    can end up with overflows in the polynomial.
 676     //
 677     // For now, we handle this by forwarding the math optimization setting to
 678     // ldexp, where the routine will return zero for very small arguments.
 679     //
 680     // However, before doing that we need to make sure we do not call cvtR2I
 681     // with an argument that is so negative it cannot be converted to an integer.
 682     if (opt == MathOptimization::Safe)
 683     {
 684         x = max(x, SimdFloat(std::numeric_limits<std::int32_t>::lowest()));
 685     }
 686
 687     fexppart = ldexp<opt>(one, cvtR2I(x));
 688     intpart  = round(x);
 689     x        = x - intpart;
 690
 691     p = fma(CC6, x, CC5);
 692     p = fma(p, x, CC4);
 693     p = fma(p, x, CC3);
 694     p = fma(p, x, CC2);
 695     p = fma(p, x, CC1);
 696     p = fma(p, x, one);
 697     x = p * fexppart;
 698     return x;
 699 }
 700 #        endif
 701
 702 #        if !GMX_SIMD_HAVE_NATIVE_EXP_FLOAT
 703 /*! \brief SIMD float exp(x).
 704  *
 705  * In addition to scaling the argument for 2^x this routine correctly does
 706  * extended precision arithmetics to improve accuracy.
 707  *
 708  * \tparam opt If this is changed from the default (safe) into the unsafe
 709  *             option, input values that would otherwise lead to zero-clamped
 710  *             results are not allowed and will lead to undefined results.
 711  *
 712  * \param x Argument. For the default (safe) function version this can be
 713  *          arbitrarily small value, but the routine might clamp the result to
 714  *          zero for arguments that would produce subnormal IEEE754-2008 results.
 715  *          This corresponds to input arguments reaching
 716  *          -126*ln(2)=-87.3 in single, or -1022*ln(2)=-708.4 (double).
 717  *          Similarly, it might overflow for arguments reaching
 718  *          127*ln(2)=88.0 (single) or 1023*ln(2)=709.1 (double). If the
 719  *          unsafe math optimizations are enabled, small input values that would
 720  *          result in zero-clamped output are not allowed.
 721  *
 722  * \result exp(x). Overflowing arguments are likely to either return 0 or inf,
 723  *         depending on the underlying implementation. If unsafe optimizations
 724  *         are enabled, this is also true for very small values.
 725  *
 726  * \note    The definition range of this function is just-so-slightly smaller
 727  *          than the allowed IEEE exponents for many architectures. This is due
 728  *          to the implementation, which will hopefully improve in the future.
 729  *
 730  * \warning You cannot rely on this implementation returning inf for arguments
 731  *          that cause overflow. If you have some very large
 732  *          values and need to rely on getting a valid numerical output,
 733  *          take the minimum of your variable and the largest valid argument
 734  *          before calling this routine.
 735  */
 736 template<MathOptimization opt = MathOptimization::Safe>
 737 static inline SimdFloat gmx_simdcall exp(SimdFloat x)
 738 {
 739     const SimdFloat argscale(1.44269504088896341F);
 740     const SimdFloat invargscale0(-0.693145751953125F);
 741     const SimdFloat invargscale1(-1.428606765330187045e-06F);
 742     const SimdFloat CC4(0.00136324646882712841033936F);
 743     const SimdFloat CC3(0.00836596917361021041870117F);
 744     const SimdFloat CC2(0.0416710823774337768554688F);
 745     const SimdFloat CC1(0.166665524244308471679688F);
 746     const SimdFloat CC0(0.499999850988388061523438F);
 747     const SimdFloat one(1.0F);
 748     SimdFloat       fexppart;
 749     SimdFloat       intpart;
 750     SimdFloat       y, p;
 751
 752     // Large negative values are valid arguments to exp2(), so there are two
 753     // things we need to account for:
 754     // 1. When the exponents reaches -127, the (biased) exponent field will be
 755     //    zero and we can no longer multiply with it. There are special IEEE
 756     //    formats to handle this range, but for now we have to accept that
 757     //    we cannot handle those arguments. If input value becomes even more
 758     //    negative, it will start to loop and we would end up with invalid
 759     //    exponents. Thus, we need to limit or mask this.
 760     // 2. For VERY large negative values, we will have problems that the
 761     //    subtraction to get the fractional part loses accuracy, and then we
 762     //    can end up with overflows in the polynomial.
 763     //
 764     // For now, we handle this by forwarding the math optimization setting to
 765     // ldexp, where the routine will return zero for very small arguments.
 766     //
 767     // However, before doing that we need to make sure we do not call cvtR2I
 768     // with an argument that is so negative it cannot be converted to an integer
 769     // after being multiplied by argscale.
 770
 771     if (opt == MathOptimization::Safe)
 772     {
 773         x = max(x, SimdFloat(std::numeric_limits<std::int32_t>::lowest()) / argscale);
 774     }
 775
 776     y = x * argscale;
 777
 778
 779     fexppart = ldexp<opt>(one, cvtR2I(y));
 780     intpart  = round(y);
 781
 782     // Extended precision arithmetics
 783     x = fma(invargscale0, intpart, x);
 784     x = fma(invargscale1, intpart, x);
 785
 786     p = fma(CC4, x, CC3);
 787     p = fma(p, x, CC2);
 788     p = fma(p, x, CC1);
 789     p = fma(p, x, CC0);
 790     p = fma(x * x, p, x);
 791 #            if GMX_SIMD_HAVE_FMA
 792     x = fma(p, fexppart, fexppart);
 793 #            else
 794     x = (p + one) * fexppart;
 795 #            endif
 796     return x;
 797 }
 798 #        endif
 799
 800 /*! \brief SIMD float pow(x,y)
 801  *
 802  * This returns x^y for SIMD values.
 803  *
 804  * \tparam opt If this is changed from the default (safe) into the unsafe
 805  *             option, there are no guarantees about correct results for x==0.
 806  *
 807  * \param x Base.
 808  *
 809  * \param y exponent.
 810
 811  * \result x^y. Overflowing arguments are likely to either return 0 or inf,
 812  *         depending on the underlying implementation. If unsafe optimizations
 813  *         are enabled, this is also true for x==0.
 814  *
 815  * \warning You cannot rely on this implementation returning inf for arguments
 816  *          that cause overflow. If you have some very large
 817  *          values and need to rely on getting a valid numerical output,
 818  *          take the minimum of your variable and the largest valid argument
 819  *          before calling this routine.
 820  */
 821 template<MathOptimization opt = MathOptimization::Safe>
 822 static inline SimdFloat gmx_simdcall pow(SimdFloat x, SimdFloat y)
 823 {
 824     SimdFloat xcorr;
 825
 826     if (opt == MathOptimization::Safe)
 827     {
 828         xcorr = max(x, SimdFloat(std::numeric_limits<float>::min()));
 829     }
 830     else
 831     {
 832         xcorr = x;
 833     }
 834
 835     SimdFloat result = exp2<opt>(y * log2(xcorr));
 836
 837     if (opt == MathOptimization::Safe)
 838     {
 839         // if x==0 and y>0 we explicitly set the result to 0.0
 840         // For any x with y==0, the result will already be 1.0 since we multiply by y (0.0) and call exp().
 841         result = blend(result, setZero(), x == setZero() && setZero() < y);
 842     }
 843
 844     return result;
 845 }
 846
 847
 848 /*! \brief SIMD float erf(x).
 849  *
 850  * \param x The value to calculate erf(x) for.
 851  * \result erf(x)
 852  *
 853  * This routine achieves very close to full precision, but we do not care about
 854  * the last bit or the subnormal result range.
 855  */
 856 static inline SimdFloat gmx_simdcall erf(SimdFloat x)
 857 {
 858     // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
 859     const SimdFloat CA6(7.853861353153693e-5F);
 860     const SimdFloat CA5(-8.010193625184903e-4F);
 861     const SimdFloat CA4(5.188327685732524e-3F);
 862     const SimdFloat CA3(-2.685381193529856e-2F);
 863     const SimdFloat CA2(1.128358514861418e-1F);
 864     const SimdFloat CA1(-3.761262582423300e-1F);
 865     const SimdFloat CA0(1.128379165726710F);
 866     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
 867     const SimdFloat CB9(-0.0018629930017603923F);
 868     const SimdFloat CB8(0.003909821287598495F);
 869     const SimdFloat CB7(-0.0052094582210355615F);
 870     const SimdFloat CB6(0.005685614362160572F);
 871     const SimdFloat CB5(-0.0025367682853477272F);
 872     const SimdFloat CB4(-0.010199799682318782F);
 873     const SimdFloat CB3(0.04369575504816542F);
 874     const SimdFloat CB2(-0.11884063474674492F);
 875     const SimdFloat CB1(0.2732120154030589F);
 876     const SimdFloat CB0(0.42758357702025784F);
 877     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
 878     const SimdFloat CC10(-0.0445555913112064F);
 879     const SimdFloat CC9(0.21376355144663348F);
 880     const SimdFloat CC8(-0.3473187200259257F);
 881     const SimdFloat CC7(0.016690861551248114F);
 882     const SimdFloat CC6(0.7560973182491192F);
 883     const SimdFloat CC5(-1.2137903600145787F);
 884     const SimdFloat CC4(0.8411872321232948F);
 885     const SimdFloat CC3(-0.08670413896296343F);
 886     const SimdFloat CC2(-0.27124782687240334F);
 887     const SimdFloat CC1(-0.0007502488047806069F);
 888     const SimdFloat CC0(0.5642114853803148F);
 889     const SimdFloat one(1.0F);
 890     const SimdFloat two(2.0F);
 891
 892     SimdFloat x2, x4, y;
 893     SimdFloat t, t2, w, w2;
 894     SimdFloat pA0, pA1, pB0, pB1, pC0, pC1;
 895     SimdFloat expmx2;
 896     SimdFloat res_erf, res_erfc, res;
 897     SimdFBool m, maskErf;
 898
 899     // Calculate erf()
 900     x2 = x * x;
 901     x4 = x2 * x2;
 902
 903     pA0 = fma(CA6, x4, CA4);
 904     pA1 = fma(CA5, x4, CA3);
 905     pA0 = fma(pA0, x4, CA2);
 906     pA1 = fma(pA1, x4, CA1);
 907     pA0 = pA0 * x4;
 908     pA0 = fma(pA1, x2, pA0);
 909     // Constant term must come last for precision reasons
 910     pA0 = pA0 + CA0;
 911
 912     res_erf = x * pA0;
 913
 914     // Calculate erfc
 915     y       = abs(x);
 916     maskErf = SimdFloat(0.75F) <= y;
 917     t       = maskzInv(y, maskErf);
 918     w       = t - one;
 919     t2      = t * t;
 920     w2      = w * w;
 921
 922     // No need for a floating-point sieve here (as in erfc), since erf()
 923     // will never return values that are extremely small for large args.
 924     expmx2 = exp(-y * y);
 925
 926     pB1 = fma(CB9, w2, CB7);
 927     pB0 = fma(CB8, w2, CB6);
 928     pB1 = fma(pB1, w2, CB5);
 929     pB0 = fma(pB0, w2, CB4);
 930     pB1 = fma(pB1, w2, CB3);
 931     pB0 = fma(pB0, w2, CB2);
 932     pB1 = fma(pB1, w2, CB1);
 933     pB0 = fma(pB0, w2, CB0);
 934     pB0 = fma(pB1, w, pB0);
 935
 936     pC0 = fma(CC10, t2, CC8);
 937     pC1 = fma(CC9, t2, CC7);
 938     pC0 = fma(pC0, t2, CC6);
 939     pC1 = fma(pC1, t2, CC5);
 940     pC0 = fma(pC0, t2, CC4);
 941     pC1 = fma(pC1, t2, CC3);
 942     pC0 = fma(pC0, t2, CC2);
 943     pC1 = fma(pC1, t2, CC1);
 944
 945     pC0 = fma(pC0, t2, CC0);
 946     pC0 = fma(pC1, t, pC0);
 947     pC0 = pC0 * t;
 948
 949     // Select pB0 or pC0 for erfc()
 950     m        = two < y;
 951     res_erfc = blend(pB0, pC0, m);
 952     res_erfc = res_erfc * expmx2;
 953
 954     // erfc(x<0) = 2-erfc(|x|)
 955     m        = x < setZero();
 956     res_erfc = blend(res_erfc, two - res_erfc, m);
 957
 958     // Select erf() or erfc()
 959     res = blend(res_erf, one - res_erfc, maskErf);
 960
 961     return res;
 962 }
 963
 964 /*! \brief SIMD float erfc(x).
 965  *
 966  * \param x The value to calculate erfc(x) for.
 967  * \result erfc(x)
 968  *
 969  * This routine achieves full precision (bar the last bit) over most of the
 970  * input range, but for large arguments where the result is getting close
 971  * to the minimum representable numbers we accept slightly larger errors
 972  * (think results that are in the ballpark of 10^-30 for single precision)
 973  * since that is not relevant for MD.
 974  */
 975 static inline SimdFloat gmx_simdcall erfc(SimdFloat x)
 976 {
 977     // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
 978     const SimdFloat CA6(7.853861353153693e-5F);
 979     const SimdFloat CA5(-8.010193625184903e-4F);
 980     const SimdFloat CA4(5.188327685732524e-3F);
 981     const SimdFloat CA3(-2.685381193529856e-2F);
 982     const SimdFloat CA2(1.128358514861418e-1F);
 983     const SimdFloat CA1(-3.761262582423300e-1F);
 984     const SimdFloat CA0(1.128379165726710F);
 985     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
 986     const SimdFloat CB9(-0.0018629930017603923F);
 987     const SimdFloat CB8(0.003909821287598495F);
 988     const SimdFloat CB7(-0.0052094582210355615F);
 989     const SimdFloat CB6(0.005685614362160572F);
 990     const SimdFloat CB5(-0.0025367682853477272F);
 991     const SimdFloat CB4(-0.010199799682318782F);
 992     const SimdFloat CB3(0.04369575504816542F);
 993     const SimdFloat CB2(-0.11884063474674492F);
 994     const SimdFloat CB1(0.2732120154030589F);
 995     const SimdFloat CB0(0.42758357702025784F);
 996     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
 997     const SimdFloat CC10(-0.0445555913112064F);
 998     const SimdFloat CC9(0.21376355144663348F);
 999     const SimdFloat CC8(-0.3473187200259257F);
1000     const SimdFloat CC7(0.016690861551248114F);
1001     const SimdFloat CC6(0.7560973182491192F);
1002     const SimdFloat CC5(-1.2137903600145787F);
1003     const SimdFloat CC4(0.8411872321232948F);
1004     const SimdFloat CC3(-0.08670413896296343F);
1005     const SimdFloat CC2(-0.27124782687240334F);
1006     const SimdFloat CC1(-0.0007502488047806069F);
1007     const SimdFloat CC0(0.5642114853803148F);
1008     // Coefficients for expansion of exp(x) in [0,0.1]
1009     // CD0 and CD1 are both 1.0, so no need to declare them separately
1010     const SimdFloat CD2(0.5000066608081202F);
1011     const SimdFloat CD3(0.1664795422874624F);
1012     const SimdFloat CD4(0.04379839977652482F);
1013     const SimdFloat one(1.0F);
1014     const SimdFloat two(2.0F);
1015
1016     /* We need to use a small trick here, since we cannot assume all SIMD
1017      * architectures support integers, and the flag we want (0xfffff000) would
1018      * evaluate to NaN (i.e., it cannot be expressed as a floating-point num).
1019      * Instead, we represent the flags 0xf0f0f000 and 0x0f0f0000 as valid
1020      * fp numbers, and perform a logical or. Since the expression is constant,
1021      * we can at least hope it is evaluated at compile-time.
1022      */
1023 #        if GMX_SIMD_HAVE_LOGICAL
1024     const SimdFloat sieve(SimdFloat(-5.965323564e+29F) | SimdFloat(7.05044434e-30F));
1025 #        else
1026     const int                         isieve = 0xFFFFF000;
1027     alignas(GMX_SIMD_ALIGNMENT) float mem[GMX_SIMD_FLOAT_WIDTH];
1028
1029     union {
1030         float f;
1031         int   i;
1032     } conv;
1033     int i;
1034 #        endif
1035
1036     SimdFloat x2, x4, y;
1037     SimdFloat q, z, t, t2, w, w2;
1038     SimdFloat pA0, pA1, pB0, pB1, pC0, pC1;
1039     SimdFloat expmx2, corr;
1040     SimdFloat res_erf, res_erfc, res;
1041     SimdFBool m, msk_erf;
1042
1043     // Calculate erf()
1044     x2 = x * x;
1045     x4 = x2 * x2;
1046
1047     pA0 = fma(CA6, x4, CA4);
1048     pA1 = fma(CA5, x4, CA3);
1049     pA0 = fma(pA0, x4, CA2);
1050     pA1 = fma(pA1, x4, CA1);
1051     pA1 = pA1 * x2;
1052     pA0 = fma(pA0, x4, pA1);
1053     // Constant term must come last for precision reasons
1054     pA0 = pA0 + CA0;
1055
1056     res_erf = x * pA0;
1057
1058     // Calculate erfc
1059     y       = abs(x);
1060     msk_erf = SimdFloat(0.75F) <= y;
1061     t       = maskzInv(y, msk_erf);
1062     w       = t - one;
1063     t2      = t * t;
1064     w2      = w * w;
1065     /*
1066      * We cannot simply calculate exp(-y2) directly in single precision, since
1067      * that will lose a couple of bits of precision due to the multiplication.
1068      * Instead, we introduce y=z+w, where the last 12 bits of precision are in w.
1069      * Then we get exp(-y2) = exp(-z2)*exp((z-y)*(z+y)).
1070      *
1071      * The only drawback with this is that it requires TWO separate exponential
1072      * evaluations, which would be horrible performance-wise. However, the argument
1073      * for the second exp() call is always small, so there we simply use a
1074      * low-order minimax expansion on [0,0.1].
1075      *
1076      * However, this neat idea requires support for logical ops (and) on
1077      * FP numbers, which some vendors decided isn't necessary in their SIMD
1078      * instruction sets (Hi, IBM VSX!). In principle we could use some tricks
1079      * in double, but we still need memory as a backup when that is not available,
1080      * and this case is rare enough that we go directly there...
1081      */
1082 #        if GMX_SIMD_HAVE_LOGICAL
1083     z = y & sieve;
1084 #        else
1085     store(mem, y);
1086     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1087     {
1088         conv.f = mem[i];
1089         conv.i = conv.i & isieve;
1090         mem[i] = conv.f;
1091     }
1092     z = load<SimdFloat>(mem);
1093 #        endif
1094     q    = (z - y) * (z + y);
1095     corr = fma(CD4, q, CD3);
1096     corr = fma(corr, q, CD2);
1097     corr = fma(corr, q, one);
1098     corr = fma(corr, q, one);
1099
1100     expmx2 = exp(-z * z);
1101     expmx2 = expmx2 * corr;
1102
1103     pB1 = fma(CB9, w2, CB7);
1104     pB0 = fma(CB8, w2, CB6);
1105     pB1 = fma(pB1, w2, CB5);
1106     pB0 = fma(pB0, w2, CB4);
1107     pB1 = fma(pB1, w2, CB3);
1108     pB0 = fma(pB0, w2, CB2);
1109     pB1 = fma(pB1, w2, CB1);
1110     pB0 = fma(pB0, w2, CB0);
1111     pB0 = fma(pB1, w, pB0);
1112
1113     pC0 = fma(CC10, t2, CC8);
1114     pC1 = fma(CC9, t2, CC7);
1115     pC0 = fma(pC0, t2, CC6);
1116     pC1 = fma(pC1, t2, CC5);
1117     pC0 = fma(pC0, t2, CC4);
1118     pC1 = fma(pC1, t2, CC3);
1119     pC0 = fma(pC0, t2, CC2);
1120     pC1 = fma(pC1, t2, CC1);
1121
1122     pC0 = fma(pC0, t2, CC0);
1123     pC0 = fma(pC1, t, pC0);
1124     pC0 = pC0 * t;
1125
1126     // Select pB0 or pC0 for erfc()
1127     m        = two < y;
1128     res_erfc = blend(pB0, pC0, m);
1129     res_erfc = res_erfc * expmx2;
1130
1131     // erfc(x<0) = 2-erfc(|x|)
1132     m        = x < setZero();
1133     res_erfc = blend(res_erfc, two - res_erfc, m);
1134
1135     // Select erf() or erfc()
1136     res = blend(one - res_erf, res_erfc, msk_erf);
1137
1138     return res;
1139 }
1140
1141 /*! \brief SIMD float sin \& cos.
1142  *
1143  * \param x The argument to evaluate sin/cos for
1144  * \param[out] sinval Sin(x)
1145  * \param[out] cosval Cos(x)
1146  *
1147  * This version achieves close to machine precision, but for very large
1148  * magnitudes of the argument we inherently begin to lose accuracy due to the
1149  * argument reduction, despite using extended precision arithmetics internally.
1150  */
1151 static inline void gmx_simdcall sincos(SimdFloat x, SimdFloat* sinval, SimdFloat* cosval)
1152 {
1153     // Constants to subtract Pi/4*x from y while minimizing precision loss
1154     const SimdFloat argred0(-1.5703125);
1155     const SimdFloat argred1(-4.83751296997070312500e-04F);
1156     const SimdFloat argred2(-7.54953362047672271729e-08F);
1157     const SimdFloat argred3(-2.56334406825708960298e-12F);
1158     const SimdFloat two_over_pi(static_cast<float>(2.0F / M_PI));
1159     const SimdFloat const_sin2(-1.9515295891e-4F);
1160     const SimdFloat const_sin1(8.3321608736e-3F);
1161     const SimdFloat const_sin0(-1.6666654611e-1F);
1162     const SimdFloat const_cos2(2.443315711809948e-5F);
1163     const SimdFloat const_cos1(-1.388731625493765e-3F);
1164     const SimdFloat const_cos0(4.166664568298827e-2F);
1165     const SimdFloat half(0.5F);
1166     const SimdFloat one(1.0F);
1167     SimdFloat       ssign, csign;
1168     SimdFloat       x2, y, z, psin, pcos, sss, ccc;
1169     SimdFBool       m;
1170
1171 #        if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
1172     const SimdFInt32 ione(1);
1173     const SimdFInt32 itwo(2);
1174     SimdFInt32       iy;
1175
1176     z  = x * two_over_pi;
1177     iy = cvtR2I(z);
1178     y  = round(z);
1179
1180     m     = cvtIB2B((iy & ione) == SimdFInt32(0));
1181     ssign = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), cvtIB2B((iy & itwo) == itwo));
1182     csign = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
1183 #        else
1184     const SimdFloat quarter(0.25f);
1185     const SimdFloat minusquarter(-0.25f);
1186     SimdFloat       q;
1187     SimdFBool       m1, m2, m3;
1188
1189     /* The most obvious way to find the arguments quadrant in the unit circle
1190      * to calculate the sign is to use integer arithmetic, but that is not
1191      * present in all SIMD implementations. As an alternative, we have devised a
1192      * pure floating-point algorithm that uses truncation for argument reduction
1193      * so that we get a new value 0<=q<1 over the unit circle, and then
1194      * do floating-point comparisons with fractions. This is likely to be
1195      * slightly slower (~10%) due to the longer latencies of floating-point, so
1196      * we only use it when integer SIMD arithmetic is not present.
1197      */
1198     ssign = x;
1199     x     = abs(x);
1200     // It is critical that half-way cases are rounded down
1201     z = fma(x, two_over_pi, half);
1202     y = trunc(z);
1203     q = z * quarter;
1204     q = q - trunc(q);
1205     /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
1206      * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
1207      * This removes the 2*Pi periodicity without using any integer arithmetic.
1208      * First check if y had the value 2 or 3, set csign if true.
1209      */
1210     q = q - half;
1211     /* If we have logical operations we can work directly on the signbit, which
1212      * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
1213      * Thus, if you are altering defines to debug alternative code paths, the
1214      * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
1215      * active or inactive - you will get errors if only one is used.
1216      */
1217 #            if GMX_SIMD_HAVE_LOGICAL
1218     ssign = ssign & SimdFloat(GMX_FLOAT_NEGZERO);
1219     csign = andNot(q, SimdFloat(GMX_FLOAT_NEGZERO));
1220     ssign = ssign ^ csign;
1221 #            else
1222     ssign = copysign(SimdFloat(1.0f), ssign);
1223     csign = copysign(SimdFloat(1.0f), q);
1224     csign = -csign;
1225     ssign = ssign * csign; // swap ssign if csign was set.
1226 #            endif
1227     // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
1228     m1 = (q < minusquarter);
1229     m2 = (setZero() <= q);
1230     m3 = (q < quarter);
1231     m2 = m2 && m3;
1232     m  = m1 || m2;
1233     // where mask is FALSE, swap sign.
1234     csign   = csign * blend(SimdFloat(-1.0f), one, m);
1235 #        endif
1236     x  = fma(y, argred0, x);
1237     x  = fma(y, argred1, x);
1238     x  = fma(y, argred2, x);
1239     x  = fma(y, argred3, x);
1240     x2 = x * x;
1241
1242     psin = fma(const_sin2, x2, const_sin1);
1243     psin = fma(psin, x2, const_sin0);
1244     psin = fma(psin, x * x2, x);
1245     pcos = fma(const_cos2, x2, const_cos1);
1246     pcos = fma(pcos, x2, const_cos0);
1247     pcos = fms(pcos, x2, half);
1248     pcos = fma(pcos, x2, one);
1249
1250     sss = blend(pcos, psin, m);
1251     ccc = blend(psin, pcos, m);
1252     // See comment for GMX_SIMD_HAVE_LOGICAL section above.
1253 #        if GMX_SIMD_HAVE_LOGICAL
1254     *sinval = sss ^ ssign;
1255     *cosval = ccc ^ csign;
1256 #        else
1257     *sinval = sss * ssign;
1258     *cosval = ccc * csign;
1259 #        endif
1260 }
1261
1262 /*! \brief SIMD float sin(x).
1263  *
1264  * \param x The argument to evaluate sin for
1265  * \result Sin(x)
1266  *
1267  * \attention Do NOT call both sin & cos if you need both results, since each of them
1268  * will then call \ref sincos and waste a factor 2 in performance.
1269  */
1270 static inline SimdFloat gmx_simdcall sin(SimdFloat x)
1271 {
1272     SimdFloat s, c;
1273     sincos(x, &s, &c);
1274     return s;
1275 }
1276
1277 /*! \brief SIMD float cos(x).
1278  *
1279  * \param x The argument to evaluate cos for
1280  * \result Cos(x)
1281  *
1282  * \attention Do NOT call both sin & cos if you need both results, since each of them
1283  * will then call \ref sincos and waste a factor 2 in performance.
1284  */
1285 static inline SimdFloat gmx_simdcall cos(SimdFloat x)
1286 {
1287     SimdFloat s, c;
1288     sincos(x, &s, &c);
1289     return c;
1290 }
1291
1292 /*! \brief SIMD float tan(x).
1293  *
1294  * \param x The argument to evaluate tan for
1295  * \result Tan(x)
1296  */
1297 static inline SimdFloat gmx_simdcall tan(SimdFloat x)
1298 {
1299     const SimdFloat argred0(-1.5703125);
1300     const SimdFloat argred1(-4.83751296997070312500e-04F);
1301     const SimdFloat argred2(-7.54953362047672271729e-08F);
1302     const SimdFloat argred3(-2.56334406825708960298e-12F);
1303     const SimdFloat two_over_pi(static_cast<float>(2.0F / M_PI));
1304     const SimdFloat CT6(0.009498288995810566122993911);
1305     const SimdFloat CT5(0.002895755790837379295226923);
1306     const SimdFloat CT4(0.02460087336161924491836265);
1307     const SimdFloat CT3(0.05334912882656359828045988);
1308     const SimdFloat CT2(0.1333989091464957704418495);
1309     const SimdFloat CT1(0.3333307599244198227797507);
1310
1311     SimdFloat x2, p, y, z;
1312     SimdFBool m;
1313
1314 #        if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
1315     SimdFInt32 iy;
1316     SimdFInt32 ione(1);
1317
1318     z  = x * two_over_pi;
1319     iy = cvtR2I(z);
1320     y  = round(z);
1321     m  = cvtIB2B((iy & ione) == ione);
1322
1323     x = fma(y, argred0, x);
1324     x = fma(y, argred1, x);
1325     x = fma(y, argred2, x);
1326     x = fma(y, argred3, x);
1327     x = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), m) ^ x;
1328 #        else
1329     const SimdFloat quarter(0.25f);
1330     const SimdFloat half(0.5f);
1331     const SimdFloat threequarter(0.75f);
1332     SimdFloat       w, q;
1333     SimdFBool       m1, m2, m3;
1334
1335     w     = abs(x);
1336     z     = fma(w, two_over_pi, half);
1337     y     = trunc(z);
1338     q     = z * quarter;
1339     q     = q - trunc(q);
1340     m1    = quarter <= q;
1341     m2    = q < half;
1342     m3    = threequarter <= q;
1343     m1    = m1 && m2;
1344     m     = m1 || m3;
1345     w     = fma(y, argred0, w);
1346     w     = fma(y, argred1, w);
1347     w     = fma(y, argred2, w);
1348     w     = fma(y, argred3, w);
1349     w     = blend(w, -w, m);
1350     x     = w * copysign(SimdFloat(1.0), x);
1351 #        endif
1352     x2 = x * x;
1353     p  = fma(CT6, x2, CT5);
1354     p  = fma(p, x2, CT4);
1355     p  = fma(p, x2, CT3);
1356     p  = fma(p, x2, CT2);
1357     p  = fma(p, x2, CT1);
1358     p  = fma(x2 * p, x, x);
1359
1360     p = blend(p, maskzInv(p, m), m);
1361     return p;
1362 }
1363
1364 /*! \brief SIMD float asin(x).
1365  *
1366  * \param x The argument to evaluate asin for
1367  * \result Asin(x)
1368  */
1369 static inline SimdFloat gmx_simdcall asin(SimdFloat x)
1370 {
1371     const SimdFloat limitlow(1e-4F);
1372     const SimdFloat half(0.5F);
1373     const SimdFloat one(1.0F);
1374     const SimdFloat halfpi(static_cast<float>(M_PI / 2.0F));
1375     const SimdFloat CC5(4.2163199048E-2F);
1376     const SimdFloat CC4(2.4181311049E-2F);
1377     const SimdFloat CC3(4.5470025998E-2F);
1378     const SimdFloat CC2(7.4953002686E-2F);
1379     const SimdFloat CC1(1.6666752422E-1F);
1380     SimdFloat       xabs;
1381     SimdFloat       z, z1, z2, q, q1, q2;
1382     SimdFloat       pA, pB;
1383     SimdFBool       m, m2;
1384
1385     xabs = abs(x);
1386     m    = half < xabs;
1387     z1   = half * (one - xabs);
1388     m2   = xabs < one;
1389     q1   = z1 * maskzInvsqrt(z1, m2);
1390     q2   = xabs;
1391     z2   = q2 * q2;
1392     z    = blend(z2, z1, m);
1393     q    = blend(q2, q1, m);
1394
1395     z2 = z * z;
1396     pA = fma(CC5, z2, CC3);
1397     pB = fma(CC4, z2, CC2);
1398     pA = fma(pA, z2, CC1);
1399     pA = pA * z;
1400     z  = fma(pB, z2, pA);
1401     z  = fma(z, q, q);
1402     q2 = halfpi - z;
1403     q2 = q2 - z;
1404     z  = blend(z, q2, m);
1405
1406     m = limitlow < xabs;
1407     z = blend(xabs, z, m);
1408     z = copysign(z, x);
1409
1410     return z;
1411 }
1412
1413 /*! \brief SIMD float acos(x).
1414  *
1415  * \param x The argument to evaluate acos for
1416  * \result Acos(x)
1417  */
1418 static inline SimdFloat gmx_simdcall acos(SimdFloat x)
1419 {
1420     const SimdFloat one(1.0F);
1421     const SimdFloat half(0.5F);
1422     const SimdFloat pi(static_cast<float>(M_PI));
1423     const SimdFloat halfpi(static_cast<float>(M_PI / 2.0F));
1424     SimdFloat       xabs;
1425     SimdFloat       z, z1, z2, z3;
1426     SimdFBool       m1, m2, m3;
1427
1428     xabs = abs(x);
1429     m1   = half < xabs;
1430     m2   = setZero() < x;
1431
1432     z  = fnma(half, xabs, half);
1433     m3 = xabs < one;
1434     z  = z * maskzInvsqrt(z, m3);
1435     z  = blend(x, z, m1);
1436     z  = asin(z);
1437
1438     z2 = z + z;
1439     z1 = pi - z2;
1440     z3 = halfpi - z;
1441     z  = blend(z1, z2, m2);
1442     z  = blend(z3, z, m1);
1443
1444     return z;
1445 }
1446
1447 /*! \brief SIMD float asin(x).
1448  *
1449  * \param x The argument to evaluate atan for
1450  * \result Atan(x), same argument/value range as standard math library.
1451  */
1452 static inline SimdFloat gmx_simdcall atan(SimdFloat x)
1453 {
1454     const SimdFloat halfpi(static_cast<float>(M_PI / 2.0F));
1455     const SimdFloat CA17(0.002823638962581753730774F);
1456     const SimdFloat CA15(-0.01595690287649631500244F);
1457     const SimdFloat CA13(0.04250498861074447631836F);
1458     const SimdFloat CA11(-0.07489009201526641845703F);
1459     const SimdFloat CA9(0.1063479334115982055664F);
1460     const SimdFloat CA7(-0.1420273631811141967773F);
1461     const SimdFloat CA5(0.1999269574880599975585F);
1462     const SimdFloat CA3(-0.3333310186862945556640F);
1463     const SimdFloat one(1.0F);
1464     SimdFloat       x2, x3, x4, pA, pB;
1465     SimdFBool       m, m2;
1466
1467     m  = x < setZero();
1468     x  = abs(x);
1469     m2 = one < x;
1470     x  = blend(x, maskzInv(x, m2), m2);
1471
1472     x2 = x * x;
1473     x3 = x2 * x;
1474     x4 = x2 * x2;
1475     pA = fma(CA17, x4, CA13);
1476     pB = fma(CA15, x4, CA11);
1477     pA = fma(pA, x4, CA9);
1478     pB = fma(pB, x4, CA7);
1479     pA = fma(pA, x4, CA5);
1480     pB = fma(pB, x4, CA3);
1481     pA = fma(pA, x2, pB);
1482     pA = fma(pA, x3, x);
1483
1484     pA = blend(pA, halfpi - pA, m2);
1485     pA = blend(pA, -pA, m);
1486
1487     return pA;
1488 }
1489
1490 /*! \brief SIMD float atan2(y,x).
1491  *
1492  * \param y Y component of vector, any quartile
1493  * \param x X component of vector, any quartile
1494  * \result Atan(y,x), same argument/value range as standard math library.
1495  *
1496  * \note This routine should provide correct results for all finite
1497  * non-zero or positive-zero arguments. However, negative zero arguments will
1498  * be treated as positive zero, which means the return value will deviate from
1499  * the standard math library atan2(y,x) for those cases. That should not be
1500  * of any concern in Gromacs, and in particular it will not affect calculations
1501  * of angles from vectors.
1502  */
1503 static inline SimdFloat gmx_simdcall atan2(SimdFloat y, SimdFloat x)
1504 {
1505     const SimdFloat pi(static_cast<float>(M_PI));
1506     const SimdFloat halfpi(static_cast<float>(M_PI / 2.0));
1507     SimdFloat       xinv, p, aoffset;
1508     SimdFBool       mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
1509
1510     mask_xnz  = x != setZero();
1511     mask_ynz  = y != setZero();
1512     mask_xlt0 = x < setZero();
1513     mask_ylt0 = y < setZero();
1514
1515     aoffset = selectByNotMask(halfpi, mask_xnz);
1516     aoffset = selectByMask(aoffset, mask_ynz);
1517
1518     aoffset = blend(aoffset, pi, mask_xlt0);
1519     aoffset = blend(aoffset, -aoffset, mask_ylt0);
1520
1521     xinv = maskzInv(x, mask_xnz);
1522     p    = y * xinv;
1523     p    = atan(p);
1524     p    = p + aoffset;
1525
1526     return p;
1527 }
1528
1529 /*! \brief Calculate the force correction due to PME analytically in SIMD float.
1530  *
1531  * \param z2 \f$(r \beta)^2\f$ - see below for details.
1532  * \result Correction factor to coulomb force - see below for details.
1533  *
1534  * This routine is meant to enable analytical evaluation of the
1535  * direct-space PME electrostatic force to avoid tables.
1536  *
1537  * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
1538  * are some problems evaluating that:
1539  *
1540  * First, the error function is difficult (read: expensive) to
1541  * approxmiate accurately for intermediate to large arguments, and
1542  * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
1543  * Second, we now try to avoid calculating potentials in Gromacs but
1544  * use forces directly.
1545  *
1546  * We can simply things slight by noting that the PME part is really
1547  * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
1548  * \f[
1549  * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
1550  * \f]
1551  * The first term we already have from the inverse square root, so
1552  * that we can leave out of this routine.
1553  *
1554  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1555  * the argument \f$beta r\f$ will be in the range 0.15 to ~4, which is
1556  * the range used for the minimax fit. Use your favorite plotting program
1557  * to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is in this range!
1558  *
1559  * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
1560  * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
1561  * then only use even powers. This is another minor optimization, since
1562  * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
1563  * the vector between the two atoms to get the vectorial force. The
1564  * fastest flops are the ones we can avoid calculating!
1565  *
1566  * So, here's how it should be used:
1567  *
1568  * 1. Calculate \f$r^2\f$.
1569  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
1570  * 3. Evaluate this routine with \f$z^2\f$ as the argument.
1571  * 4. The return value is the expression:
1572  *
1573  * \f[
1574  *    \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
1575  * \f]
1576  *
1577  * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
1578  *
1579  *  \f[
1580  *    \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
1581  *  \f]
1582  *
1583  *    or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
1584  *
1585  *  \f[
1586  *    \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
1587  *  \f]
1588  *
1589  *    With a bit of math exercise you should be able to confirm that
1590  *    this is exactly
1591  *
1592  *  \f[
1593  *   \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
1594  *  \f]
1595  *
1596  * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
1597  *    and you have your force (divided by \f$r\f$). A final multiplication
1598  *    with the vector connecting the two particles and you have your
1599  *    vectorial force to add to the particles.
1600  *
1601  * This approximation achieves an error slightly lower than 1e-6
1602  * in single precision and 1e-11 in double precision
1603  * for arguments smaller than 16 (\f$\beta r \leq 4 \f$);
1604  * when added to \f$1/r\f$ the error will be insignificant.
1605  * For \f$\beta r \geq 7206\f$ the return value can be inf or NaN.
1606  *
1607  */
1608 static inline SimdFloat gmx_simdcall pmeForceCorrection(SimdFloat z2)
1609 {
1610     const SimdFloat FN6(-1.7357322914161492954e-8F);
1611     const SimdFloat FN5(1.4703624142580877519e-6F);
1612     const SimdFloat FN4(-0.000053401640219807709149F);
1613     const SimdFloat FN3(0.0010054721316683106153F);
1614     const SimdFloat FN2(-0.019278317264888380590F);
1615     const SimdFloat FN1(0.069670166153766424023F);
1616     const SimdFloat FN0(-0.75225204789749321333F);
1617
1618     const SimdFloat FD4(0.0011193462567257629232F);
1619     const SimdFloat FD3(0.014866955030185295499F);
1620     const SimdFloat FD2(0.11583842382862377919F);
1621     const SimdFloat FD1(0.50736591960530292870F);
1622     const SimdFloat FD0(1.0F);
1623
1624     SimdFloat z4;
1625     SimdFloat polyFN0, polyFN1, polyFD0, polyFD1;
1626
1627     z4 = z2 * z2;
1628
1629     polyFD0 = fma(FD4, z4, FD2);
1630     polyFD1 = fma(FD3, z4, FD1);
1631     polyFD0 = fma(polyFD0, z4, FD0);
1632     polyFD0 = fma(polyFD1, z2, polyFD0);
1633
1634     polyFD0 = inv(polyFD0);
1635
1636     polyFN0 = fma(FN6, z4, FN4);
1637     polyFN1 = fma(FN5, z4, FN3);
1638     polyFN0 = fma(polyFN0, z4, FN2);
1639     polyFN1 = fma(polyFN1, z4, FN1);
1640     polyFN0 = fma(polyFN0, z4, FN0);
1641     polyFN0 = fma(polyFN1, z2, polyFN0);
1642
1643     return polyFN0 * polyFD0;
1644 }
1645
1646
1647 /*! \brief Calculate the potential correction due to PME analytically in SIMD float.
1648  *
1649  * \param z2 \f$(r \beta)^2\f$ - see below for details.
1650  * \result Correction factor to coulomb potential - see below for details.
1651  *
1652  * See \ref pmeForceCorrection for details about the approximation.
1653  *
1654  * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
1655  * as the input argument.
1656  *
1657  * Here's how it should be used:
1658  *
1659  * 1. Calculate \f$r^2\f$.
1660  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
1661  * 3. Evaluate this routine with z^2 as the argument.
1662  * 4. The return value is the expression:
1663  *
1664  *  \f[
1665  *   \frac{\mbox{erf}(z)}{z}
1666  *  \f]
1667  *
1668  * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
1669  *
1670  *  \f[
1671  *    \frac{\mbox{erf}(r \beta)}{r}
1672  *  \f]
1673  *
1674  * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
1675  *    and you have your potential.
1676  *
1677  * This approximation achieves an error slightly lower than 1e-6
1678  * in single precision and 4e-11 in double precision
1679  * for arguments smaller than 16 (\f$ 0.15 \leq \beta r \leq 4 \f$);
1680  * for \f$ \beta r \leq 0.15\f$ the error can be twice as high;
1681  * when added to \f$1/r\f$ the error will be insignificant.
1682  * For \f$\beta r \geq 7142\f$ the return value can be inf or NaN.
1683  */
1684 static inline SimdFloat gmx_simdcall pmePotentialCorrection(SimdFloat z2)
1685 {
1686     const SimdFloat VN6(1.9296833005951166339e-8F);
1687     const SimdFloat VN5(-1.4213390571557850962e-6F);
1688     const SimdFloat VN4(0.000041603292906656984871F);
1689     const SimdFloat VN3(-0.00013134036773265025626F);
1690     const SimdFloat VN2(0.038657983986041781264F);
1691     const SimdFloat VN1(0.11285044772717598220F);
1692     const SimdFloat VN0(1.1283802385263030286F);
1693
1694     const SimdFloat VD3(0.0066752224023576045451F);
1695     const SimdFloat VD2(0.078647795836373922256F);
1696     const SimdFloat VD1(0.43336185284710920150F);
1697     const SimdFloat VD0(1.0F);
1698
1699     SimdFloat z4;
1700     SimdFloat polyVN0, polyVN1, polyVD0, polyVD1;
1701
1702     z4 = z2 * z2;
1703
1704     polyVD1 = fma(VD3, z4, VD1);
1705     polyVD0 = fma(VD2, z4, VD0);
1706     polyVD0 = fma(polyVD1, z2, polyVD0);
1707
1708     polyVD0 = inv(polyVD0);
1709
1710     polyVN0 = fma(VN6, z4, VN4);
1711     polyVN1 = fma(VN5, z4, VN3);
1712     polyVN0 = fma(polyVN0, z4, VN2);
1713     polyVN1 = fma(polyVN1, z4, VN1);
1714     polyVN0 = fma(polyVN0, z4, VN0);
1715     polyVN0 = fma(polyVN1, z2, polyVN0);
1716
1717     return polyVN0 * polyVD0;
1718 }
1719 #    endif
1720
1721 /*! \} */
1722
1723 #    if GMX_SIMD_HAVE_DOUBLE
1724
1725
1726 /*! \name Double precision SIMD math functions
1727  *
1728  *  \note In most cases you should use the real-precision functions instead.
1729  *  \{
1730  */
1731
1732 /****************************************
1733  * DOUBLE PRECISION SIMD MATH FUNCTIONS *
1734  ****************************************/
1735
1736 #        if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_DOUBLE
1737 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
1738  *
1739  * \param x Values to set sign for
1740  * \param y Values used to set sign
1741  * \return  Magnitude of x, sign of y
1742  */
1743 static inline SimdDouble gmx_simdcall copysign(SimdDouble x, SimdDouble y)
1744 {
1745 #            if GMX_SIMD_HAVE_LOGICAL
1746     return abs(x) | (SimdDouble(GMX_DOUBLE_NEGZERO) & y);
1747 #            else
1748     return blend(abs(x), -abs(x), (y < setZero()));
1749 #            endif
1750 }
1751 #        endif
1752
1753 #        if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_DOUBLE
1754 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD double.
1755  *
1756  * This is a low-level routine that should only be used by SIMD math routine
1757  * that evaluates the inverse square root.
1758  *
1759  *  \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
1760  *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
1761  *  \return   An improved approximation with roughly twice as many bits of accuracy.
1762  */
1763 static inline SimdDouble gmx_simdcall rsqrtIter(SimdDouble lu, SimdDouble x)
1764 {
1765     SimdDouble tmp1 = x * lu;
1766     SimdDouble tmp2 = SimdDouble(-0.5) * lu;
1767     tmp1            = fma(tmp1, lu, SimdDouble(-3.0));
1768     return tmp1 * tmp2;
1769 }
1770 #        endif
1771
1772 /*! \brief Calculate 1/sqrt(x) for SIMD double.
1773  *
1774  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
1775  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
1776  *           For the single precision implementation this is obviously always
1777  *           true for positive values, but for double precision it adds an
1778  *           extra restriction since the first lookup step might have to be
1779  *           performed in single precision on some architectures. Note that the
1780  *           responsibility for checking falls on you - this routine does not
1781  *           check arguments.
1782  *
1783  *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
1784  */
1785 static inline SimdDouble gmx_simdcall invsqrt(SimdDouble x)
1786 {
1787     SimdDouble lu = rsqrt(x);
1788 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1789     lu = rsqrtIter(lu, x);
1790 #        endif
1791 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1792     lu = rsqrtIter(lu, x);
1793 #        endif
1794 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1795     lu = rsqrtIter(lu, x);
1796 #        endif
1797 #        if (GMX_SIMD_RSQRT_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1798     lu = rsqrtIter(lu, x);
1799 #        endif
1800     return lu;
1801 }
1802
1803 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles.
1804  *
1805  * \param x0  First set of arguments, x0 must be in single range (see below).
1806  * \param x1  Second set of arguments, x1 must be in single range (see below).
1807  * \param[out] out0  Result 1/sqrt(x0)
1808  * \param[out] out1  Result 1/sqrt(x1)
1809  *
1810  *  In particular for double precision we can sometimes calculate square root
1811  *  pairs slightly faster by using single precision until the very last step.
1812  *
1813  * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
1814  *       GMX_FLOAT_MAX, i.e. within the range of single precision.
1815  *       For the single precision implementation this is obviously always
1816  *       true for positive values, but for double precision it adds an
1817  *       extra restriction since the first lookup step might have to be
1818  *       performed in single precision on some architectures. Note that the
1819  *       responsibility for checking falls on you - this routine does not
1820  *       check arguments.
1821  */
1822 static inline void gmx_simdcall invsqrtPair(SimdDouble x0, SimdDouble x1, SimdDouble* out0, SimdDouble* out1)
1823 {
1824 #        if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2 * GMX_SIMD_DOUBLE_WIDTH) \
1825                 && (GMX_SIMD_RSQRT_BITS < 22)
1826     SimdFloat  xf  = cvtDD2F(x0, x1);
1827     SimdFloat  luf = rsqrt(xf);
1828     SimdDouble lu0, lu1;
1829     // Intermediate target is single - mantissa+1 bits
1830 #            if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
1831     luf = rsqrtIter(luf, xf);
1832 #            endif
1833 #            if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1834     luf = rsqrtIter(luf, xf);
1835 #            endif
1836 #            if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1837     luf = rsqrtIter(luf, xf);
1838 #            endif
1839     cvtF2DD(luf, &lu0, &lu1);
1840     // Last iteration(s) performed in double - if we had 22 bits, this gets us to 44 (~1e-15)
1841 #            if (GMX_SIMD_ACCURACY_BITS_SINGLE < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1842     lu0 = rsqrtIter(lu0, x0);
1843     lu1 = rsqrtIter(lu1, x1);
1844 #            endif
1845 #            if (GMX_SIMD_ACCURACY_BITS_SINGLE * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1846     lu0 = rsqrtIter(lu0, x0);
1847     lu1 = rsqrtIter(lu1, x1);
1848 #            endif
1849     *out0 = lu0;
1850     *out1 = lu1;
1851 #        else
1852     *out0 = invsqrt(x0);
1853     *out1 = invsqrt(x1);
1854 #        endif
1855 }
1856
1857 #        if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_DOUBLE
1858 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD double.
1859  *
1860  * This is a low-level routine that should only be used by SIMD math routine
1861  * that evaluates the reciprocal.
1862  *
1863  *  \param lu Approximation of 1/x, typically obtained from lookup.
1864  *  \param x  The reference (starting) value x for which we want 1/x.
1865  *  \return   An improved approximation with roughly twice as many bits of accuracy.
1866  */
1867 static inline SimdDouble gmx_simdcall rcpIter(SimdDouble lu, SimdDouble x)
1868 {
1869     return lu * fnma(lu, x, SimdDouble(2.0));
1870 }
1871 #        endif
1872
1873 /*! \brief Calculate 1/x for SIMD double.
1874  *
1875  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
1876  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
1877  *           For the single precision implementation this is obviously always
1878  *           true for positive values, but for double precision it adds an
1879  *           extra restriction since the first lookup step might have to be
1880  *           performed in single precision on some architectures. Note that the
1881  *           responsibility for checking falls on you - this routine does not
1882  *           check arguments.
1883  *
1884  *  \return 1/x. Result is undefined if your argument was invalid.
1885  */
1886 static inline SimdDouble gmx_simdcall inv(SimdDouble x)
1887 {
1888     SimdDouble lu = rcp(x);
1889 #        if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1890     lu = rcpIter(lu, x);
1891 #        endif
1892 #        if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1893     lu = rcpIter(lu, x);
1894 #        endif
1895 #        if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1896     lu = rcpIter(lu, x);
1897 #        endif
1898 #        if (GMX_SIMD_RCP_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1899     lu = rcpIter(lu, x);
1900 #        endif
1901     return lu;
1902 }
1903
1904 /*! \brief Division for SIMD doubles
1905  *
1906  * \param nom    Nominator
1907  * \param denom  Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
1908  *               For single precision this is equivalent to a nonzero argument,
1909  *               but in double precision it adds an extra restriction since
1910  *               the first lookup step might have to be performed in single
1911  *               precision on some architectures. Note that the responsibility
1912  *               for checking falls on you - this routine does not check arguments.
1913  *
1914  * \return nom/denom
1915  *
1916  * \note This function does not use any masking to avoid problems with
1917  *       zero values in the denominator.
1918  */
1919 static inline SimdDouble gmx_simdcall operator/(SimdDouble nom, SimdDouble denom)
1920 {
1921     return nom * inv(denom);
1922 }
1923
1924
1925 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD double.
1926  *
1927  *  This routine only evaluates 1/sqrt(x) for elements for which mask is true.
1928  *  Illegal values in the masked-out elements will not lead to
1929  *  floating-point exceptions.
1930  *
1931  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
1932  *           GMX_FLOAT_MAX for masked-in entries.
1933  *           See \ref invsqrt for the discussion about argument restrictions.
1934  *  \param m Mask
1935  *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
1936  *          entry was not masked, and 0.0 for masked-out entries.
1937  */
1938 static inline SimdDouble maskzInvsqrt(SimdDouble x, SimdDBool m)
1939 {
1940     SimdDouble lu = maskzRsqrt(x, m);
1941 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1942     lu = rsqrtIter(lu, x);
1943 #        endif
1944 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1945     lu = rsqrtIter(lu, x);
1946 #        endif
1947 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1948     lu = rsqrtIter(lu, x);
1949 #        endif
1950 #        if (GMX_SIMD_RSQRT_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1951     lu = rsqrtIter(lu, x);
1952 #        endif
1953     return lu;
1954 }
1955
1956 /*! \brief Calculate 1/x for SIMD double, masked version.
1957  *
1958  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
1959  *           GMX_FLOAT_MAX for masked-in entries.
1960  *           See \ref invsqrt for the discussion about argument restrictions.
1961  *  \param m Mask
1962  *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
1963  */
1964 static inline SimdDouble gmx_simdcall maskzInv(SimdDouble x, SimdDBool m)
1965 {
1966     SimdDouble lu = maskzRcp(x, m);
1967 #        if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1968     lu = rcpIter(lu, x);
1969 #        endif
1970 #        if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1971     lu = rcpIter(lu, x);
1972 #        endif
1973 #        if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1974     lu = rcpIter(lu, x);
1975 #        endif
1976 #        if (GMX_SIMD_RCP_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1977     lu = rcpIter(lu, x);
1978 #        endif
1979     return lu;
1980 }
1981
1982
1983 /*! \brief Calculate sqrt(x) for SIMD doubles.
1984  *
1985  *  \copydetails sqrt(SimdFloat)
1986  */
1987 template<MathOptimization opt = MathOptimization::Safe>
1988 static inline SimdDouble gmx_simdcall sqrt(SimdDouble x)
1989 {
1990     if (opt == MathOptimization::Safe)
1991     {
1992         // As we might use a float version of rsqrt, we mask out small values
1993         SimdDouble res = maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) < x);
1994         return res * x;
1995     }
1996     else
1997     {
1998         return x * invsqrt(x);
1999     }
2000 }
2001
2002 /*! \brief Cube root for SIMD doubles
2003  *
2004  * \param x      Argument to calculate cube root of. Can be negative or zero,
2005  *               but NaN or Inf values are not supported. Denormal values will
2006  *               be treated as 0.0.
2007  * \return       Cube root of x.
2008  */
2009 static inline SimdDouble gmx_simdcall cbrt(SimdDouble x)
2010 {
2011     const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
2012     const SimdDouble minDouble(std::numeric_limits<double>::min());
2013     // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
2014     // To avoid clang warnings about fragile integer division mixed with FP, we let
2015     // the divided value (1023/3=341) be the original constant.
2016     const std::int32_t offsetDiv3(341);
2017     const SimdDouble   c6(-0.145263899385486377);
2018     const SimdDouble   c5(0.784932344976639262);
2019     const SimdDouble   c4(-1.83469277483613086);
2020     const SimdDouble   c3(2.44693122563534430);
2021     const SimdDouble   c2(-2.11499494167371287);
2022     const SimdDouble   c1(1.50819193781584896);
2023     const SimdDouble   c0(0.354895765043919860);
2024     const SimdDouble   one(1.0);
2025     const SimdDouble   two(2.0);
2026     const SimdDouble   three(3.0);
2027     const SimdDouble   oneThird(1.0 / 3.0);
2028     const SimdDouble   cbrt2(1.2599210498948731648);
2029     const SimdDouble   sqrCbrt2(1.5874010519681994748);
2030
2031     // See the single precision routines for documentation of the algorithm
2032
2033     SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
2034     SimdDouble xAbs     = andNot(signBit, x);    // select everthing but the sign bit => abs(x)
2035     SimdDBool  xIsNonZero = (minDouble <= xAbs); // treat denormals as 0
2036
2037     SimdDInt32 exponent;
2038     SimdDouble y        = frexp(xAbs, &exponent);
2039     SimdDouble z        = fma(y, c6, c5);
2040     z                   = fma(z, y, c4);
2041     z                   = fma(z, y, c3);
2042     z                   = fma(z, y, c2);
2043     z                   = fma(z, y, c1);
2044     z                   = fma(z, y, c0);
2045     SimdDouble w        = z * z * z;
2046     SimdDouble nom      = z * fma(two, y, w);
2047     SimdDouble invDenom = inv(fma(two, w, y));
2048
2049     SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
2050     SimdDouble offsetExpDiv3 =
2051             trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
2052     SimdDInt32 expDiv3   = cvtR2I(offsetExpDiv3 - SimdDouble(static_cast<double>(offsetDiv3)));
2053     SimdDouble remainder = offsetExp - offsetExpDiv3 * three;
2054     SimdDouble factor    = blend(one, cbrt2, SimdDouble(0.5) < remainder);
2055     factor               = blend(factor, sqrCbrt2, SimdDouble(1.5) < remainder);
2056     SimdDouble fraction  = (nom * invDenom * factor) ^ xSignBit;
2057     SimdDouble result    = selectByMask(ldexp(fraction, expDiv3), xIsNonZero);
2058     return result;
2059 }
2060
2061 /*! \brief Inverse cube root for SIMD doubles.
2062  *
2063  * \param x      Argument to calculate cube root of. Can be positive or
2064  *               negative, but the magnitude cannot be lower than
2065  *               the smallest normal number.
2066  * \return       Cube root of x. Undefined for values that don't
2067  *               fulfill the restriction of abs(x) > minDouble.
2068  */
2069 static inline SimdDouble gmx_simdcall invcbrt(SimdDouble x)
2070 {
2071     const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
2072     // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
2073     // To avoid clang warnings about fragile integer division mixed with FP, we let
2074     // the divided value (1023/3=341) be the original constant.
2075     const std::int32_t offsetDiv3(341);
2076     const SimdDouble   c6(-0.145263899385486377);
2077     const SimdDouble   c5(0.784932344976639262);
2078     const SimdDouble   c4(-1.83469277483613086);
2079     const SimdDouble   c3(2.44693122563534430);
2080     const SimdDouble   c2(-2.11499494167371287);
2081     const SimdDouble   c1(1.50819193781584896);
2082     const SimdDouble   c0(0.354895765043919860);
2083     const SimdDouble   one(1.0);
2084     const SimdDouble   two(2.0);
2085     const SimdDouble   three(3.0);
2086     const SimdDouble   oneThird(1.0 / 3.0);
2087     const SimdDouble   invCbrt2(1.0 / 1.2599210498948731648);
2088     const SimdDouble   invSqrCbrt2(1.0F / 1.5874010519681994748);
2089
2090     // See the single precision routines for documentation of the algorithm
2091
2092     SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
2093     SimdDouble xAbs     = andNot(signBit, x); // select everthing but the sign bit => abs(x)
2094
2095     SimdDInt32 exponent;
2096     SimdDouble y         = frexp(xAbs, &exponent);
2097     SimdDouble z         = fma(y, c6, c5);
2098     z                    = fma(z, y, c4);
2099     z                    = fma(z, y, c3);
2100     z                    = fma(z, y, c2);
2101     z                    = fma(z, y, c1);
2102     z                    = fma(z, y, c0);
2103     SimdDouble w         = z * z * z;
2104     SimdDouble nom       = fma(two, w, y);
2105     SimdDouble invDenom  = inv(z * fma(two, y, w));
2106     SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
2107     SimdDouble offsetExpDiv3 =
2108             trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
2109     SimdDInt32 expDiv3   = cvtR2I(SimdDouble(static_cast<double>(offsetDiv3)) - offsetExpDiv3);
2110     SimdDouble remainder = offsetExpDiv3 * three - offsetExp;
2111     SimdDouble factor    = blend(one, invCbrt2, remainder < SimdDouble(-0.5));
2112     factor               = blend(factor, invSqrCbrt2, remainder < SimdDouble(-1.5));
2113     SimdDouble fraction  = (nom * invDenom * factor) ^ xSignBit;
2114     SimdDouble result    = ldexp(fraction, expDiv3);
2115     return result;
2116 }
2117
2118 /*! \brief SIMD double log2(x). This is the base-2 logarithm.
2119  *
2120  * \param x Argument, should be >0.
2121  * \result The base-2 logarithm of x. Undefined if argument is invalid.
2122  */
2123 static inline SimdDouble gmx_simdcall log2(SimdDouble x)
2124 {
2125 #        if GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
2126     // Just rescale if native log2() is not present, but log is.
2127     return log(x) * SimdDouble(std::log2(std::exp(1.0)));
2128 #        else
2129     const SimdDouble one(1.0);
2130     const SimdDouble two(2.0);
2131     const SimdDouble invsqrt2(1.0 / std::sqrt(2.0));
2132     const SimdDouble CL15(0.2138031565795550370534528);
2133     const SimdDouble CL13(0.2208884091496370882801159);
2134     const SimdDouble CL11(0.2623358279761824340958754);
2135     const SimdDouble CL9(0.3205984930182496084327681);
2136     const SimdDouble CL7(0.4121985864521960363227038);
2137     const SimdDouble CL5(0.5770780163410746954610886);
2138     const SimdDouble CL3(0.9617966939260027547931031);
2139     const SimdDouble CL1(2.885390081777926774009302);
2140     SimdDouble       fExp, x2, p;
2141     SimdDBool        m;
2142     SimdDInt32       iExp;
2143
2144     x    = frexp(x, &iExp);
2145     fExp = cvtI2R(iExp);
2146
2147     m = x < invsqrt2;
2148     // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
2149     fExp = fExp - selectByMask(one, m);
2150     x    = x * blend(one, two, m);
2151
2152     x  = (x - one) * inv(x + one);
2153     x2 = x * x;
2154
2155     p = fma(CL15, x2, CL13);
2156     p = fma(p, x2, CL11);
2157     p = fma(p, x2, CL9);
2158     p = fma(p, x2, CL7);
2159     p = fma(p, x2, CL5);
2160     p = fma(p, x2, CL3);
2161     p = fma(p, x2, CL1);
2162     p = fma(p, x, fExp);
2163
2164     return p;
2165 #        endif
2166 }
2167
2168 #        if !GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
2169 /*! \brief SIMD double log(x). This is the natural logarithm.
2170  *
2171  * \param x Argument, should be >0.
2172  * \result The natural logarithm of x. Undefined if argument is invalid.
2173  */
2174 static inline SimdDouble gmx_simdcall log(SimdDouble x)
2175 {
2176     const SimdDouble one(1.0);
2177     const SimdDouble two(2.0);
2178     const SimdDouble invsqrt2(1.0 / std::sqrt(2.0));
2179     const SimdDouble corr(0.693147180559945286226764);
2180     const SimdDouble CL15(0.148197055177935105296783);
2181     const SimdDouble CL13(0.153108178020442575739679);
2182     const SimdDouble CL11(0.181837339521549679055568);
2183     const SimdDouble CL9(0.22222194152736701733275);
2184     const SimdDouble CL7(0.285714288030134544449368);
2185     const SimdDouble CL5(0.399999999989941956712869);
2186     const SimdDouble CL3(0.666666666666685503450651);
2187     const SimdDouble CL1(2.0);
2188     SimdDouble       fExp, x2, p;
2189     SimdDBool        m;
2190     SimdDInt32       iExp;
2191
2192     x    = frexp(x, &iExp);
2193     fExp = cvtI2R(iExp);
2194
2195     m = x < invsqrt2;
2196     // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
2197     fExp = fExp - selectByMask(one, m);
2198     x    = x * blend(one, two, m);
2199
2200     x  = (x - one) * inv(x + one);
2201     x2 = x * x;
2202
2203     p = fma(CL15, x2, CL13);
2204     p = fma(p, x2, CL11);
2205     p = fma(p, x2, CL9);
2206     p = fma(p, x2, CL7);
2207     p = fma(p, x2, CL5);
2208     p = fma(p, x2, CL3);
2209     p = fma(p, x2, CL1);
2210     p = fma(p, x, corr * fExp);
2211
2212     return p;
2213 }
2214 #        endif
2215
2216 #        if !GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE
2217 /*! \brief SIMD double 2^x.
2218  *
2219  * \copydetails exp2(SimdFloat)
2220  */
2221 template<MathOptimization opt = MathOptimization::Safe>
2222 static inline SimdDouble gmx_simdcall exp2(SimdDouble x)
2223 {
2224     const SimdDouble CE11(4.435280790452730022081181e-10);
2225     const SimdDouble CE10(7.074105630863314448024247e-09);
2226     const SimdDouble CE9(1.017819803432096698472621e-07);
2227     const SimdDouble CE8(1.321543308956718799557863e-06);
2228     const SimdDouble CE7(0.00001525273348995851746990884);
2229     const SimdDouble CE6(0.0001540353046251466849082632);
2230     const SimdDouble CE5(0.001333355814678995257307880);
2231     const SimdDouble CE4(0.009618129107588335039176502);
2232     const SimdDouble CE3(0.05550410866481992147457793);
2233     const SimdDouble CE2(0.2402265069591015620470894);
2234     const SimdDouble CE1(0.6931471805599453304615075);
2235     const SimdDouble one(1.0);
2236
2237     SimdDouble intpart;
2238     SimdDouble fexppart;
2239     SimdDouble p;
2240
2241     // Large negative values are valid arguments to exp2(), so there are two
2242     // things we need to account for:
2243     // 1. When the exponents reaches -1023, the (biased) exponent field will be
2244     //    zero and we can no longer multiply with it. There are special IEEE
2245     //    formats to handle this range, but for now we have to accept that
2246     //    we cannot handle those arguments. If input value becomes even more
2247     //    negative, it will start to loop and we would end up with invalid
2248     //    exponents. Thus, we need to limit or mask this.
2249     // 2. For VERY large negative values, we will have problems that the
2250     //    subtraction to get the fractional part loses accuracy, and then we
2251     //    can end up with overflows in the polynomial.
2252     //
2253     // For now, we handle this by forwarding the math optimization setting to
2254     // ldexp, where the routine will return zero for very small arguments.
2255     //
2256     // However, before doing that we need to make sure we do not call cvtR2I
2257     // with an argument that is so negative it cannot be converted to an integer.
2258     if (opt == MathOptimization::Safe)
2259     {
2260         x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()));
2261     }
2262
2263     fexppart = ldexp<opt>(one, cvtR2I(x));
2264     intpart  = round(x);
2265     x        = x - intpart;
2266
2267     p = fma(CE11, x, CE10);
2268     p = fma(p, x, CE9);
2269     p = fma(p, x, CE8);
2270     p = fma(p, x, CE7);
2271     p = fma(p, x, CE6);
2272     p = fma(p, x, CE5);
2273     p = fma(p, x, CE4);
2274     p = fma(p, x, CE3);
2275     p = fma(p, x, CE2);
2276     p = fma(p, x, CE1);
2277     p = fma(p, x, one);
2278     x = p * fexppart;
2279     return x;
2280 }
2281 #        endif
2282
2283 #        if !GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE
2284 /*! \brief SIMD double exp(x).
2285  *
2286  * \copydetails exp(SimdFloat)
2287  */
2288 template<MathOptimization opt = MathOptimization::Safe>
2289 static inline SimdDouble gmx_simdcall exp(SimdDouble x)
2290 {
2291     const SimdDouble argscale(1.44269504088896340735992468100);
2292     const SimdDouble invargscale0(-0.69314718055966295651160180568695068359375);
2293     const SimdDouble invargscale1(-2.8235290563031577122588448175013436025525412068e-13);
2294     const SimdDouble CE12(2.078375306791423699350304e-09);
2295     const SimdDouble CE11(2.518173854179933105218635e-08);
2296     const SimdDouble CE10(2.755842049600488770111608e-07);
2297     const SimdDouble CE9(2.755691815216689746619849e-06);
2298     const SimdDouble CE8(2.480158383706245033920920e-05);
2299     const SimdDouble CE7(0.0001984127043518048611841321);
2300     const SimdDouble CE6(0.001388888889360258341755930);
2301     const SimdDouble CE5(0.008333333332907368102819109);
2302     const SimdDouble CE4(0.04166666666663836745814631);
2303     const SimdDouble CE3(0.1666666666666796929434570);
2304     const SimdDouble CE2(0.5);
2305     const SimdDouble one(1.0);
2306     SimdDouble       fexppart;
2307     SimdDouble       intpart;
2308     SimdDouble       y, p;
2309
2310     // Large negative values are valid arguments to exp2(), so there are two
2311     // things we need to account for:
2312     // 1. When the exponents reaches -1023, the (biased) exponent field will be
2313     //    zero and we can no longer multiply with it. There are special IEEE
2314     //    formats to handle this range, but for now we have to accept that
2315     //    we cannot handle those arguments. If input value becomes even more
2316     //    negative, it will start to loop and we would end up with invalid
2317     //    exponents. Thus, we need to limit or mask this.
2318     // 2. For VERY large negative values, we will have problems that the
2319     //    subtraction to get the fractional part loses accuracy, and then we
2320     //    can end up with overflows in the polynomial.
2321     //
2322     // For now, we handle this by forwarding the math optimization setting to
2323     // ldexp, where the routine will return zero for very small arguments.
2324     //
2325     // However, before doing that we need to make sure we do not call cvtR2I
2326     // with an argument that is so negative it cannot be converted to an integer
2327     // after being multiplied by argscale.
2328
2329     if (opt == MathOptimization::Safe)
2330     {
2331         x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()) / argscale);
2332     }
2333
2334     y = x * argscale;
2335
2336     fexppart = ldexp<opt>(one, cvtR2I(y));
2337     intpart  = round(y);
2338
2339     // Extended precision arithmetics
2340     x = fma(invargscale0, intpart, x);
2341     x = fma(invargscale1, intpart, x);
2342
2343     p = fma(CE12, x, CE11);
2344     p = fma(p, x, CE10);
2345     p = fma(p, x, CE9);
2346     p = fma(p, x, CE8);
2347     p = fma(p, x, CE7);
2348     p = fma(p, x, CE6);
2349     p = fma(p, x, CE5);
2350     p = fma(p, x, CE4);
2351     p = fma(p, x, CE3);
2352     p = fma(p, x, CE2);
2353     p = fma(p, x * x, x);
2354 #            if GMX_SIMD_HAVE_FMA
2355     x = fma(p, fexppart, fexppart);
2356 #            else
2357     x = (p + one) * fexppart;
2358 #            endif
2359
2360     return x;
2361 }
2362 #        endif
2363
2364 /*! \brief SIMD double pow(x,y)
2365  *
2366  * This returns x^y for SIMD values.
2367  *
2368  * \tparam opt If this is changed from the default (safe) into the unsafe
2369  *             option, there are no guarantees about correct results for x==0.
2370  *
2371  * \param x Base.
2372  *
2373  * \param y exponent.
2374  *
2375  * \result x^y. Overflowing arguments are likely to either return 0 or inf,
2376  *         depending on the underlying implementation. If unsafe optimizations
2377  *         are enabled, this is also true for x==0.
2378  *
2379  * \warning You cannot rely on this implementation returning inf for arguments
2380  *          that cause overflow. If you have some very large
2381  *          values and need to rely on getting a valid numerical output,
2382  *          take the minimum of your variable and the largest valid argument
2383  *          before calling this routine.
2384  */
2385 template<MathOptimization opt = MathOptimization::Safe>
2386 static inline SimdDouble gmx_simdcall pow(SimdDouble x, SimdDouble y)
2387 {
2388     SimdDouble xcorr;
2389
2390     if (opt == MathOptimization::Safe)
2391     {
2392         xcorr = max(x, SimdDouble(std::numeric_limits<double>::min()));
2393     }
2394     else
2395     {
2396         xcorr = x;
2397     }
2398
2399     SimdDouble result = exp2<opt>(y * log2(xcorr));
2400
2401     if (opt == MathOptimization::Safe)
2402     {
2403         // if x==0 and y>0 we explicitly set the result to 0.0
2404         // For any x with y==0, the result will already be 1.0 since we multiply by y (0.0) and call exp().
2405         result = blend(result, setZero(), x == setZero() && setZero() < y);
2406     }
2407
2408     return result;
2409 }
2410
2411
2412 /*! \brief SIMD double erf(x).
2413  *
2414  * \param x The value to calculate erf(x) for.
2415  * \result erf(x)
2416  *
2417  * This routine achieves very close to full precision, but we do not care about
2418  * the last bit or the subnormal result range.
2419  */
2420 static inline SimdDouble gmx_simdcall erf(SimdDouble x)
2421 {
2422     // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
2423     const SimdDouble CAP4(-0.431780540597889301512e-4);
2424     const SimdDouble CAP3(-0.00578562306260059236059);
2425     const SimdDouble CAP2(-0.028593586920219752446);
2426     const SimdDouble CAP1(-0.315924962948621698209);
2427     const SimdDouble CAP0(0.14952975608477029151);
2428
2429     const SimdDouble CAQ5(-0.374089300177174709737e-5);
2430     const SimdDouble CAQ4(0.00015126584532155383535);
2431     const SimdDouble CAQ3(0.00536692680669480725423);
2432     const SimdDouble CAQ2(0.0668686825594046122636);
2433     const SimdDouble CAQ1(0.402604990869284362773);
2434     // CAQ0 == 1.0
2435     const SimdDouble CAoffset(0.9788494110107421875);
2436
2437     // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
2438     const SimdDouble CBP6(2.49650423685462752497647637088e-10);
2439     const SimdDouble CBP5(0.00119770193298159629350136085658);
2440     const SimdDouble CBP4(0.0164944422378370965881008942733);
2441     const SimdDouble CBP3(0.0984581468691775932063932439252);
2442     const SimdDouble CBP2(0.317364595806937763843589437418);
2443     const SimdDouble CBP1(0.554167062641455850932670067075);
2444     const SimdDouble CBP0(0.427583576155807163756925301060);
2445     const SimdDouble CBQ7(0.00212288829699830145976198384930);
2446     const SimdDouble CBQ6(0.0334810979522685300554606393425);
2447     const SimdDouble CBQ5(0.2361713785181450957579508850717);
2448     const SimdDouble CBQ4(0.955364736493055670530981883072);
2449     const SimdDouble CBQ3(2.36815675631420037315349279199);
2450     const SimdDouble CBQ2(3.55261649184083035537184223542);
2451     const SimdDouble CBQ1(2.93501136050160872574376997993);
2452     // CBQ0 == 1.0
2453
2454     // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
2455     const SimdDouble CCP6(-2.8175401114513378771);
2456     const SimdDouble CCP5(-3.22729451764143718517);
2457     const SimdDouble CCP4(-2.5518551727311523996);
2458     const SimdDouble CCP3(-0.687717681153649930619);
2459     const SimdDouble CCP2(-0.212652252872804219852);
2460     const SimdDouble CCP1(0.0175389834052493308818);
2461     const SimdDouble CCP0(0.00628057170626964891937);
2462
2463     const SimdDouble CCQ6(5.48409182238641741584);
2464     const SimdDouble CCQ5(13.5064170191802889145);
2465     const SimdDouble CCQ4(22.9367376522880577224);
2466     const SimdDouble CCQ3(15.930646027911794143);
2467     const SimdDouble CCQ2(11.0567237927800161565);
2468     const SimdDouble CCQ1(2.79257750980575282228);
2469     // CCQ0 == 1.0
2470     const SimdDouble CCoffset(0.5579090118408203125);
2471
2472     const SimdDouble one(1.0);
2473     const SimdDouble two(2.0);
2474     const SimdDouble minFloat(std::numeric_limits<float>::min());
2475
2476     SimdDouble xabs, x2, x4, t, t2, w, w2;
2477     SimdDouble PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
2478     SimdDouble PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
2479     SimdDouble PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
2480     SimdDouble res_erf, res_erfcB, res_erfcC, res_erfc, res;
2481     SimdDouble expmx2;
2482     SimdDBool  mask, mask_erf, notmask_erf;
2483
2484     // Calculate erf()
2485     xabs        = abs(x);
2486     mask_erf    = (xabs < one);
2487     notmask_erf = (one <= xabs);
2488     x2          = x * x;
2489     x4          = x2 * x2;
2490
2491     PolyAP0 = fma(CAP4, x4, CAP2);
2492     PolyAP1 = fma(CAP3, x4, CAP1);
2493     PolyAP0 = fma(PolyAP0, x4, CAP0);
2494     PolyAP0 = fma(PolyAP1, x2, PolyAP0);
2495
2496     PolyAQ1 = fma(CAQ5, x4, CAQ3);
2497     PolyAQ0 = fma(CAQ4, x4, CAQ2);
2498     PolyAQ1 = fma(PolyAQ1, x4, CAQ1);
2499     PolyAQ0 = fma(PolyAQ0, x4, one);
2500     PolyAQ0 = fma(PolyAQ1, x2, PolyAQ0);
2501
2502     res_erf = PolyAP0 * maskzInv(PolyAQ0, mask_erf && (minFloat <= abs(PolyAQ0)));
2503     res_erf = CAoffset + res_erf;
2504     res_erf = x * res_erf;
2505
2506     // Calculate erfc() in range [1,4.5]
2507     t  = xabs - one;
2508     t2 = t * t;
2509
2510     PolyBP0 = fma(CBP6, t2, CBP4);
2511     PolyBP1 = fma(CBP5, t2, CBP3);
2512     PolyBP0 = fma(PolyBP0, t2, CBP2);
2513     PolyBP1 = fma(PolyBP1, t2, CBP1);
2514     PolyBP0 = fma(PolyBP0, t2, CBP0);
2515     PolyBP0 = fma(PolyBP1, t, PolyBP0);
2516
2517     PolyBQ1 = fma(CBQ7, t2, CBQ5);
2518     PolyBQ0 = fma(CBQ6, t2, CBQ4);
2519     PolyBQ1 = fma(PolyBQ1, t2, CBQ3);
2520     PolyBQ0 = fma(PolyBQ0, t2, CBQ2);
2521     PolyBQ1 = fma(PolyBQ1, t2, CBQ1);
2522     PolyBQ0 = fma(PolyBQ0, t2, one);
2523     PolyBQ0 = fma(PolyBQ1, t, PolyBQ0);
2524
2525     // The denominator polynomial can be zero outside the range
2526     res_erfcB = PolyBP0 * maskzInv(PolyBQ0, notmask_erf && (minFloat <= abs(PolyBQ0)));
2527
2528     res_erfcB = res_erfcB * xabs;
2529
2530     // Calculate erfc() in range [4.5,inf]
2531     w  = maskzInv(xabs, notmask_erf && (minFloat <= xabs));
2532     w2 = w * w;
2533
2534     PolyCP0 = fma(CCP6, w2, CCP4);
2535     PolyCP1 = fma(CCP5, w2, CCP3);
2536     PolyCP0 = fma(PolyCP0, w2, CCP2);
2537     PolyCP1 = fma(PolyCP1, w2, CCP1);
2538     PolyCP0 = fma(PolyCP0, w2, CCP0);
2539     PolyCP0 = fma(PolyCP1, w, PolyCP0);
2540
2541     PolyCQ0 = fma(CCQ6, w2, CCQ4);
2542     PolyCQ1 = fma(CCQ5, w2, CCQ3);
2543     PolyCQ0 = fma(PolyCQ0, w2, CCQ2);
2544     PolyCQ1 = fma(PolyCQ1, w2, CCQ1);
2545     PolyCQ0 = fma(PolyCQ0, w2, one);
2546     PolyCQ0 = fma(PolyCQ1, w, PolyCQ0);
2547
2548     expmx2 = exp(-x2);
2549
2550     // The denominator polynomial can be zero outside the range
2551     res_erfcC = PolyCP0 * maskzInv(PolyCQ0, notmask_erf && (minFloat <= abs(PolyCQ0)));
2552     res_erfcC = res_erfcC + CCoffset;
2553     res_erfcC = res_erfcC * w;
2554
2555     mask     = (SimdDouble(4.5) < xabs);
2556     res_erfc = blend(res_erfcB, res_erfcC, mask);
2557
2558     res_erfc = res_erfc * expmx2;
2559
2560     // erfc(x<0) = 2-erfc(|x|)
2561     mask     = (x < setZero());
2562     res_erfc = blend(res_erfc, two - res_erfc, mask);
2563
2564     // Select erf() or erfc()
2565     res = blend(one - res_erfc, res_erf, mask_erf);
2566
2567     return res;
2568 }
2569
2570 /*! \brief SIMD double erfc(x).
2571  *
2572  * \param x The value to calculate erfc(x) for.
2573  * \result erfc(x)
2574  *
2575  * This routine achieves full precision (bar the last bit) over most of the
2576  * input range, but for large arguments where the result is getting close
2577  * to the minimum representable numbers we accept slightly larger errors
2578  * (think results that are in the ballpark of 10^-200 for double)
2579  * since that is not relevant for MD.
2580  */
2581 static inline SimdDouble gmx_simdcall erfc(SimdDouble x)
2582 {
2583     // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
2584     const SimdDouble CAP4(-0.431780540597889301512e-4);
2585     const SimdDouble CAP3(-0.00578562306260059236059);
2586     const SimdDouble CAP2(-0.028593586920219752446);
2587     const SimdDouble CAP1(-0.315924962948621698209);
2588     const SimdDouble CAP0(0.14952975608477029151);
2589
2590     const SimdDouble CAQ5(-0.374089300177174709737e-5);
2591     const SimdDouble CAQ4(0.00015126584532155383535);
2592     const SimdDouble CAQ3(0.00536692680669480725423);
2593     const SimdDouble CAQ2(0.0668686825594046122636);
2594     const SimdDouble CAQ1(0.402604990869284362773);
2595     // CAQ0 == 1.0
2596     const SimdDouble CAoffset(0.9788494110107421875);
2597
2598     // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
2599     const SimdDouble CBP6(2.49650423685462752497647637088e-10);
2600     const SimdDouble CBP5(0.00119770193298159629350136085658);
2601     const SimdDouble CBP4(0.0164944422378370965881008942733);
2602     const SimdDouble CBP3(0.0984581468691775932063932439252);
2603     const SimdDouble CBP2(0.317364595806937763843589437418);
2604     const SimdDouble CBP1(0.554167062641455850932670067075);
2605     const SimdDouble CBP0(0.427583576155807163756925301060);
2606     const SimdDouble CBQ7(0.00212288829699830145976198384930);
2607     const SimdDouble CBQ6(0.0334810979522685300554606393425);
2608     const SimdDouble CBQ5(0.2361713785181450957579508850717);
2609     const SimdDouble CBQ4(0.955364736493055670530981883072);
2610     const SimdDouble CBQ3(2.36815675631420037315349279199);
2611     const SimdDouble CBQ2(3.55261649184083035537184223542);
2612     const SimdDouble CBQ1(2.93501136050160872574376997993);
2613     // CBQ0 == 1.0
2614
2615     // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
2616     const SimdDouble CCP6(-2.8175401114513378771);
2617     const SimdDouble CCP5(-3.22729451764143718517);
2618     const SimdDouble CCP4(-2.5518551727311523996);
2619     const SimdDouble CCP3(-0.687717681153649930619);
2620     const SimdDouble CCP2(-0.212652252872804219852);
2621     const SimdDouble CCP1(0.0175389834052493308818);
2622     const SimdDouble CCP0(0.00628057170626964891937);
2623
2624     const SimdDouble CCQ6(5.48409182238641741584);
2625     const SimdDouble CCQ5(13.5064170191802889145);
2626     const SimdDouble CCQ4(22.9367376522880577224);
2627     const SimdDouble CCQ3(15.930646027911794143);
2628     const SimdDouble CCQ2(11.0567237927800161565);
2629     const SimdDouble CCQ1(2.79257750980575282228);
2630     // CCQ0 == 1.0
2631     const SimdDouble CCoffset(0.5579090118408203125);
2632
2633     const SimdDouble one(1.0);
2634     const SimdDouble two(2.0);
2635     const SimdDouble minFloat(std::numeric_limits<float>::min());
2636
2637     SimdDouble xabs, x2, x4, t, t2, w, w2;
2638     SimdDouble PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
2639     SimdDouble PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
2640     SimdDouble PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
2641     SimdDouble res_erf, res_erfcB, res_erfcC, res_erfc, res;
2642     SimdDouble expmx2;
2643     SimdDBool  mask, mask_erf, notmask_erf;
2644
2645     // Calculate erf()
2646     xabs        = abs(x);
2647     mask_erf    = (xabs < one);
2648     notmask_erf = (one <= xabs);
2649     x2          = x * x;
2650     x4          = x2 * x2;
2651
2652     PolyAP0 = fma(CAP4, x4, CAP2);
2653     PolyAP1 = fma(CAP3, x4, CAP1);
2654     PolyAP0 = fma(PolyAP0, x4, CAP0);
2655     PolyAP0 = fma(PolyAP1, x2, PolyAP0);
2656     PolyAQ1 = fma(CAQ5, x4, CAQ3);
2657     PolyAQ0 = fma(CAQ4, x4, CAQ2);
2658     PolyAQ1 = fma(PolyAQ1, x4, CAQ1);
2659     PolyAQ0 = fma(PolyAQ0, x4, one);
2660     PolyAQ0 = fma(PolyAQ1, x2, PolyAQ0);
2661
2662     res_erf = PolyAP0 * maskzInv(PolyAQ0, mask_erf && (minFloat <= abs(PolyAQ0)));
2663     res_erf = CAoffset + res_erf;
2664     res_erf = x * res_erf;
2665
2666     // Calculate erfc() in range [1,4.5]
2667     t  = xabs - one;
2668     t2 = t * t;
2669
2670     PolyBP0 = fma(CBP6, t2, CBP4);
2671     PolyBP1 = fma(CBP5, t2, CBP3);
2672     PolyBP0 = fma(PolyBP0, t2, CBP2);
2673     PolyBP1 = fma(PolyBP1, t2, CBP1);
2674     PolyBP0 = fma(PolyBP0, t2, CBP0);
2675     PolyBP0 = fma(PolyBP1, t, PolyBP0);
2676
2677     PolyBQ1 = fma(CBQ7, t2, CBQ5);
2678     PolyBQ0 = fma(CBQ6, t2, CBQ4);
2679     PolyBQ1 = fma(PolyBQ1, t2, CBQ3);
2680     PolyBQ0 = fma(PolyBQ0, t2, CBQ2);
2681     PolyBQ1 = fma(PolyBQ1, t2, CBQ1);
2682     PolyBQ0 = fma(PolyBQ0, t2, one);
2683     PolyBQ0 = fma(PolyBQ1, t, PolyBQ0);
2684
2685     // The denominator polynomial can be zero outside the range
2686     res_erfcB = PolyBP0 * maskzInv(PolyBQ0, notmask_erf && (minFloat <= abs(PolyBQ0)));
2687
2688     res_erfcB = res_erfcB * xabs;
2689
2690     // Calculate erfc() in range [4.5,inf]
2691     // Note that 1/x can only handle single precision!
2692     w  = maskzInv(xabs, minFloat <= xabs);
2693     w2 = w * w;
2694
2695     PolyCP0 = fma(CCP6, w2, CCP4);
2696     PolyCP1 = fma(CCP5, w2, CCP3);
2697     PolyCP0 = fma(PolyCP0, w2, CCP2);
2698     PolyCP1 = fma(PolyCP1, w2, CCP1);
2699     PolyCP0 = fma(PolyCP0, w2, CCP0);
2700     PolyCP0 = fma(PolyCP1, w, PolyCP0);
2701
2702     PolyCQ0 = fma(CCQ6, w2, CCQ4);
2703     PolyCQ1 = fma(CCQ5, w2, CCQ3);
2704     PolyCQ0 = fma(PolyCQ0, w2, CCQ2);
2705     PolyCQ1 = fma(PolyCQ1, w2, CCQ1);
2706     PolyCQ0 = fma(PolyCQ0, w2, one);
2707     PolyCQ0 = fma(PolyCQ1, w, PolyCQ0);
2708
2709     expmx2 = exp(-x2);
2710
2711     // The denominator polynomial can be zero outside the range
2712     res_erfcC = PolyCP0 * maskzInv(PolyCQ0, notmask_erf && (minFloat <= abs(PolyCQ0)));
2713     res_erfcC = res_erfcC + CCoffset;
2714     res_erfcC = res_erfcC * w;
2715
2716     mask     = (SimdDouble(4.5) < xabs);
2717     res_erfc = blend(res_erfcB, res_erfcC, mask);
2718
2719     res_erfc = res_erfc * expmx2;
2720
2721     // erfc(x<0) = 2-erfc(|x|)
2722     mask     = (x < setZero());
2723     res_erfc = blend(res_erfc, two - res_erfc, mask);
2724
2725     // Select erf() or erfc()
2726     res = blend(res_erfc, one - res_erf, mask_erf);
2727
2728     return res;
2729 }
2730
2731 /*! \brief SIMD double sin \& cos.
2732  *
2733  * \param x The argument to evaluate sin/cos for
2734  * \param[out] sinval Sin(x)
2735  * \param[out] cosval Cos(x)
2736  *
2737  * This version achieves close to machine precision, but for very large
2738  * magnitudes of the argument we inherently begin to lose accuracy due to the
2739  * argument reduction, despite using extended precision arithmetics internally.
2740  */
2741 static inline void gmx_simdcall sincos(SimdDouble x, SimdDouble* sinval, SimdDouble* cosval)
2742 {
2743     // Constants to subtract Pi/4*x from y while minimizing precision loss
2744     const SimdDouble argred0(-2 * 0.78539816290140151978);
2745     const SimdDouble argred1(-2 * 4.9604678871439933374e-10);
2746     const SimdDouble argred2(-2 * 1.1258708853173288931e-18);
2747     const SimdDouble argred3(-2 * 1.7607799325916000908e-27);
2748     const SimdDouble two_over_pi(2.0 / M_PI);
2749     const SimdDouble const_sin5(1.58938307283228937328511e-10);
2750     const SimdDouble const_sin4(-2.50506943502539773349318e-08);
2751     const SimdDouble const_sin3(2.75573131776846360512547e-06);
2752     const SimdDouble const_sin2(-0.000198412698278911770864914);
2753     const SimdDouble const_sin1(0.0083333333333191845961746);
2754     const SimdDouble const_sin0(-0.166666666666666130709393);
2755
2756     const SimdDouble const_cos7(-1.13615350239097429531523e-11);
2757     const SimdDouble const_cos6(2.08757471207040055479366e-09);
2758     const SimdDouble const_cos5(-2.75573144028847567498567e-07);
2759     const SimdDouble const_cos4(2.48015872890001867311915e-05);
2760     const SimdDouble const_cos3(-0.00138888888888714019282329);
2761     const SimdDouble const_cos2(0.0416666666666665519592062);
2762     const SimdDouble half(0.5);
2763     const SimdDouble one(1.0);
2764     SimdDouble       ssign, csign;
2765     SimdDouble       x2, y, z, psin, pcos, sss, ccc;
2766     SimdDBool        mask;
2767 #        if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2768     const SimdDInt32 ione(1);
2769     const SimdDInt32 itwo(2);
2770     SimdDInt32       iy;
2771
2772     z  = x * two_over_pi;
2773     iy = cvtR2I(z);
2774     y  = round(z);
2775
2776     mask  = cvtIB2B((iy & ione) == setZero());
2777     ssign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B((iy & itwo) == itwo));
2778     csign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
2779 #        else
2780     const SimdDouble quarter(0.25);
2781     const SimdDouble minusquarter(-0.25);
2782     SimdDouble       q;
2783     SimdDBool        m1, m2, m3;
2784
2785     /* The most obvious way to find the arguments quadrant in the unit circle
2786      * to calculate the sign is to use integer arithmetic, but that is not
2787      * present in all SIMD implementations. As an alternative, we have devised a
2788      * pure floating-point algorithm that uses truncation for argument reduction
2789      * so that we get a new value 0<=q<1 over the unit circle, and then
2790      * do floating-point comparisons with fractions. This is likely to be
2791      * slightly slower (~10%) due to the longer latencies of floating-point, so
2792      * we only use it when integer SIMD arithmetic is not present.
2793      */
2794     ssign = x;
2795     x     = abs(x);
2796     // It is critical that half-way cases are rounded down
2797     z = fma(x, two_over_pi, half);
2798     y = trunc(z);
2799     q = z * quarter;
2800     q = q - trunc(q);
2801     /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
2802      * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
2803      * This removes the 2*Pi periodicity without using any integer arithmetic.
2804      * First check if y had the value 2 or 3, set csign if true.
2805      */
2806     q = q - half;
2807     /* If we have logical operations we can work directly on the signbit, which
2808      * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
2809      * Thus, if you are altering defines to debug alternative code paths, the
2810      * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
2811      * active or inactive - you will get errors if only one is used.
2812      */
2813 #            if GMX_SIMD_HAVE_LOGICAL
2814     ssign = ssign & SimdDouble(GMX_DOUBLE_NEGZERO);
2815     csign = andNot(q, SimdDouble(GMX_DOUBLE_NEGZERO));
2816     ssign = ssign ^ csign;
2817 #            else
2818     ssign = copysign(SimdDouble(1.0), ssign);
2819     csign = copysign(SimdDouble(1.0), q);
2820     csign = -csign;
2821     ssign = ssign * csign; // swap ssign if csign was set.
2822 #            endif
2823     // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
2824     m1   = (q < minusquarter);
2825     m2   = (setZero() <= q);
2826     m3   = (q < quarter);
2827     m2   = m2 && m3;
2828     mask = m1 || m2;
2829     // where mask is FALSE, swap sign.
2830     csign   = csign * blend(SimdDouble(-1.0), one, mask);
2831 #        endif
2832     x  = fma(y, argred0, x);
2833     x  = fma(y, argred1, x);
2834     x  = fma(y, argred2, x);
2835     x  = fma(y, argred3, x);
2836     x2 = x * x;
2837
2838     psin = fma(const_sin5, x2, const_sin4);
2839     psin = fma(psin, x2, const_sin3);
2840     psin = fma(psin, x2, const_sin2);
2841     psin = fma(psin, x2, const_sin1);
2842     psin = fma(psin, x2, const_sin0);
2843     psin = fma(psin, x2 * x, x);
2844
2845     pcos = fma(const_cos7, x2, const_cos6);
2846     pcos = fma(pcos, x2, const_cos5);
2847     pcos = fma(pcos, x2, const_cos4);
2848     pcos = fma(pcos, x2, const_cos3);
2849     pcos = fma(pcos, x2, const_cos2);
2850     pcos = fms(pcos, x2, half);
2851     pcos = fma(pcos, x2, one);
2852
2853     sss = blend(pcos, psin, mask);
2854     ccc = blend(psin, pcos, mask);
2855     // See comment for GMX_SIMD_HAVE_LOGICAL section above.
2856 #        if GMX_SIMD_HAVE_LOGICAL
2857     *sinval = sss ^ ssign;
2858     *cosval = ccc ^ csign;
2859 #        else
2860     *sinval = sss * ssign;
2861     *cosval = ccc * csign;
2862 #        endif
2863 }
2864
2865 /*! \brief SIMD double sin(x).
2866  *
2867  * \param x The argument to evaluate sin for
2868  * \result Sin(x)
2869  *
2870  * \attention Do NOT call both sin & cos if you need both results, since each of them
2871  * will then call \ref sincos and waste a factor 2 in performance.
2872  */
2873 static inline SimdDouble gmx_simdcall sin(SimdDouble x)
2874 {
2875     SimdDouble s, c;
2876     sincos(x, &s, &c);
2877     return s;
2878 }
2879
2880 /*! \brief SIMD double cos(x).
2881  *
2882  * \param x The argument to evaluate cos for
2883  * \result Cos(x)
2884  *
2885  * \attention Do NOT call both sin & cos if you need both results, since each of them
2886  * will then call \ref sincos and waste a factor 2 in performance.
2887  */
2888 static inline SimdDouble gmx_simdcall cos(SimdDouble x)
2889 {
2890     SimdDouble s, c;
2891     sincos(x, &s, &c);
2892     return c;
2893 }
2894
2895 /*! \brief SIMD double tan(x).
2896  *
2897  * \param x The argument to evaluate tan for
2898  * \result Tan(x)
2899  */
2900 static inline SimdDouble gmx_simdcall tan(SimdDouble x)
2901 {
2902     const SimdDouble argred0(-2 * 0.78539816290140151978);
2903     const SimdDouble argred1(-2 * 4.9604678871439933374e-10);
2904     const SimdDouble argred2(-2 * 1.1258708853173288931e-18);
2905     const SimdDouble argred3(-2 * 1.7607799325916000908e-27);
2906     const SimdDouble two_over_pi(2.0 / M_PI);
2907     const SimdDouble CT15(1.01419718511083373224408e-05);
2908     const SimdDouble CT14(-2.59519791585924697698614e-05);
2909     const SimdDouble CT13(5.23388081915899855325186e-05);
2910     const SimdDouble CT12(-3.05033014433946488225616e-05);
2911     const SimdDouble CT11(7.14707504084242744267497e-05);
2912     const SimdDouble CT10(8.09674518280159187045078e-05);
2913     const SimdDouble CT9(0.000244884931879331847054404);
2914     const SimdDouble CT8(0.000588505168743587154904506);
2915     const SimdDouble CT7(0.00145612788922812427978848);
2916     const SimdDouble CT6(0.00359208743836906619142924);
2917     const SimdDouble CT5(0.00886323944362401618113356);
2918     const SimdDouble CT4(0.0218694882853846389592078);
2919     const SimdDouble CT3(0.0539682539781298417636002);
2920     const SimdDouble CT2(0.133333333333125941821962);
2921     const SimdDouble CT1(0.333333333333334980164153);
2922     const SimdDouble minFloat(std::numeric_limits<float>::min());
2923
2924     SimdDouble x2, p, y, z;
2925     SimdDBool  m;
2926
2927 #        if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2928     SimdDInt32 iy;
2929     SimdDInt32 ione(1);
2930
2931     z  = x * two_over_pi;
2932     iy = cvtR2I(z);
2933     y  = round(z);
2934     m  = cvtIB2B((iy & ione) == ione);
2935
2936     x = fma(y, argred0, x);
2937     x = fma(y, argred1, x);
2938     x = fma(y, argred2, x);
2939     x = fma(y, argred3, x);
2940     x = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), m) ^ x;
2941 #        else
2942     const SimdDouble quarter(0.25);
2943     const SimdDouble half(0.5);
2944     const SimdDouble threequarter(0.75);
2945     const SimdDouble minFloat(std::numeric_limits<float>::min());
2946     SimdDouble       w, q;
2947     SimdDBool        m1, m2, m3;
2948
2949     w  = abs(x);
2950     z  = fma(w, two_over_pi, half);
2951     y  = trunc(z);
2952     q  = z * quarter;
2953     q  = q - trunc(q);
2954     m1 = (quarter <= q);
2955     m2 = (q < half);
2956     m3 = (threequarter <= q);
2957     m1 = m1 && m2;
2958     m  = m1 || m3;
2959     w  = fma(y, argred0, w);
2960     w  = fma(y, argred1, w);
2961     w  = fma(y, argred2, w);
2962     w  = fma(y, argred3, w);
2963
2964     w     = blend(w, -w, m);
2965     x     = w * copysign(SimdDouble(1.0), x);
2966 #        endif
2967     x2 = x * x;
2968     p  = fma(CT15, x2, CT14);
2969     p  = fma(p, x2, CT13);
2970     p  = fma(p, x2, CT12);
2971     p  = fma(p, x2, CT11);
2972     p  = fma(p, x2, CT10);
2973     p  = fma(p, x2, CT9);
2974     p  = fma(p, x2, CT8);
2975     p  = fma(p, x2, CT7);
2976     p  = fma(p, x2, CT6);
2977     p  = fma(p, x2, CT5);
2978     p  = fma(p, x2, CT4);
2979     p  = fma(p, x2, CT3);
2980     p  = fma(p, x2, CT2);
2981     p  = fma(p, x2, CT1);
2982     p  = fma(x2, p * x, x);
2983
2984     p = blend(p, maskzInv(p, m && (minFloat < abs(p))), m);
2985     return p;
2986 }
2987
2988 /*! \brief SIMD double asin(x).
2989  *
2990  * \param x The argument to evaluate asin for
2991  * \result Asin(x)
2992  */
2993 static inline SimdDouble gmx_simdcall asin(SimdDouble x)
2994 {
2995     // Same algorithm as cephes library
2996     const SimdDouble limit1(0.625);
2997     const SimdDouble limit2(1e-8);
2998     const SimdDouble one(1.0);
2999     const SimdDouble quarterpi(M_PI / 4.0);
3000     const SimdDouble morebits(6.123233995736765886130e-17);
3001
3002     const SimdDouble P5(4.253011369004428248960e-3);
3003     const SimdDouble P4(-6.019598008014123785661e-1);
3004     const SimdDouble P3(5.444622390564711410273e0);
3005     const SimdDouble P2(-1.626247967210700244449e1);
3006     const SimdDouble P1(1.956261983317594739197e1);
3007     const SimdDouble P0(-8.198089802484824371615e0);
3008
3009     const SimdDouble Q4(-1.474091372988853791896e1);
3010     const SimdDouble Q3(7.049610280856842141659e1);
3011     const SimdDouble Q2(-1.471791292232726029859e2);
3012     const SimdDouble Q1(1.395105614657485689735e2);
3013     const SimdDouble Q0(-4.918853881490881290097e1);
3014
3015     const SimdDouble R4(2.967721961301243206100e-3);
3016     const SimdDouble R3(-5.634242780008963776856e-1);
3017     const SimdDouble R2(6.968710824104713396794e0);
3018     const SimdDouble R1(-2.556901049652824852289e1);
3019     const SimdDouble R0(2.853665548261061424989e1);
3020
3021     const SimdDouble S3(-2.194779531642920639778e1);
3022     const SimdDouble S2(1.470656354026814941758e2);
3023     const SimdDouble S1(-3.838770957603691357202e2);
3024     const SimdDouble S0(3.424398657913078477438e2);
3025
3026     SimdDouble xabs;
3027     SimdDouble zz, ww, z, q, w, zz2, ww2;
3028     SimdDouble PA, PB;
3029     SimdDouble QA, QB;
3030     SimdDouble RA, RB;
3031     SimdDouble SA, SB;
3032     SimdDouble nom, denom;
3033     SimdDBool  mask, mask2;
3034
3035     xabs = abs(x);
3036
3037     mask = (limit1 < xabs);
3038
3039     zz  = one - xabs;
3040     ww  = xabs * xabs;
3041     zz2 = zz * zz;
3042     ww2 = ww * ww;
3043
3044     // R
3045     RA = fma(R4, zz2, R2);
3046     RB = fma(R3, zz2, R1);
3047     RA = fma(RA, zz2, R0);
3048     RA = fma(RB, zz, RA);
3049
3050     // S, SA = zz2
3051     SB = fma(S3, zz2, S1);
3052     SA = zz2 + S2;
3053     SA = fma(SA, zz2, S0);
3054     SA = fma(SB, zz, SA);
3055
3056     // P
3057     PA = fma(P5, ww2, P3);
3058     PB = fma(P4, ww2, P2);
3059     PA = fma(PA, ww2, P1);
3060     PB = fma(PB, ww2, P0);
3061     PA = fma(PA, ww, PB);
3062
3063     // Q, QA = ww2
3064     QB = fma(Q4, ww2, Q2);
3065     QA = ww2 + Q3;
3066     QA = fma(QA, ww2, Q1);
3067     QB = fma(QB, ww2, Q0);
3068     QA = fma(QA, ww, QB);
3069
3070     RA = RA * zz;
3071     PA = PA * ww;
3072
3073     nom   = blend(PA, RA, mask);
3074     denom = blend(QA, SA, mask);
3075
3076     mask2 = (limit2 < xabs);
3077     q     = nom * maskzInv(denom, mask2);
3078
3079     zz = zz + zz;
3080     zz = sqrt(zz);
3081     z  = quarterpi - zz;
3082     zz = fms(zz, q, morebits);
3083     z  = z - zz;
3084     z  = z + quarterpi;
3085
3086     w = xabs * q;
3087     w = w + xabs;
3088
3089     z = blend(w, z, mask);
3090
3091     z = blend(xabs, z, mask2);
3092
3093     z = copysign(z, x);
3094
3095     return z;
3096 }
3097
3098 /*! \brief SIMD double acos(x).
3099  *
3100  * \param x The argument to evaluate acos for
3101  * \result Acos(x)
3102  */
3103 static inline SimdDouble gmx_simdcall acos(SimdDouble x)
3104 {
3105     const SimdDouble one(1.0);
3106     const SimdDouble half(0.5);
3107     const SimdDouble quarterpi0(7.85398163397448309616e-1);
3108     const SimdDouble quarterpi1(6.123233995736765886130e-17);
3109
3110     SimdDBool  mask1;
3111     SimdDouble z, z1, z2;
3112
3113     mask1 = (half < x);
3114     z1    = half * (one - x);
3115     z1    = sqrt(z1);
3116     z     = blend(x, z1, mask1);
3117
3118     z = asin(z);
3119
3120     z1 = z + z;
3121
3122     z2 = quarterpi0 - z;
3123     z2 = z2 + quarterpi1;
3124     z2 = z2 + quarterpi0;
3125
3126     z = blend(z2, z1, mask1);
3127
3128     return z;
3129 }
3130
3131 /*! \brief SIMD double asin(x).
3132  *
3133  * \param x The argument to evaluate atan for
3134  * \result Atan(x), same argument/value range as standard math library.
3135  */
3136 static inline SimdDouble gmx_simdcall atan(SimdDouble x)
3137 {
3138     // Same algorithm as cephes library
3139     const SimdDouble limit1(0.66);
3140     const SimdDouble limit2(2.41421356237309504880);
3141     const SimdDouble quarterpi(M_PI / 4.0);
3142     const SimdDouble halfpi(M_PI / 2.0);
3143     const SimdDouble mone(-1.0);
3144     const SimdDouble morebits1(0.5 * 6.123233995736765886130E-17);
3145     const SimdDouble morebits2(6.123233995736765886130E-17);
3146
3147     const SimdDouble P4(-8.750608600031904122785E-1);
3148     const SimdDouble P3(-1.615753718733365076637E1);
3149     const SimdDouble P2(-7.500855792314704667340E1);
3150     const SimdDouble P1(-1.228866684490136173410E2);
3151     const SimdDouble P0(-6.485021904942025371773E1);
3152
3153     const SimdDouble Q4(2.485846490142306297962E1);
3154     const SimdDouble Q3(1.650270098316988542046E2);
3155     const SimdDouble Q2(4.328810604912902668951E2);
3156     const SimdDouble Q1(4.853903996359136964868E2);
3157     const SimdDouble Q0(1.945506571482613964425E2);
3158
3159     SimdDouble y, xabs, t1, t2;
3160     SimdDouble z, z2;
3161     SimdDouble P_A, P_B, Q_A, Q_B;
3162     SimdDBool  mask1, mask2;
3163
3164     xabs = abs(x);
3165
3166     mask1 = (limit1 < xabs);
3167     mask2 = (limit2 < xabs);
3168
3169     t1 = (xabs + mone) * maskzInv(xabs - mone, mask1);
3170     t2 = mone * maskzInv(xabs, mask2);
3171
3172     y    = selectByMask(quarterpi, mask1);
3173     y    = blend(y, halfpi, mask2);
3174     xabs = blend(xabs, t1, mask1);
3175     xabs = blend(xabs, t2, mask2);
3176
3177     z  = xabs * xabs;
3178     z2 = z * z;
3179
3180     P_A = fma(P4, z2, P2);
3181     P_B = fma(P3, z2, P1);
3182     P_A = fma(P_A, z2, P0);
3183     P_A = fma(P_B, z, P_A);
3184
3185     // Q_A = z2
3186     Q_B = fma(Q4, z2, Q2);
3187     Q_A = z2 + Q3;
3188     Q_A = fma(Q_A, z2, Q1);
3189     Q_B = fma(Q_B, z2, Q0);
3190     Q_A = fma(Q_A, z, Q_B);
3191
3192     z = z * P_A;
3193     z = z * inv(Q_A);
3194     z = fma(z, xabs, xabs);
3195
3196     t1 = selectByMask(morebits1, mask1);
3197     t1 = blend(t1, morebits2, mask2);
3198
3199     z = z + t1;
3200     y = y + z;
3201
3202     y = copysign(y, x);
3203
3204     return y;
3205 }
3206
3207 /*! \brief SIMD double atan2(y,x).
3208  *
3209  * \param y Y component of vector, any quartile
3210  * \param x X component of vector, any quartile
3211  * \result Atan(y,x), same argument/value range as standard math library.
3212  *
3213  * \note This routine should provide correct results for all finite
3214  * non-zero or positive-zero arguments. However, negative zero arguments will
3215  * be treated as positive zero, which means the return value will deviate from
3216  * the standard math library atan2(y,x) for those cases. That should not be
3217  * of any concern in Gromacs, and in particular it will not affect calculations
3218  * of angles from vectors.
3219  */
3220 static inline SimdDouble gmx_simdcall atan2(SimdDouble y, SimdDouble x)
3221 {
3222     const SimdDouble pi(M_PI);
3223     const SimdDouble halfpi(M_PI / 2.0);
3224     const SimdDouble minFloat(std::numeric_limits<float>::min());
3225     SimdDouble       xinv, p, aoffset;
3226     SimdDBool        mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
3227
3228     mask_xnz  = x != setZero();
3229     mask_ynz  = y != setZero();
3230     mask_xlt0 = (x < setZero());
3231     mask_ylt0 = (y < setZero());
3232
3233     aoffset = selectByNotMask(halfpi, mask_xnz);
3234     aoffset = selectByMask(aoffset, mask_ynz);
3235
3236     aoffset = blend(aoffset, pi, mask_xlt0);
3237     aoffset = blend(aoffset, -aoffset, mask_ylt0);
3238
3239     xinv = maskzInv(x, mask_xnz && (minFloat <= abs(x)));
3240     p    = y * xinv;
3241     p    = atan(p);
3242     p    = p + aoffset;
3243
3244     return p;
3245 }
3246
3247
3248 /*! \brief Calculate the force correction due to PME analytically in SIMD double.
3249  *
3250  * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
3251  *           interaction distance and beta the ewald splitting parameters.
3252  * \result Correction factor to coulomb force.
3253  *
3254  * This routine is meant to enable analytical evaluation of the
3255  * direct-space PME electrostatic force to avoid tables. For details, see the
3256  * single precision function.
3257  */
3258 static inline SimdDouble gmx_simdcall pmeForceCorrection(SimdDouble z2)
3259 {
3260     const SimdDouble FN10(-8.0072854618360083154e-14);
3261     const SimdDouble FN9(1.1859116242260148027e-11);
3262     const SimdDouble FN8(-8.1490406329798423616e-10);
3263     const SimdDouble FN7(3.4404793543907847655e-8);
3264     const SimdDouble FN6(-9.9471420832602741006e-7);
3265     const SimdDouble FN5(0.000020740315999115847456);
3266     const SimdDouble FN4(-0.00031991745139313364005);
3267     const SimdDouble FN3(0.0035074449373659008203);
3268     const SimdDouble FN2(-0.031750380176100813405);
3269     const SimdDouble FN1(0.13884101728898463426);
3270     const SimdDouble FN0(-0.75225277815249618847);
3271
3272     const SimdDouble FD5(0.000016009278224355026701);
3273     const SimdDouble FD4(0.00051055686934806966046);
3274     const SimdDouble FD3(0.0081803507497974289008);
3275     const SimdDouble FD2(0.077181146026670287235);
3276     const SimdDouble FD1(0.41543303143712535988);
3277     const SimdDouble FD0(1.0);
3278
3279     SimdDouble z4;
3280     SimdDouble polyFN0, polyFN1, polyFD0, polyFD1;
3281
3282     z4 = z2 * z2;
3283
3284     polyFD1 = fma(FD5, z4, FD3);
3285     polyFD1 = fma(polyFD1, z4, FD1);
3286     polyFD1 = polyFD1 * z2;
3287     polyFD0 = fma(FD4, z4, FD2);
3288     polyFD0 = fma(polyFD0, z4, FD0);
3289     polyFD0 = polyFD0 + polyFD1;
3290
3291     polyFD0 = inv(polyFD0);
3292
3293     polyFN0 = fma(FN10, z4, FN8);
3294     polyFN0 = fma(polyFN0, z4, FN6);
3295     polyFN0 = fma(polyFN0, z4, FN4);
3296     polyFN0 = fma(polyFN0, z4, FN2);
3297     polyFN0 = fma(polyFN0, z4, FN0);
3298     polyFN1 = fma(FN9, z4, FN7);
3299     polyFN1 = fma(polyFN1, z4, FN5);
3300     polyFN1 = fma(polyFN1, z4, FN3);
3301     polyFN1 = fma(polyFN1, z4, FN1);
3302     polyFN0 = fma(polyFN1, z2, polyFN0);
3303
3304
3305     return polyFN0 * polyFD0;
3306 }
3307
3308
3309 /*! \brief Calculate the potential correction due to PME analytically in SIMD double.
3310  *
3311  * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
3312  *           interaction distance and beta the ewald splitting parameters.
3313  * \result Correction factor to coulomb force.
3314  *
3315  * This routine is meant to enable analytical evaluation of the
3316  * direct-space PME electrostatic potential to avoid tables. For details, see the
3317  * single precision function.
3318  */
3319 static inline SimdDouble gmx_simdcall pmePotentialCorrection(SimdDouble z2)
3320 {
3321     const SimdDouble VN9(-9.3723776169321855475e-13);
3322     const SimdDouble VN8(1.2280156762674215741e-10);
3323     const SimdDouble VN7(-7.3562157912251309487e-9);
3324     const SimdDouble VN6(2.6215886208032517509e-7);
3325     const SimdDouble VN5(-4.9532491651265819499e-6);
3326     const SimdDouble VN4(0.00025907400778966060389);
3327     const SimdDouble VN3(0.0010585044856156469792);
3328     const SimdDouble VN2(0.045247661136833092885);
3329     const SimdDouble VN1(0.11643931522926034421);
3330     const SimdDouble VN0(1.1283791671726767970);
3331
3332     const SimdDouble VD5(0.000021784709867336150342);
3333     const SimdDouble VD4(0.00064293662010911388448);
3334     const SimdDouble VD3(0.0096311444822588683504);
3335     const SimdDouble VD2(0.085608012351550627051);
3336     const SimdDouble VD1(0.43652499166614811084);
3337     const SimdDouble VD0(1.0);
3338
3339     SimdDouble z4;
3340     SimdDouble polyVN0, polyVN1, polyVD0, polyVD1;
3341
3342     z4 = z2 * z2;
3343
3344     polyVD1 = fma(VD5, z4, VD3);
3345     polyVD0 = fma(VD4, z4, VD2);
3346     polyVD1 = fma(polyVD1, z4, VD1);
3347     polyVD0 = fma(polyVD0, z4, VD0);
3348     polyVD0 = fma(polyVD1, z2, polyVD0);
3349
3350     polyVD0 = inv(polyVD0);
3351
3352     polyVN1 = fma(VN9, z4, VN7);
3353     polyVN0 = fma(VN8, z4, VN6);
3354     polyVN1 = fma(polyVN1, z4, VN5);
3355     polyVN0 = fma(polyVN0, z4, VN4);
3356     polyVN1 = fma(polyVN1, z4, VN3);
3357     polyVN0 = fma(polyVN0, z4, VN2);
3358     polyVN1 = fma(polyVN1, z4, VN1);
3359     polyVN0 = fma(polyVN0, z4, VN0);
3360     polyVN0 = fma(polyVN1, z2, polyVN0);
3361
3362     return polyVN0 * polyVD0;
3363 }
3364
3365 /*! \} */
3366
3367
3368 /*! \name SIMD math functions for double prec. data, single prec. accuracy
3369  *
3370  *  \note In some cases we do not need full double accuracy of individual
3371  *        SIMD math functions, although the data is stored in double precision
3372  *        SIMD registers. This might be the case for special algorithms, or
3373  *        if the architecture does not support single precision.
3374  *        Since the full double precision evaluation of math functions
3375  *        typically require much more expensive polynomial approximations
3376  *        these functions implement the algorithms used in the single precision
3377  *        SIMD math functions, but they operate on double precision
3378  *        SIMD variables.
3379  *
3380  *  \{
3381  */
3382
3383 /*********************************************************************
3384  * SIMD MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
3385  *********************************************************************/
3386
3387 /*! \brief Calculate 1/sqrt(x) for SIMD double, but in single accuracy.
3388  *
3389  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
3390  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
3391  *           For the single precision implementation this is obviously always
3392  *           true for positive values, but for double precision it adds an
3393  *           extra restriction since the first lookup step might have to be
3394  *           performed in single precision on some architectures. Note that the
3395  *           responsibility for checking falls on you - this routine does not
3396  *           check arguments.
3397  *
3398  *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
3399  */
3400 static inline SimdDouble gmx_simdcall invsqrtSingleAccuracy(SimdDouble x)
3401 {
3402     SimdDouble lu = rsqrt(x);
3403 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3404     lu = rsqrtIter(lu, x);
3405 #        endif
3406 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3407     lu = rsqrtIter(lu, x);
3408 #        endif
3409 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3410     lu = rsqrtIter(lu, x);
3411 #        endif
3412     return lu;
3413 }
3414
3415 /*! \brief 1/sqrt(x) for masked-in entries of SIMD double, but in single accuracy.
3416  *
3417  *  This routine only evaluates 1/sqrt(x) for elements for which mask is true.
3418  *  Illegal values in the masked-out elements will not lead to
3419  *  floating-point exceptions.
3420  *
3421  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
3422  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
3423  *           For the single precision implementation this is obviously always
3424  *           true for positive values, but for double precision it adds an
3425  *           extra restriction since the first lookup step might have to be
3426  *           performed in single precision on some architectures. Note that the
3427  *           responsibility for checking falls on you - this routine does not
3428  *           check arguments.
3429  *
3430  *  \param m Mask
3431  *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
3432  *          entry was not masked, and 0.0 for masked-out entries.
3433  */
3434 static inline SimdDouble maskzInvsqrtSingleAccuracy(SimdDouble x, SimdDBool m)
3435 {
3436     SimdDouble lu = maskzRsqrt(x, m);
3437 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3438     lu = rsqrtIter(lu, x);
3439 #        endif
3440 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3441     lu = rsqrtIter(lu, x);
3442 #        endif
3443 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3444     lu = rsqrtIter(lu, x);
3445 #        endif
3446     return lu;
3447 }
3448
3449 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles, but single accuracy.
3450  *
3451  * \param x0  First set of arguments, x0 must be in single range (see below).
3452  * \param x1  Second set of arguments, x1 must be in single range (see below).
3453  * \param[out] out0  Result 1/sqrt(x0)
3454  * \param[out] out1  Result 1/sqrt(x1)
3455  *
3456  *  In particular for double precision we can sometimes calculate square root
3457  *  pairs slightly faster by using single precision until the very last step.
3458  *
3459  * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
3460  *       GMX_FLOAT_MAX, i.e. within the range of single precision.
3461  *       For the single precision implementation this is obviously always
3462  *       true for positive values, but for double precision it adds an
3463  *       extra restriction since the first lookup step might have to be
3464  *       performed in single precision on some architectures. Note that the
3465  *       responsibility for checking falls on you - this routine does not
3466  *       check arguments.
3467  */
3468 static inline void gmx_simdcall invsqrtPairSingleAccuracy(SimdDouble  x0,
3469                                                           SimdDouble  x1,
3470                                                           SimdDouble* out0,
3471                                                           SimdDouble* out1)
3472 {
3473 #        if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2 * GMX_SIMD_DOUBLE_WIDTH) \
3474                 && (GMX_SIMD_RSQRT_BITS < 22)
3475     SimdFloat  xf  = cvtDD2F(x0, x1);
3476     SimdFloat  luf = rsqrt(xf);
3477     SimdDouble lu0, lu1;
3478     // Intermediate target is single - mantissa+1 bits
3479 #            if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3480     luf = rsqrtIter(luf, xf);
3481 #            endif
3482 #            if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3483     luf = rsqrtIter(luf, xf);
3484 #            endif
3485 #            if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3486     luf = rsqrtIter(luf, xf);
3487 #            endif
3488     cvtF2DD(luf, &lu0, &lu1);
3489     // We now have single-precision accuracy values in lu0/lu1
3490     *out0 = lu0;
3491     *out1 = lu1;
3492 #        else
3493     *out0 = invsqrtSingleAccuracy(x0);
3494     *out1 = invsqrtSingleAccuracy(x1);
3495 #        endif
3496 }
3497
3498 /*! \brief Calculate 1/x for SIMD double, but in single accuracy.
3499  *
3500  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
3501  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
3502  *           For the single precision implementation this is obviously always
3503  *           true for positive values, but for double precision it adds an
3504  *           extra restriction since the first lookup step might have to be
3505  *           performed in single precision on some architectures. Note that the
3506  *           responsibility for checking falls on you - this routine does not
3507  *           check arguments.
3508  *
3509  *  \return 1/x. Result is undefined if your argument was invalid.
3510  */
3511 static inline SimdDouble gmx_simdcall invSingleAccuracy(SimdDouble x)
3512 {
3513     SimdDouble lu = rcp(x);
3514 #        if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3515     lu = rcpIter(lu, x);
3516 #        endif
3517 #        if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3518     lu = rcpIter(lu, x);
3519 #        endif
3520 #        if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3521     lu = rcpIter(lu, x);
3522 #        endif
3523     return lu;
3524 }
3525
3526 /*! \brief 1/x for masked entries of SIMD double, single accuracy.
3527  *
3528  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
3529  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
3530  *           For the single precision implementation this is obviously always
3531  *           true for positive values, but for double precision it adds an
3532  *           extra restriction since the first lookup step might have to be
3533  *           performed in single precision on some architectures. Note that the
3534  *           responsibility for checking falls on you - this routine does not
3535  *           check arguments.
3536  *
3537  *  \param m Mask
3538  *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
3539  */
3540 static inline SimdDouble gmx_simdcall maskzInvSingleAccuracy(SimdDouble x, SimdDBool m)
3541 {
3542     SimdDouble lu = maskzRcp(x, m);
3543 #        if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3544     lu = rcpIter(lu, x);
3545 #        endif
3546 #        if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3547     lu = rcpIter(lu, x);
3548 #        endif
3549 #        if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3550     lu = rcpIter(lu, x);
3551 #        endif
3552     return lu;
3553 }
3554
3555
3556 /*! \brief Calculate sqrt(x) (correct for 0.0) for SIMD double, with single accuracy.
3557  *
3558  *  \copydetails sqrt(SimdFloat)
3559  */
3560 template<MathOptimization opt = MathOptimization::Safe>
3561 static inline SimdDouble gmx_simdcall sqrtSingleAccuracy(SimdDouble x)
3562 {
3563     if (opt == MathOptimization::Safe)
3564     {
3565         SimdDouble res = maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) < x);
3566         return res * x;
3567     }
3568     else
3569     {
3570         return x * invsqrtSingleAccuracy(x);
3571     }
3572 }
3573
3574 /*! \brief Cube root for SIMD doubles, single accuracy.
3575  *
3576  * \param x      Argument to calculate cube root of. Can be negative or zero,
3577  *               but NaN or Inf values are not supported. Denormal values will
3578  *               be treated as 0.0.
3579  * \return       Cube root of x.
3580  */
3581 static inline SimdDouble gmx_simdcall cbrtSingleAccuracy(SimdDouble x)
3582 {
3583     const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
3584     const SimdDouble minDouble(std::numeric_limits<double>::min());
3585     // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
3586     // Use the divided value as original constant to avoid division warnings.
3587     const std::int32_t offsetDiv3(341);
3588     const SimdDouble   c2(-0.191502161678719066);
3589     const SimdDouble   c1(0.697570460207922770);
3590     const SimdDouble   c0(0.492659620528969547);
3591     const SimdDouble   one(1.0);
3592     const SimdDouble   two(2.0);
3593     const SimdDouble   three(3.0);
3594     const SimdDouble   oneThird(1.0 / 3.0);
3595     const SimdDouble   cbrt2(1.2599210498948731648);
3596     const SimdDouble   sqrCbrt2(1.5874010519681994748);
3597
3598     // See the single precision routines for documentation of the algorithm
3599
3600     SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
3601     SimdDouble xAbs     = andNot(signBit, x);    // select everthing but the sign bit => abs(x)
3602     SimdDBool  xIsNonZero = (minDouble <= xAbs); // treat denormals as 0
3603
3604     SimdDInt32 exponent;
3605     SimdDouble y        = frexp(xAbs, &exponent);
3606     SimdDouble z        = fma(fma(y, c2, c1), y, c0);
3607     SimdDouble w        = z * z * z;
3608     SimdDouble nom      = z * fma(two, y, w);
3609     SimdDouble invDenom = inv(fma(two, w, y));
3610
3611     SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
3612     SimdDouble offsetExpDiv3 =
3613             trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
3614     SimdDInt32 expDiv3   = cvtR2I(offsetExpDiv3 - SimdDouble(static_cast<double>(offsetDiv3)));
3615     SimdDouble remainder = offsetExp - offsetExpDiv3 * three;
3616     SimdDouble factor    = blend(one, cbrt2, SimdDouble(0.5) < remainder);
3617     factor               = blend(factor, sqrCbrt2, SimdDouble(1.5) < remainder);
3618     SimdDouble fraction  = (nom * invDenom * factor) ^ xSignBit;
3619     SimdDouble result    = selectByMask(ldexp(fraction, expDiv3), xIsNonZero);
3620     return result;
3621 }
3622
3623 /*! \brief Inverse cube root for SIMD doubles, single accuracy.
3624  *
3625  * \param x      Argument to calculate cube root of. Can be positive or
3626  *               negative, but the magnitude cannot be lower than
3627  *               the smallest normal number.
3628  * \return       Cube root of x. Undefined for values that don't
3629  *               fulfill the restriction of abs(x) > minDouble.
3630  */
3631 static inline SimdDouble gmx_simdcall invcbrtSingleAccuracy(SimdDouble x)
3632 {
3633     const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
3634     // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
3635     // Use the divided value as original constant to avoid division warnings.
3636     const std::int32_t offsetDiv3(341);
3637     const SimdDouble   c2(-0.191502161678719066);
3638     const SimdDouble   c1(0.697570460207922770);
3639     const SimdDouble   c0(0.492659620528969547);
3640     const SimdDouble   one(1.0);
3641     const SimdDouble   two(2.0);
3642     const SimdDouble   three(3.0);
3643     const SimdDouble   oneThird(1.0 / 3.0);
3644     const SimdDouble   invCbrt2(1.0 / 1.2599210498948731648);
3645     const SimdDouble   invSqrCbrt2(1.0F / 1.5874010519681994748);
3646
3647     // See the single precision routines for documentation of the algorithm
3648
3649     SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
3650     SimdDouble xAbs     = andNot(signBit, x); // select everthing but the sign bit => abs(x)
3651
3652     SimdDInt32 exponent;
3653     SimdDouble y         = frexp(xAbs, &exponent);
3654     SimdDouble z         = fma(fma(y, c2, c1), y, c0);
3655     SimdDouble w         = z * z * z;
3656     SimdDouble nom       = fma(two, w, y);
3657     SimdDouble invDenom  = inv(z * fma(two, y, w));
3658     SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
3659     SimdDouble offsetExpDiv3 =
3660             trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
3661     SimdDInt32 expDiv3   = cvtR2I(SimdDouble(static_cast<double>(offsetDiv3)) - offsetExpDiv3);
3662     SimdDouble remainder = offsetExpDiv3 * three - offsetExp;
3663     SimdDouble factor    = blend(one, invCbrt2, remainder < SimdDouble(-0.5));
3664     factor               = blend(factor, invSqrCbrt2, remainder < SimdDouble(-1.5));
3665     SimdDouble fraction  = (nom * invDenom * factor) ^ xSignBit;
3666     SimdDouble result    = ldexp(fraction, expDiv3);
3667     return result;
3668 }
3669
3670 /*! \brief SIMD log2(x). Double precision SIMD data, single accuracy.
3671  *
3672  * \param x Argument, should be >0.
3673  * \result The base 2 logarithm of x. Undefined if argument is invalid.
3674  */
3675 static inline SimdDouble gmx_simdcall log2SingleAccuracy(SimdDouble x)
3676 {
3677 #        if GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
3678     return log(x) * SimdDouble(std::log2(std::exp(1.0)));
3679 #        else
3680     const SimdDouble one(1.0);
3681     const SimdDouble two(2.0);
3682     const SimdDouble sqrt2(std::sqrt(2.0));
3683     const SimdDouble CL9(0.342149508897807708152F);
3684     const SimdDouble CL7(0.411570606888219447939F);
3685     const SimdDouble CL5(0.577085979152320294183F);
3686     const SimdDouble CL3(0.961796550607099898222F);
3687     const SimdDouble CL1(2.885390081777926774009F);
3688     SimdDouble       fexp, x2, p;
3689     SimdDInt32       iexp;
3690     SimdDBool        mask;
3691
3692     x    = frexp(x, &iexp);
3693     fexp = cvtI2R(iexp);
3694
3695     mask = (x < sqrt2);
3696     // Adjust to non-IEEE format for x<sqrt(2): exponent -= 1, mantissa *= 2.0
3697     fexp = fexp - selectByMask(one, mask);
3698     x    = x * blend(one, two, mask);
3699
3700     x  = (x - one) * invSingleAccuracy(x + one);
3701     x2 = x * x;
3702
3703     p = fma(CL9, x2, CL7);
3704     p = fma(p, x2, CL5);
3705     p = fma(p, x2, CL3);
3706     p = fma(p, x2, CL1);
3707     p = fma(p, x, fexp);
3708
3709     return p;
3710 #        endif
3711 }
3712
3713 /*! \brief SIMD log(x). Double precision SIMD data, single accuracy.
3714  *
3715  * \param x Argument, should be >0.
3716  * \result The natural logarithm of x. Undefined if argument is invalid.
3717  */
3718 static inline SimdDouble gmx_simdcall logSingleAccuracy(SimdDouble x)
3719 {
3720 #        if GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
3721     return log(x);
3722 #        else
3723     const SimdDouble one(1.0);
3724     const SimdDouble two(2.0);
3725     const SimdDouble invsqrt2(1.0 / std::sqrt(2.0));
3726     const SimdDouble corr(0.693147180559945286226764);
3727     const SimdDouble CL9(0.2371599674224853515625);
3728     const SimdDouble CL7(0.285279005765914916992188);
3729     const SimdDouble CL5(0.400005519390106201171875);
3730     const SimdDouble CL3(0.666666567325592041015625);
3731     const SimdDouble CL1(2.0);
3732     SimdDouble       fexp, x2, p;
3733     SimdDInt32       iexp;
3734     SimdDBool        mask;
3735
3736     x    = frexp(x, &iexp);
3737     fexp = cvtI2R(iexp);
3738
3739     mask = x < invsqrt2;
3740     // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
3741     fexp = fexp - selectByMask(one, mask);
3742     x    = x * blend(one, two, mask);
3743
3744     x  = (x - one) * invSingleAccuracy(x + one);
3745     x2 = x * x;
3746
3747     p = fma(CL9, x2, CL7);
3748     p = fma(p, x2, CL5);
3749     p = fma(p, x2, CL3);
3750     p = fma(p, x2, CL1);
3751     p = fma(p, x, corr * fexp);
3752
3753     return p;
3754 #        endif
3755 }
3756
3757 /*! \brief SIMD 2^x. Double precision SIMD, single accuracy.
3758  *
3759  * \copydetails exp2(SimdFloat)
3760  */
3761 template<MathOptimization opt = MathOptimization::Safe>
3762 static inline SimdDouble gmx_simdcall exp2SingleAccuracy(SimdDouble x)
3763 {
3764 #        if GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE
3765     return exp2(x);
3766 #        else
3767     const SimdDouble CC6(0.0001534581200287996416911311);
3768     const SimdDouble CC5(0.001339993121934088894618990);
3769     const SimdDouble CC4(0.009618488957115180159497841);
3770     const SimdDouble CC3(0.05550328776964726865751735);
3771     const SimdDouble CC2(0.2402264689063408646490722);
3772     const SimdDouble CC1(0.6931472057372680777553816);
3773     const SimdDouble one(1.0);
3774
3775     SimdDouble intpart;
3776     SimdDouble p;
3777     SimdDInt32 ix;
3778
3779     // Large negative values are valid arguments to exp2(), so there are two
3780     // things we need to account for:
3781     // 1. When the exponents reaches -1023, the (biased) exponent field will be
3782     //    zero and we can no longer multiply with it. There are special IEEE
3783     //    formats to handle this range, but for now we have to accept that
3784     //    we cannot handle those arguments. If input value becomes even more
3785     //    negative, it will start to loop and we would end up with invalid
3786     //    exponents. Thus, we need to limit or mask this.
3787     // 2. For VERY large negative values, we will have problems that the
3788     //    subtraction to get the fractional part loses accuracy, and then we
3789     //    can end up with overflows in the polynomial.
3790     //
3791     // For now, we handle this by forwarding the math optimization setting to
3792     // ldexp, where the routine will return zero for very small arguments.
3793     //
3794     // However, before doing that we need to make sure we do not call cvtR2I
3795     // with an argument that is so negative it cannot be converted to an integer.
3796     if (opt == MathOptimization::Safe)
3797     {
3798         x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()));
3799     }
3800
3801     ix      = cvtR2I(x);
3802     intpart = round(x);
3803     x       = x - intpart;
3804
3805     p = fma(CC6, x, CC5);
3806     p = fma(p, x, CC4);
3807     p = fma(p, x, CC3);
3808     p = fma(p, x, CC2);
3809     p = fma(p, x, CC1);
3810     p = fma(p, x, one);
3811     x = ldexp<opt>(p, ix);
3812
3813     return x;
3814 #        endif
3815 }
3816
3817
3818 /*! \brief SIMD exp(x). Double precision SIMD, single accuracy.
3819  *
3820  * \copydetails exp(SimdFloat)
3821  */
3822 template<MathOptimization opt = MathOptimization::Safe>
3823 static inline SimdDouble gmx_simdcall expSingleAccuracy(SimdDouble x)
3824 {
3825 #        if GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE
3826     return exp(x);
3827 #        else
3828     const SimdDouble argscale(1.44269504088896341);
3829     // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
3830     const SimdDouble smallArgLimit(-709.0895657128);
3831     const SimdDouble invargscale(-0.69314718055994528623);
3832     const SimdDouble CC4(0.00136324646882712841033936);
3833     const SimdDouble CC3(0.00836596917361021041870117);
3834     const SimdDouble CC2(0.0416710823774337768554688);
3835     const SimdDouble CC1(0.166665524244308471679688);
3836     const SimdDouble CC0(0.499999850988388061523438);
3837     const SimdDouble one(1.0);
3838     SimdDouble       intpart;
3839     SimdDouble       y, p;
3840     SimdDInt32       iy;
3841
3842     // Large negative values are valid arguments to exp2(), so there are two
3843     // things we need to account for:
3844     // 1. When the exponents reaches -1023, the (biased) exponent field will be
3845     //    zero and we can no longer multiply with it. There are special IEEE
3846     //    formats to handle this range, but for now we have to accept that
3847     //    we cannot handle those arguments. If input value becomes even more
3848     //    negative, it will start to loop and we would end up with invalid
3849     //    exponents. Thus, we need to limit or mask this.
3850     // 2. For VERY large negative values, we will have problems that the
3851     //    subtraction to get the fractional part loses accuracy, and then we
3852     //    can end up with overflows in the polynomial.
3853     //
3854     // For now, we handle this by forwarding the math optimization setting to
3855     // ldexp, where the routine will return zero for very small arguments.
3856     //
3857     // However, before doing that we need to make sure we do not call cvtR2I
3858     // with an argument that is so negative it cannot be converted to an integer
3859     // after being multiplied by argscale.
3860
3861     if (opt == MathOptimization::Safe)
3862     {
3863         x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()) / argscale);
3864     }
3865
3866     y = x * argscale;
3867
3868     iy      = cvtR2I(y);
3869     intpart = round(y); // use same rounding algorithm here
3870
3871     // Extended precision arithmetics not needed since
3872     // we have double precision and only need single accuracy.
3873     x = fma(invargscale, intpart, x);
3874
3875     p = fma(CC4, x, CC3);
3876     p = fma(p, x, CC2);
3877     p = fma(p, x, CC1);
3878     p = fma(p, x, CC0);
3879     p = fma(x * x, p, x);
3880     p = p + one;
3881     x = ldexp<opt>(p, iy);
3882     return x;
3883 #        endif
3884 }
3885
3886 /*! \brief SIMD pow(x,y). Double precision SIMD data, single accuracy.
3887  *
3888  * This returns x^y for SIMD values.
3889  *
3890  * \tparam opt If this is changed from the default (safe) into the unsafe
3891  *             option, there are no guarantees about correct results for x==0.
3892  *
3893  * \param x Base.
3894  *
3895  * \param y exponent.
3896
3897  * \result x^y. Overflowing arguments are likely to either return 0 or inf,
3898  *         depending on the underlying implementation. If unsafe optimizations
3899  *         are enabled, this is also true for x==0.
3900  *
3901  * \warning You cannot rely on this implementation returning inf for arguments
3902  *          that cause overflow. If you have some very large
3903  *          values and need to rely on getting a valid numerical output,
3904  *          take the minimum of your variable and the largest valid argument
3905  *          before calling this routine.
3906  */
3907 template<MathOptimization opt = MathOptimization::Safe>
3908 static inline SimdDouble gmx_simdcall powSingleAccuracy(SimdDouble x, SimdDouble y)
3909 {
3910     SimdDouble xcorr;
3911
3912     if (opt == MathOptimization::Safe)
3913     {
3914         xcorr = max(x, SimdDouble(std::numeric_limits<double>::min()));
3915     }
3916     else
3917     {
3918         xcorr = x;
3919     }
3920
3921     SimdDouble result = exp2SingleAccuracy<opt>(y * log2SingleAccuracy(xcorr));
3922
3923     if (opt == MathOptimization::Safe)
3924     {
3925         // if x==0 and y>0 we explicitly set the result to 0.0
3926         // For any x with y==0, the result will already be 1.0 since we multiply by y (0.0) and call exp().
3927         result = blend(result, setZero(), x == setZero() && setZero() < y);
3928     }
3929
3930     return result;
3931 }
3932
3933 /*! \brief SIMD erf(x). Double precision SIMD data, single accuracy.
3934  *
3935  * \param x The value to calculate erf(x) for.
3936  * \result erf(x)
3937  *
3938  * This routine achieves very close to single precision, but we do not care about
3939  * the last bit or the subnormal result range.
3940  */
3941 static inline SimdDouble gmx_simdcall erfSingleAccuracy(SimdDouble x)
3942 {
3943     // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
3944     const SimdDouble CA6(7.853861353153693e-5);
3945     const SimdDouble CA5(-8.010193625184903e-4);
3946     const SimdDouble CA4(5.188327685732524e-3);
3947     const SimdDouble CA3(-2.685381193529856e-2);
3948     const SimdDouble CA2(1.128358514861418e-1);
3949     const SimdDouble CA1(-3.761262582423300e-1);
3950     const SimdDouble CA0(1.128379165726710);
3951     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
3952     const SimdDouble CB9(-0.0018629930017603923);
3953     const SimdDouble CB8(0.003909821287598495);
3954     const SimdDouble CB7(-0.0052094582210355615);
3955     const SimdDouble CB6(0.005685614362160572);
3956     const SimdDouble CB5(-0.0025367682853477272);
3957     const SimdDouble CB4(-0.010199799682318782);
3958     const SimdDouble CB3(0.04369575504816542);
3959     const SimdDouble CB2(-0.11884063474674492);
3960     const SimdDouble CB1(0.2732120154030589);
3961     const SimdDouble CB0(0.42758357702025784);
3962     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
3963     const SimdDouble CC10(-0.0445555913112064);
3964     const SimdDouble CC9(0.21376355144663348);
3965     const SimdDouble CC8(-0.3473187200259257);
3966     const SimdDouble CC7(0.016690861551248114);
3967     const SimdDouble CC6(0.7560973182491192);
3968     const SimdDouble CC5(-1.2137903600145787);
3969     const SimdDouble CC4(0.8411872321232948);
3970     const SimdDouble CC3(-0.08670413896296343);
3971     const SimdDouble CC2(-0.27124782687240334);
3972     const SimdDouble CC1(-0.0007502488047806069);
3973     const SimdDouble CC0(0.5642114853803148);
3974     const SimdDouble one(1.0);
3975     const SimdDouble two(2.0);
3976
3977     SimdDouble x2, x4, y;
3978     SimdDouble t, t2, w, w2;
3979     SimdDouble pA0, pA1, pB0, pB1, pC0, pC1;
3980     SimdDouble expmx2;
3981     SimdDouble res_erf, res_erfc, res;
3982     SimdDBool  mask, msk_erf;
3983
3984     // Calculate erf()
3985     x2 = x * x;
3986     x4 = x2 * x2;
3987
3988     pA0 = fma(CA6, x4, CA4);
3989     pA1 = fma(CA5, x4, CA3);
3990     pA0 = fma(pA0, x4, CA2);
3991     pA1 = fma(pA1, x4, CA1);
3992     pA0 = pA0 * x4;
3993     pA0 = fma(pA1, x2, pA0);
3994     // Constant term must come last for precision reasons
3995     pA0 = pA0 + CA0;
3996
3997     res_erf = x * pA0;
3998
3999     // Calculate erfc
4000     y       = abs(x);
4001     msk_erf = (SimdDouble(0.75) <= y);
4002     t       = maskzInvSingleAccuracy(y, msk_erf);
4003     w       = t - one;
4004     t2      = t * t;
4005     w2      = w * w;
4006
4007     expmx2 = expSingleAccuracy(-y * y);
4008
4009     pB1 = fma(CB9, w2, CB7);
4010     pB0 = fma(CB8, w2, CB6);
4011     pB1 = fma(pB1, w2, CB5);
4012     pB0 = fma(pB0, w2, CB4);
4013     pB1 = fma(pB1, w2, CB3);
4014     pB0 = fma(pB0, w2, CB2);
4015     pB1 = fma(pB1, w2, CB1);
4016     pB0 = fma(pB0, w2, CB0);
4017     pB0 = fma(pB1, w, pB0);
4018
4019     pC0 = fma(CC10, t2, CC8);
4020     pC1 = fma(CC9, t2, CC7);
4021     pC0 = fma(pC0, t2, CC6);
4022     pC1 = fma(pC1, t2, CC5);
4023     pC0 = fma(pC0, t2, CC4);
4024     pC1 = fma(pC1, t2, CC3);
4025     pC0 = fma(pC0, t2, CC2);
4026     pC1 = fma(pC1, t2, CC1);
4027
4028     pC0 = fma(pC0, t2, CC0);
4029     pC0 = fma(pC1, t, pC0);
4030     pC0 = pC0 * t;
4031
4032     // Select pB0 or pC0 for erfc()
4033     mask     = (two < y);
4034     res_erfc = blend(pB0, pC0, mask);
4035     res_erfc = res_erfc * expmx2;
4036
4037     // erfc(x<0) = 2-erfc(|x|)
4038     mask     = (x < setZero());
4039     res_erfc = blend(res_erfc, two - res_erfc, mask);
4040
4041     // Select erf() or erfc()
4042     mask = (y < SimdDouble(0.75));
4043     res  = blend(one - res_erfc, res_erf, mask);
4044
4045     return res;
4046 }
4047
4048 /*! \brief SIMD erfc(x). Double precision SIMD data, single accuracy.
4049  *
4050  * \param x The value to calculate erfc(x) for.
4051  * \result erfc(x)
4052  *
4053  * This routine achieves singleprecision (bar the last bit) over most of the
4054  * input range, but for large arguments where the result is getting close
4055  * to the minimum representable numbers we accept slightly larger errors
4056  * (think results that are in the ballpark of 10^-30) since that is not
4057  * relevant for MD.
4058  */
4059 static inline SimdDouble gmx_simdcall erfcSingleAccuracy(SimdDouble x)
4060 {
4061     // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
4062     const SimdDouble CA6(7.853861353153693e-5);
4063     const SimdDouble CA5(-8.010193625184903e-4);
4064     const SimdDouble CA4(5.188327685732524e-3);
4065     const SimdDouble CA3(-2.685381193529856e-2);
4066     const SimdDouble CA2(1.128358514861418e-1);
4067     const SimdDouble CA1(-3.761262582423300e-1);
4068     const SimdDouble CA0(1.128379165726710);
4069     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
4070     const SimdDouble CB9(-0.0018629930017603923);
4071     const SimdDouble CB8(0.003909821287598495);
4072     const SimdDouble CB7(-0.0052094582210355615);
4073     const SimdDouble CB6(0.005685614362160572);
4074     const SimdDouble CB5(-0.0025367682853477272);
4075     const SimdDouble CB4(-0.010199799682318782);
4076     const SimdDouble CB3(0.04369575504816542);
4077     const SimdDouble CB2(-0.11884063474674492);
4078     const SimdDouble CB1(0.2732120154030589);
4079     const SimdDouble CB0(0.42758357702025784);
4080     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
4081     const SimdDouble CC10(-0.0445555913112064);
4082     const SimdDouble CC9(0.21376355144663348);
4083     const SimdDouble CC8(-0.3473187200259257);
4084     const SimdDouble CC7(0.016690861551248114);
4085     const SimdDouble CC6(0.7560973182491192);
4086     const SimdDouble CC5(-1.2137903600145787);
4087     const SimdDouble CC4(0.8411872321232948);
4088     const SimdDouble CC3(-0.08670413896296343);
4089     const SimdDouble CC2(-0.27124782687240334);
4090     const SimdDouble CC1(-0.0007502488047806069);
4091     const SimdDouble CC0(0.5642114853803148);
4092     const SimdDouble one(1.0);
4093     const SimdDouble two(2.0);
4094
4095     SimdDouble x2, x4, y;
4096     SimdDouble t, t2, w, w2;
4097     SimdDouble pA0, pA1, pB0, pB1, pC0, pC1;
4098     SimdDouble expmx2;
4099     SimdDouble res_erf, res_erfc, res;
4100     SimdDBool  mask, msk_erf;
4101
4102     // Calculate erf()
4103     x2 = x * x;
4104     x4 = x2 * x2;
4105
4106     pA0 = fma(CA6, x4, CA4);
4107     pA1 = fma(CA5, x4, CA3);
4108     pA0 = fma(pA0, x4, CA2);
4109     pA1 = fma(pA1, x4, CA1);
4110     pA1 = pA1 * x2;
4111     pA0 = fma(pA0, x4, pA1);
4112     // Constant term must come last for precision reasons
4113     pA0 = pA0 + CA0;
4114
4115     res_erf = x * pA0;
4116
4117     // Calculate erfc
4118     y       = abs(x);
4119     msk_erf = (SimdDouble(0.75) <= y);
4120     t       = maskzInvSingleAccuracy(y, msk_erf);
4121     w       = t - one;
4122     t2      = t * t;
4123     w2      = w * w;
4124
4125     expmx2 = expSingleAccuracy(-y * y);
4126
4127     pB1 = fma(CB9, w2, CB7);
4128     pB0 = fma(CB8, w2, CB6);
4129     pB1 = fma(pB1, w2, CB5);
4130     pB0 = fma(pB0, w2, CB4);
4131     pB1 = fma(pB1, w2, CB3);
4132     pB0 = fma(pB0, w2, CB2);
4133     pB1 = fma(pB1, w2, CB1);
4134     pB0 = fma(pB0, w2, CB0);
4135     pB0 = fma(pB1, w, pB0);
4136
4137     pC0 = fma(CC10, t2, CC8);
4138     pC1 = fma(CC9, t2, CC7);
4139     pC0 = fma(pC0, t2, CC6);
4140     pC1 = fma(pC1, t2, CC5);
4141     pC0 = fma(pC0, t2, CC4);
4142     pC1 = fma(pC1, t2, CC3);
4143     pC0 = fma(pC0, t2, CC2);
4144     pC1 = fma(pC1, t2, CC1);
4145
4146     pC0 = fma(pC0, t2, CC0);
4147     pC0 = fma(pC1, t, pC0);
4148     pC0 = pC0 * t;
4149
4150     // Select pB0 or pC0 for erfc()
4151     mask     = (two < y);
4152     res_erfc = blend(pB0, pC0, mask);
4153     res_erfc = res_erfc * expmx2;
4154
4155     // erfc(x<0) = 2-erfc(|x|)
4156     mask     = (x < setZero());
4157     res_erfc = blend(res_erfc, two - res_erfc, mask);
4158
4159     // Select erf() or erfc()
4160     mask = (y < SimdDouble(0.75));
4161     res  = blend(res_erfc, one - res_erf, mask);
4162
4163     return res;
4164 }
4165
4166 /*! \brief SIMD sin \& cos. Double precision SIMD data, single accuracy.
4167  *
4168  * \param x The argument to evaluate sin/cos for
4169  * \param[out] sinval Sin(x)
4170  * \param[out] cosval Cos(x)
4171  */
4172 static inline void gmx_simdcall sinCosSingleAccuracy(SimdDouble x, SimdDouble* sinval, SimdDouble* cosval)
4173 {
4174     // Constants to subtract Pi/4*x from y while minimizing precision loss
4175     const SimdDouble argred0(2 * 0.78539816290140151978);
4176     const SimdDouble argred1(2 * 4.9604678871439933374e-10);
4177     const SimdDouble argred2(2 * 1.1258708853173288931e-18);
4178     const SimdDouble two_over_pi(2.0 / M_PI);
4179     const SimdDouble const_sin2(-1.9515295891e-4);
4180     const SimdDouble const_sin1(8.3321608736e-3);
4181     const SimdDouble const_sin0(-1.6666654611e-1);
4182     const SimdDouble const_cos2(2.443315711809948e-5);
4183     const SimdDouble const_cos1(-1.388731625493765e-3);
4184     const SimdDouble const_cos0(4.166664568298827e-2);
4185
4186     const SimdDouble half(0.5);
4187     const SimdDouble one(1.0);
4188     SimdDouble       ssign, csign;
4189     SimdDouble       x2, y, z, psin, pcos, sss, ccc;
4190     SimdDBool        mask;
4191
4192 #        if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
4193     const SimdDInt32 ione(1);
4194     const SimdDInt32 itwo(2);
4195     SimdDInt32       iy;
4196
4197     z  = x * two_over_pi;
4198     iy = cvtR2I(z);
4199     y  = round(z);
4200
4201     mask  = cvtIB2B((iy & ione) == setZero());
4202     ssign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B((iy & itwo) == itwo));
4203     csign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
4204 #        else
4205     const SimdDouble quarter(0.25);
4206     const SimdDouble minusquarter(-0.25);
4207     SimdDouble       q;
4208     SimdDBool        m1, m2, m3;
4209
4210     /* The most obvious way to find the arguments quadrant in the unit circle
4211      * to calculate the sign is to use integer arithmetic, but that is not
4212      * present in all SIMD implementations. As an alternative, we have devised a
4213      * pure floating-point algorithm that uses truncation for argument reduction
4214      * so that we get a new value 0<=q<1 over the unit circle, and then
4215      * do floating-point comparisons with fractions. This is likely to be
4216      * slightly slower (~10%) due to the longer latencies of floating-point, so
4217      * we only use it when integer SIMD arithmetic is not present.
4218      */
4219     ssign = x;
4220     x     = abs(x);
4221     // It is critical that half-way cases are rounded down
4222     z = fma(x, two_over_pi, half);
4223     y = trunc(z);
4224     q = z * quarter;
4225     q = q - trunc(q);
4226     /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
4227      * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
4228      * This removes the 2*Pi periodicity without using any integer arithmetic.
4229      * First check if y had the value 2 or 3, set csign if true.
4230      */
4231     q = q - half;
4232     /* If we have logical operations we can work directly on the signbit, which
4233      * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
4234      * Thus, if you are altering defines to debug alternative code paths, the
4235      * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
4236      * active or inactive - you will get errors if only one is used.
4237      */
4238 #            if GMX_SIMD_HAVE_LOGICAL
4239     ssign = ssign & SimdDouble(GMX_DOUBLE_NEGZERO);
4240     csign = andNot(q, SimdDouble(GMX_DOUBLE_NEGZERO));
4241     ssign = ssign ^ csign;
4242 #            else
4243     ssign = copysign(SimdDouble(1.0), ssign);
4244     csign = copysign(SimdDouble(1.0), q);
4245     csign = -csign;
4246     ssign = ssign * csign; // swap ssign if csign was set.
4247 #            endif
4248     // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
4249     m1   = (q < minusquarter);
4250     m2   = (setZero() <= q);
4251     m3   = (q < quarter);
4252     m2   = m2 && m3;
4253     mask = m1 || m2;
4254     // where mask is FALSE, swap sign.
4255     csign   = csign * blend(SimdDouble(-1.0), one, mask);
4256 #        endif
4257     x  = fnma(y, argred0, x);
4258     x  = fnma(y, argred1, x);
4259     x  = fnma(y, argred2, x);
4260     x2 = x * x;
4261
4262     psin = fma(const_sin2, x2, const_sin1);
4263     psin = fma(psin, x2, const_sin0);
4264     psin = fma(psin, x * x2, x);
4265     pcos = fma(const_cos2, x2, const_cos1);
4266     pcos = fma(pcos, x2, const_cos0);
4267     pcos = fms(pcos, x2, half);
4268     pcos = fma(pcos, x2, one);
4269
4270     sss = blend(pcos, psin, mask);
4271     ccc = blend(psin, pcos, mask);
4272     // See comment for GMX_SIMD_HAVE_LOGICAL section above.
4273 #        if GMX_SIMD_HAVE_LOGICAL
4274     *sinval = sss ^ ssign;
4275     *cosval = ccc ^ csign;
4276 #        else
4277     *sinval = sss * ssign;
4278     *cosval = ccc * csign;
4279 #        endif
4280 }
4281
4282 /*! \brief SIMD sin(x). Double precision SIMD data, single accuracy.
4283  *
4284  * \param x The argument to evaluate sin for
4285  * \result Sin(x)
4286  *
4287  * \attention Do NOT call both sin & cos if you need both results, since each of them
4288  * will then call \ref sincos and waste a factor 2 in performance.
4289  */
4290 static inline SimdDouble gmx_simdcall sinSingleAccuracy(SimdDouble x)
4291 {
4292     SimdDouble s, c;
4293     sinCosSingleAccuracy(x, &s, &c);
4294     return s;
4295 }
4296
4297 /*! \brief SIMD cos(x). Double precision SIMD data, single accuracy.
4298  *
4299  * \param x The argument to evaluate cos for
4300  * \result Cos(x)
4301  *
4302  * \attention Do NOT call both sin & cos if you need both results, since each of them
4303  * will then call \ref sincos and waste a factor 2 in performance.
4304  */
4305 static inline SimdDouble gmx_simdcall cosSingleAccuracy(SimdDouble x)
4306 {
4307     SimdDouble s, c;
4308     sinCosSingleAccuracy(x, &s, &c);
4309     return c;
4310 }
4311
4312 /*! \brief SIMD tan(x). Double precision SIMD data, single accuracy.
4313  *
4314  * \param x The argument to evaluate tan for
4315  * \result Tan(x)
4316  */
4317 static inline SimdDouble gmx_simdcall tanSingleAccuracy(SimdDouble x)
4318 {
4319     const SimdDouble argred0(2 * 0.78539816290140151978);
4320     const SimdDouble argred1(2 * 4.9604678871439933374e-10);
4321     const SimdDouble argred2(2 * 1.1258708853173288931e-18);
4322     const SimdDouble two_over_pi(2.0 / M_PI);
4323     const SimdDouble CT6(0.009498288995810566122993911);
4324     const SimdDouble CT5(0.002895755790837379295226923);
4325     const SimdDouble CT4(0.02460087336161924491836265);
4326     const SimdDouble CT3(0.05334912882656359828045988);
4327     const SimdDouble CT2(0.1333989091464957704418495);
4328     const SimdDouble CT1(0.3333307599244198227797507);
4329
4330     SimdDouble x2, p, y, z;
4331     SimdDBool  mask;
4332
4333 #        if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
4334     SimdDInt32 iy;
4335     SimdDInt32 ione(1);
4336
4337     z    = x * two_over_pi;
4338     iy   = cvtR2I(z);
4339     y    = round(z);
4340     mask = cvtIB2B((iy & ione) == ione);
4341
4342     x = fnma(y, argred0, x);
4343     x = fnma(y, argred1, x);
4344     x = fnma(y, argred2, x);
4345     x = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), mask) ^ x;
4346 #        else
4347     const SimdDouble quarter(0.25);
4348     const SimdDouble half(0.5);
4349     const SimdDouble threequarter(0.75);
4350     SimdDouble       w, q;
4351     SimdDBool        m1, m2, m3;
4352
4353     w    = abs(x);
4354     z    = fma(w, two_over_pi, half);
4355     y    = trunc(z);
4356     q    = z * quarter;
4357     q    = q - trunc(q);
4358     m1   = (quarter <= q);
4359     m2   = (q < half);
4360     m3   = (threequarter <= q);
4361     m1   = m1 && m2;
4362     mask = m1 || m3;
4363     w    = fnma(y, argred0, w);
4364     w    = fnma(y, argred1, w);
4365     w    = fnma(y, argred2, w);
4366
4367     w = blend(w, -w, mask);
4368     x = w * copysign(SimdDouble(1.0), x);
4369 #        endif
4370     x2 = x * x;
4371     p  = fma(CT6, x2, CT5);
4372     p  = fma(p, x2, CT4);
4373     p  = fma(p, x2, CT3);
4374     p  = fma(p, x2, CT2);
4375     p  = fma(p, x2, CT1);
4376     p  = fma(x2, p * x, x);
4377
4378     p = blend(p, maskzInvSingleAccuracy(p, mask), mask);
4379     return p;
4380 }
4381
4382 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
4383  *
4384  * \param x The argument to evaluate asin for
4385  * \result Asin(x)
4386  */
4387 static inline SimdDouble gmx_simdcall asinSingleAccuracy(SimdDouble x)
4388 {
4389     const SimdDouble limitlow(1e-4);
4390     const SimdDouble half(0.5);
4391     const SimdDouble one(1.0);
4392     const SimdDouble halfpi(M_PI / 2.0);
4393     const SimdDouble CC5(4.2163199048E-2);
4394     const SimdDouble CC4(2.4181311049E-2);
4395     const SimdDouble CC3(4.5470025998E-2);
4396     const SimdDouble CC2(7.4953002686E-2);
4397     const SimdDouble CC1(1.6666752422E-1);
4398     SimdDouble       xabs;
4399     SimdDouble       z, z1, z2, q, q1, q2;
4400     SimdDouble       pA, pB;
4401     SimdDBool        mask, mask2;
4402
4403     xabs  = abs(x);
4404     mask  = (half < xabs);
4405     z1    = half * (one - xabs);
4406     mask2 = (xabs < one);
4407     q1    = z1 * maskzInvsqrtSingleAccuracy(z1, mask2);
4408     q2    = xabs;
4409     z2    = q2 * q2;
4410     z     = blend(z2, z1, mask);
4411     q     = blend(q2, q1, mask);
4412
4413     z2 = z * z;
4414     pA = fma(CC5, z2, CC3);
4415     pB = fma(CC4, z2, CC2);
4416     pA = fma(pA, z2, CC1);
4417     pA = pA * z;
4418     z  = fma(pB, z2, pA);
4419     z  = fma(z, q, q);
4420     q2 = halfpi - z;
4421     q2 = q2 - z;
4422     z  = blend(z, q2, mask);
4423
4424     mask = (limitlow < xabs);
4425     z    = blend(xabs, z, mask);
4426     z    = copysign(z, x);
4427
4428     return z;
4429 }
4430
4431 /*! \brief SIMD acos(x). Double precision SIMD data, single accuracy.
4432  *
4433  * \param x The argument to evaluate acos for
4434  * \result Acos(x)
4435  */
4436 static inline SimdDouble gmx_simdcall acosSingleAccuracy(SimdDouble x)
4437 {
4438     const SimdDouble one(1.0);
4439     const SimdDouble half(0.5);
4440     const SimdDouble pi(M_PI);
4441     const SimdDouble halfpi(M_PI / 2.0);
4442     SimdDouble       xabs;
4443     SimdDouble       z, z1, z2, z3;
4444     SimdDBool        mask1, mask2, mask3;
4445
4446     xabs  = abs(x);
4447     mask1 = (half < xabs);
4448     mask2 = (setZero() < x);
4449
4450     z     = half * (one - xabs);
4451     mask3 = (xabs < one);
4452     z     = z * maskzInvsqrtSingleAccuracy(z, mask3);
4453     z     = blend(x, z, mask1);
4454     z     = asinSingleAccuracy(z);
4455
4456     z2 = z + z;
4457     z1 = pi - z2;
4458     z3 = halfpi - z;
4459     z  = blend(z1, z2, mask2);
4460     z  = blend(z3, z, mask1);
4461
4462     return z;
4463 }
4464
4465 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
4466  *
4467  * \param x The argument to evaluate atan for
4468  * \result Atan(x), same argument/value range as standard math library.
4469  */
4470 static inline SimdDouble gmx_simdcall atanSingleAccuracy(SimdDouble x)
4471 {
4472     const SimdDouble halfpi(M_PI / 2);
4473     const SimdDouble CA17(0.002823638962581753730774);
4474     const SimdDouble CA15(-0.01595690287649631500244);
4475     const SimdDouble CA13(0.04250498861074447631836);
4476     const SimdDouble CA11(-0.07489009201526641845703);
4477     const SimdDouble CA9(0.1063479334115982055664);
4478     const SimdDouble CA7(-0.1420273631811141967773);
4479     const SimdDouble CA5(0.1999269574880599975585);
4480     const SimdDouble CA3(-0.3333310186862945556640);
4481     SimdDouble       x2, x3, x4, pA, pB;
4482     SimdDBool        mask, mask2;
4483
4484     mask  = (x < setZero());
4485     x     = abs(x);
4486     mask2 = (SimdDouble(1.0) < x);
4487     x     = blend(x, maskzInvSingleAccuracy(x, mask2), mask2);
4488
4489     x2 = x * x;
4490     x3 = x2 * x;
4491     x4 = x2 * x2;
4492     pA = fma(CA17, x4, CA13);
4493     pB = fma(CA15, x4, CA11);
4494     pA = fma(pA, x4, CA9);
4495     pB = fma(pB, x4, CA7);
4496     pA = fma(pA, x4, CA5);
4497     pB = fma(pB, x4, CA3);
4498     pA = fma(pA, x2, pB);
4499     pA = fma(pA, x3, x);
4500
4501     pA = blend(pA, halfpi - pA, mask2);
4502     pA = blend(pA, -pA, mask);
4503
4504     return pA;
4505 }
4506
4507 /*! \brief SIMD atan2(y,x). Double precision SIMD data, single accuracy.
4508  *
4509  * \param y Y component of vector, any quartile
4510  * \param x X component of vector, any quartile
4511  * \result Atan(y,x), same argument/value range as standard math library.
4512  *
4513  * \note This routine should provide correct results for all finite
4514  * non-zero or positive-zero arguments. However, negative zero arguments will
4515  * be treated as positive zero, which means the return value will deviate from
4516  * the standard math library atan2(y,x) for those cases. That should not be
4517  * of any concern in Gromacs, and in particular it will not affect calculations
4518  * of angles from vectors.
4519  */
4520 static inline SimdDouble gmx_simdcall atan2SingleAccuracy(SimdDouble y, SimdDouble x)
4521 {
4522     const SimdDouble pi(M_PI);
4523     const SimdDouble halfpi(M_PI / 2.0);
4524     SimdDouble       xinv, p, aoffset;
4525     SimdDBool        mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
4526
4527     mask_xnz  = x != setZero();
4528     mask_ynz  = y != setZero();
4529     mask_xlt0 = (x < setZero());
4530     mask_ylt0 = (y < setZero());
4531
4532     aoffset = selectByNotMask(halfpi, mask_xnz);
4533     aoffset = selectByMask(aoffset, mask_ynz);
4534
4535     aoffset = blend(aoffset, pi, mask_xlt0);
4536     aoffset = blend(aoffset, -aoffset, mask_ylt0);
4537
4538     xinv = maskzInvSingleAccuracy(x, mask_xnz);
4539     p    = y * xinv;
4540     p    = atanSingleAccuracy(p);
4541     p    = p + aoffset;
4542
4543     return p;
4544 }
4545
4546 /*! \brief Analytical PME force correction, double SIMD data, single accuracy.
4547  *
4548  * \param z2 \f$(r \beta)^2\f$ - see below for details.
4549  * \result Correction factor to coulomb force - see below for details.
4550  *
4551  * This routine is meant to enable analytical evaluation of the
4552  * direct-space PME electrostatic force to avoid tables.
4553  *
4554  * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
4555  * are some problems evaluating that:
4556  *
4557  * First, the error function is difficult (read: expensive) to
4558  * approxmiate accurately for intermediate to large arguments, and
4559  * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
4560  * Second, we now try to avoid calculating potentials in Gromacs but
4561  * use forces directly.
4562  *
4563  * We can simply things slight by noting that the PME part is really
4564  * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
4565  * \f[
4566  * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
4567  * \f]
4568  * The first term we already have from the inverse square root, so
4569  * that we can leave out of this routine.
4570  *
4571  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
4572  * the argument \f$beta r\f$ will be in the range 0.15 to ~4. Use your
4573  * favorite plotting program to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is
4574  * in this range!
4575  *
4576  * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
4577  * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
4578  * then only use even powers. This is another minor optimization, since
4579  * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
4580  * the vector between the two atoms to get the vectorial force. The
4581  * fastest flops are the ones we can avoid calculating!
4582  *
4583  * So, here's how it should be used:
4584  *
4585  * 1. Calculate \f$r^2\f$.
4586  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
4587  * 3. Evaluate this routine with \f$z^2\f$ as the argument.
4588  * 4. The return value is the expression:
4589  *
4590  * \f[
4591  *    \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
4592  * \f]
4593  *
4594  * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
4595  *
4596  *  \f[
4597  *    \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
4598  *  \f]
4599  *
4600  *    or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
4601  *
4602  *  \f[
4603  *    \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
4604  *  \f]
4605  *
4606  *    With a bit of math exercise you should be able to confirm that
4607  *    this is exactly
4608  *
4609  *  \f[
4610  *   \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
4611  *  \f]
4612  *
4613  * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
4614  *    and you have your force (divided by \f$r\f$). A final multiplication
4615  *    with the vector connecting the two particles and you have your
4616  *    vectorial force to add to the particles.
4617  *
4618  * This approximation achieves an accuracy slightly lower than 1e-6; when
4619  * added to \f$1/r\f$ the error will be insignificant.
4620  *
4621  */
4622 static inline SimdDouble gmx_simdcall pmeForceCorrectionSingleAccuracy(SimdDouble z2)
4623 {
4624     const SimdDouble FN6(-1.7357322914161492954e-8);
4625     const SimdDouble FN5(1.4703624142580877519e-6);
4626     const SimdDouble FN4(-0.000053401640219807709149);
4627     const SimdDouble FN3(0.0010054721316683106153);
4628     const SimdDouble FN2(-0.019278317264888380590);
4629     const SimdDouble FN1(0.069670166153766424023);
4630     const SimdDouble FN0(-0.75225204789749321333);
4631
4632     const SimdDouble FD4(0.0011193462567257629232);
4633     const SimdDouble FD3(0.014866955030185295499);
4634     const SimdDouble FD2(0.11583842382862377919);
4635     const SimdDouble FD1(0.50736591960530292870);
4636     const SimdDouble FD0(1.0);
4637
4638     SimdDouble z4;
4639     SimdDouble polyFN0, polyFN1, polyFD0, polyFD1;
4640
4641     z4 = z2 * z2;
4642
4643     polyFD0 = fma(FD4, z4, FD2);
4644     polyFD1 = fma(FD3, z4, FD1);
4645     polyFD0 = fma(polyFD0, z4, FD0);
4646     polyFD0 = fma(polyFD1, z2, polyFD0);
4647
4648     polyFD0 = invSingleAccuracy(polyFD0);
4649
4650     polyFN0 = fma(FN6, z4, FN4);
4651     polyFN1 = fma(FN5, z4, FN3);
4652     polyFN0 = fma(polyFN0, z4, FN2);
4653     polyFN1 = fma(polyFN1, z4, FN1);
4654     polyFN0 = fma(polyFN0, z4, FN0);
4655     polyFN0 = fma(polyFN1, z2, polyFN0);
4656
4657     return polyFN0 * polyFD0;
4658 }
4659
4660
4661 /*! \brief Analytical PME potential correction, double SIMD data, single accuracy.
4662  *
4663  * \param z2 \f$(r \beta)^2\f$ - see below for details.
4664  * \result Correction factor to coulomb potential - see below for details.
4665  *
4666  * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
4667  * as the input argument.
4668  *
4669  * Here's how it should be used:
4670  *
4671  * 1. Calculate \f$r^2\f$.
4672  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
4673  * 3. Evaluate this routine with z^2 as the argument.
4674  * 4. The return value is the expression:
4675  *
4676  *  \f[
4677  *   \frac{\mbox{erf}(z)}{z}
4678  *  \f]
4679  *
4680  * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
4681  *
4682  *  \f[
4683  *    \frac{\mbox{erf}(r \beta)}{r}
4684  *  \f]
4685  *
4686  * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
4687  *    and you have your potential.
4688  *
4689  * This approximation achieves an accuracy slightly lower than 1e-6; when
4690  * added to \f$1/r\f$ the error will be insignificant.
4691  */
4692 static inline SimdDouble gmx_simdcall pmePotentialCorrectionSingleAccuracy(SimdDouble z2)
4693 {
4694     const SimdDouble VN6(1.9296833005951166339e-8);
4695     const SimdDouble VN5(-1.4213390571557850962e-6);
4696     const SimdDouble VN4(0.000041603292906656984871);
4697     const SimdDouble VN3(-0.00013134036773265025626);
4698     const SimdDouble VN2(0.038657983986041781264);
4699     const SimdDouble VN1(0.11285044772717598220);
4700     const SimdDouble VN0(1.1283802385263030286);
4701
4702     const SimdDouble VD3(0.0066752224023576045451);
4703     const SimdDouble VD2(0.078647795836373922256);
4704     const SimdDouble VD1(0.43336185284710920150);
4705     const SimdDouble VD0(1.0);
4706
4707     SimdDouble z4;
4708     SimdDouble polyVN0, polyVN1, polyVD0, polyVD1;
4709
4710     z4 = z2 * z2;
4711
4712     polyVD1 = fma(VD3, z4, VD1);
4713     polyVD0 = fma(VD2, z4, VD0);
4714     polyVD0 = fma(polyVD1, z2, polyVD0);
4715
4716     polyVD0 = invSingleAccuracy(polyVD0);
4717
4718     polyVN0 = fma(VN6, z4, VN4);
4719     polyVN1 = fma(VN5, z4, VN3);
4720     polyVN0 = fma(polyVN0, z4, VN2);
4721     polyVN1 = fma(polyVN1, z4, VN1);
4722     polyVN0 = fma(polyVN0, z4, VN0);
4723     polyVN0 = fma(polyVN1, z2, polyVN0);
4724
4725     return polyVN0 * polyVD0;
4726 }
4727
4728 #    endif
4729
4730
4731 /*! \name SIMD4 math functions
4732  *
4733  * \note Only a subset of the math functions are implemented for SIMD4.
4734  *  \{
4735  */
4736
4737
4738 #    if GMX_SIMD4_HAVE_FLOAT
4739
4740 /*************************************************************************
4741  * SINGLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4742  *************************************************************************/
4743
4744 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 float.
4745  *
4746  * This is a low-level routine that should only be used by SIMD math routine
4747  * that evaluates the inverse square root.
4748  *
4749  *  \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
4750  *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
4751  *  \return   An improved approximation with roughly twice as many bits of accuracy.
4752  */
4753 static inline Simd4Float gmx_simdcall rsqrtIter(Simd4Float lu, Simd4Float x)
4754 {
4755     Simd4Float tmp1 = x * lu;
4756     Simd4Float tmp2 = Simd4Float(-0.5F) * lu;
4757     tmp1            = fma(tmp1, lu, Simd4Float(-3.0F));
4758     return tmp1 * tmp2;
4759 }
4760
4761 /*! \brief Calculate 1/sqrt(x) for SIMD4 float.
4762  *
4763  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4764  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4765  *           For the single precision implementation this is obviously always
4766  *           true for positive values, but for double precision it adds an
4767  *           extra restriction since the first lookup step might have to be
4768  *           performed in single precision on some architectures. Note that the
4769  *           responsibility for checking falls on you - this routine does not
4770  *           check arguments.
4771  *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
4772  */
4773 static inline Simd4Float gmx_simdcall invsqrt(Simd4Float x)
4774 {
4775     Simd4Float lu = rsqrt(x);
4776 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4777     lu = rsqrtIter(lu, x);
4778 #        endif
4779 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4780     lu = rsqrtIter(lu, x);
4781 #        endif
4782 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4783     lu = rsqrtIter(lu, x);
4784 #        endif
4785     return lu;
4786 }
4787
4788
4789 #    endif // GMX_SIMD4_HAVE_FLOAT
4790
4791
4792 #    if GMX_SIMD4_HAVE_DOUBLE
4793 /*************************************************************************
4794  * DOUBLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4795  *************************************************************************/
4796
4797 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 double.
4798  *
4799  * This is a low-level routine that should only be used by SIMD math routine
4800  * that evaluates the inverse square root.
4801  *
4802  *  \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
4803  *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
4804  *  \return   An improved approximation with roughly twice as many bits of accuracy.
4805  */
4806 static inline Simd4Double gmx_simdcall rsqrtIter(Simd4Double lu, Simd4Double x)
4807 {
4808     Simd4Double tmp1 = x * lu;
4809     Simd4Double tmp2 = Simd4Double(-0.5F) * lu;
4810     tmp1             = fma(tmp1, lu, Simd4Double(-3.0F));
4811     return tmp1 * tmp2;
4812 }
4813
4814 /*! \brief Calculate 1/sqrt(x) for SIMD4 double.
4815  *
4816  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4817  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4818  *           For the single precision implementation this is obviously always
4819  *           true for positive values, but for double precision it adds an
4820  *           extra restriction since the first lookup step might have to be
4821  *           performed in single precision on some architectures. Note that the
4822  *           responsibility for checking falls on you - this routine does not
4823  *           check arguments.
4824  *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
4825  */
4826 static inline Simd4Double gmx_simdcall invsqrt(Simd4Double x)
4827 {
4828     Simd4Double lu = rsqrt(x);
4829 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4830     lu = rsqrtIter(lu, x);
4831 #        endif
4832 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4833     lu = rsqrtIter(lu, x);
4834 #        endif
4835 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4836     lu = rsqrtIter(lu, x);
4837 #        endif
4838 #        if (GMX_SIMD_RSQRT_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4839     lu = rsqrtIter(lu, x);
4840 #        endif
4841     return lu;
4842 }
4843
4844
4845 /**********************************************************************
4846  * SIMD4 MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
4847  **********************************************************************/
4848
4849 /*! \brief Calculate 1/sqrt(x) for SIMD4 double, but in single accuracy.
4850  *
4851  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4852  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4853  *           For the single precision implementation this is obviously always
4854  *           true for positive values, but for double precision it adds an
4855  *           extra restriction since the first lookup step might have to be
4856  *           performed in single precision on some architectures. Note that the
4857  *           responsibility for checking falls on you - this routine does not
4858  *           check arguments.
4859  *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
4860  */
4861 static inline Simd4Double gmx_simdcall invsqrtSingleAccuracy(Simd4Double x)
4862 {
4863     Simd4Double lu = rsqrt(x);
4864 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4865     lu = rsqrtIter(lu, x);
4866 #        endif
4867 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4868     lu = rsqrtIter(lu, x);
4869 #        endif
4870 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4871     lu = rsqrtIter(lu, x);
4872 #        endif
4873     return lu;
4874 }
4875
4876
4877 #    endif // GMX_SIMD4_HAVE_DOUBLE
4878
4879 /*! \} */
4880
4881 #    if GMX_SIMD_HAVE_FLOAT
4882 /*! \brief Calculate 1/sqrt(x) for SIMD float, only targeting single accuracy.
4883  *
4884  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4885  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4886  *           For the single precision implementation this is obviously always
4887  *           true for positive values, but for double precision it adds an
4888  *           extra restriction since the first lookup step might have to be
4889  *           performed in single precision on some architectures. Note that the
4890  *           responsibility for checking falls on you - this routine does not
4891  *           check arguments.
4892  *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
4893  */
4894 static inline SimdFloat gmx_simdcall invsqrtSingleAccuracy(SimdFloat x)
4895 {
4896     return invsqrt(x);
4897 }
4898
4899 /*! \brief Calculate 1/sqrt(x) for masked SIMD floats, only targeting single accuracy.
4900  *
4901  *  This routine only evaluates 1/sqrt(x) for elements for which mask is true.
4902  *  Illegal values in the masked-out elements will not lead to
4903  *  floating-point exceptions.
4904  *
4905  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4906  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4907  *           For the single precision implementation this is obviously always
4908  *           true for positive values, but for double precision it adds an
4909  *           extra restriction since the first lookup step might have to be
4910  *           performed in single precision on some architectures. Note that the
4911  *           responsibility for checking falls on you - this routine does not
4912  *           check arguments.
4913  *  \param m Mask
4914  *  \return  1/sqrt(x). Result is undefined if your argument was invalid or
4915  *           entry was not masked, and 0.0 for masked-out entries.
4916  */
4917 static inline SimdFloat maskzInvsqrtSingleAccuracy(SimdFloat x, SimdFBool m)
4918 {
4919     return maskzInvsqrt(x, m);
4920 }
4921
4922 /*! \brief Calculate 1/sqrt(x) for two SIMD floats, only targeting single accuracy.
4923  *
4924  * \param x0  First set of arguments, x0 must be in single range (see below).
4925  * \param x1  Second set of arguments, x1 must be in single range (see below).
4926  * \param[out] out0  Result 1/sqrt(x0)
4927  * \param[out] out1  Result 1/sqrt(x1)
4928  *
4929  *  In particular for double precision we can sometimes calculate square root
4930  *  pairs slightly faster by using single precision until the very last step.
4931  *
4932  * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
4933  *       GMX_FLOAT_MAX, i.e. within the range of single precision.
4934  *       For the single precision implementation this is obviously always
4935  *       true for positive values, but for double precision it adds an
4936  *       extra restriction since the first lookup step might have to be
4937  *       performed in single precision on some architectures. Note that the
4938  *       responsibility for checking falls on you - this routine does not
4939  *       check arguments.
4940  */
4941 static inline void gmx_simdcall invsqrtPairSingleAccuracy(SimdFloat x0, SimdFloat x1, SimdFloat* out0, SimdFloat* out1)
4942 {
4943     return invsqrtPair(x0, x1, out0, out1);
4944 }
4945
4946 /*! \brief Calculate 1/x for SIMD float, only targeting single accuracy.
4947  *
4948  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
4949  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4950  *           For the single precision implementation this is obviously always
4951  *           true for positive values, but for double precision it adds an
4952  *           extra restriction since the first lookup step might have to be
4953  *           performed in single precision on some architectures. Note that the
4954  *           responsibility for checking falls on you - this routine does not
4955  *           check arguments.
4956  *  \return  1/x. Result is undefined if your argument was invalid.
4957  */
4958 static inline SimdFloat gmx_simdcall invSingleAccuracy(SimdFloat x)
4959 {
4960     return inv(x);
4961 }
4962
4963
4964 /*! \brief Calculate 1/x for masked SIMD floats, only targeting single accuracy.
4965  *
4966  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
4967  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4968  *           For the single precision implementation this is obviously always
4969  *           true for positive values, but for double precision it adds an
4970  *           extra restriction since the first lookup step might have to be
4971  *           performed in single precision on some architectures. Note that the
4972  *           responsibility for checking falls on you - this routine does not
4973  *           check arguments.
4974  *  \param m Mask
4975  *  \return  1/x for elements where m is true, or 0.0 for masked-out entries.
4976  */
4977 static inline SimdFloat maskzInvSingleAccuracy(SimdFloat x, SimdFBool m)
4978 {
4979     return maskzInv(x, m);
4980 }
4981
4982 /*! \brief Calculate sqrt(x) for SIMD float, always targeting single accuracy.
4983  *
4984  * \copydetails sqrt(SimdFloat)
4985  */
4986 template<MathOptimization opt = MathOptimization::Safe>
4987 static inline SimdFloat gmx_simdcall sqrtSingleAccuracy(SimdFloat x)
4988 {
4989     return sqrt<opt>(x);
4990 }
4991
4992 /*! \brief Calculate cbrt(x) for SIMD float, always targeting single accuracy.
4993  *
4994  * \copydetails cbrt(SimdFloat)
4995  */
4996 static inline SimdFloat gmx_simdcall cbrtSingleAccuracy(SimdFloat x)
4997 {
4998     return cbrt(x);
4999 }
5000
5001 /*! \brief Calculate 1/cbrt(x) for SIMD float, always targeting single accuracy.
5002  *
5003  * \copydetails cbrt(SimdFloat)
5004  */
5005 static inline SimdFloat gmx_simdcall invcbrtSingleAccuracy(SimdFloat x)
5006 {
5007     return invcbrt(x);
5008 }
5009
5010 /*! \brief SIMD float log2(x), only targeting single accuracy. This is the base-2 logarithm.
5011  *
5012  * \param x Argument, should be >0.
5013  * \result The base-2 logarithm of x. Undefined if argument is invalid.
5014  */
5015 static inline SimdFloat gmx_simdcall log2SingleAccuracy(SimdFloat x)
5016 {
5017     return log2(x);
5018 }
5019
5020 /*! \brief SIMD float log(x), only targeting single accuracy. This is the natural logarithm.
5021  *
5022  * \param x Argument, should be >0.
5023  * \result The natural logarithm of x. Undefined if argument is invalid.
5024  */
5025 static inline SimdFloat gmx_simdcall logSingleAccuracy(SimdFloat x)
5026 {
5027     return log(x);
5028 }
5029
5030 /*! \brief SIMD float 2^x, only targeting single accuracy.
5031  *
5032  * \copydetails exp2(SimdFloat)
5033  */
5034 template<MathOptimization opt = MathOptimization::Safe>
5035 static inline SimdFloat gmx_simdcall exp2SingleAccuracy(SimdFloat x)
5036 {
5037     return exp2<opt>(x);
5038 }
5039
5040 /*! \brief SIMD float e^x, only targeting single accuracy.
5041  *
5042  * \copydetails exp(SimdFloat)
5043  */
5044 template<MathOptimization opt = MathOptimization::Safe>
5045 static inline SimdFloat gmx_simdcall expSingleAccuracy(SimdFloat x)
5046 {
5047     return exp<opt>(x);
5048 }
5049
5050 /*! \brief SIMD pow(x,y), only targeting single accuracy.
5051  *
5052  * \copydetails pow(SimdFloat)
5053  */
5054 template<MathOptimization opt = MathOptimization::Safe>
5055 static inline SimdFloat gmx_simdcall powSingleAccuracy(SimdFloat x, SimdFloat y)
5056 {
5057     return pow<opt>(x, y);
5058 }
5059
5060 /*! \brief SIMD float erf(x), only targeting single accuracy.
5061  *
5062  * \param x The value to calculate erf(x) for.
5063  * \result erf(x)
5064  *
5065  * This routine achieves very close to single precision, but we do not care about
5066  * the last bit or the subnormal result range.
5067  */
5068 static inline SimdFloat gmx_simdcall erfSingleAccuracy(SimdFloat x)
5069 {
5070     return erf(x);
5071 }
5072
5073 /*! \brief SIMD float erfc(x), only targeting single accuracy.
5074  *
5075  * \param x The value to calculate erfc(x) for.
5076  * \result erfc(x)
5077  *
5078  * This routine achieves singleprecision (bar the last bit) over most of the
5079  * input range, but for large arguments where the result is getting close
5080  * to the minimum representable numbers we accept slightly larger errors
5081  * (think results that are in the ballpark of 10^-30) since that is not
5082  * relevant for MD.
5083  */
5084 static inline SimdFloat gmx_simdcall erfcSingleAccuracy(SimdFloat x)
5085 {
5086     return erfc(x);
5087 }
5088
5089 /*! \brief SIMD float sin \& cos, only targeting single accuracy.
5090  *
5091  * \param x The argument to evaluate sin/cos for
5092  * \param[out] sinval Sin(x)
5093  * \param[out] cosval Cos(x)
5094  */
5095 static inline void gmx_simdcall sinCosSingleAccuracy(SimdFloat x, SimdFloat* sinval, SimdFloat* cosval)
5096 {
5097     sincos(x, sinval, cosval);
5098 }
5099
5100 /*! \brief SIMD float sin(x), only targeting single accuracy.
5101  *
5102  * \param x The argument to evaluate sin for
5103  * \result Sin(x)
5104  *
5105  * \attention Do NOT call both sin & cos if you need both results, since each of them
5106  * will then call \ref sincos and waste a factor 2 in performance.
5107  */
5108 static inline SimdFloat gmx_simdcall sinSingleAccuracy(SimdFloat x)
5109 {
5110     return sin(x);
5111 }
5112
5113 /*! \brief SIMD float cos(x), only targeting single accuracy.
5114  *
5115  * \param x The argument to evaluate cos for
5116  * \result Cos(x)
5117  *
5118  * \attention Do NOT call both sin & cos if you need both results, since each of them
5119  * will then call \ref sincos and waste a factor 2 in performance.
5120  */
5121 static inline SimdFloat gmx_simdcall cosSingleAccuracy(SimdFloat x)
5122 {
5123     return cos(x);
5124 }
5125
5126 /*! \brief SIMD float tan(x), only targeting single accuracy.
5127  *
5128  * \param x The argument to evaluate tan for
5129  * \result Tan(x)
5130  */
5131 static inline SimdFloat gmx_simdcall tanSingleAccuracy(SimdFloat x)
5132 {
5133     return tan(x);
5134 }
5135
5136 /*! \brief SIMD float asin(x), only targeting single accuracy.
5137  *
5138  * \param x The argument to evaluate asin for
5139  * \result Asin(x)
5140  */
5141 static inline SimdFloat gmx_simdcall asinSingleAccuracy(SimdFloat x)
5142 {
5143     return asin(x);
5144 }
5145
5146 /*! \brief SIMD float acos(x), only targeting single accuracy.
5147  *
5148  * \param x The argument to evaluate acos for
5149  * \result Acos(x)
5150  */
5151 static inline SimdFloat gmx_simdcall acosSingleAccuracy(SimdFloat x)
5152 {
5153     return acos(x);
5154 }
5155
5156 /*! \brief SIMD float atan(x), only targeting single accuracy.
5157  *
5158  * \param x The argument to evaluate atan for
5159  * \result Atan(x), same argument/value range as standard math library.
5160  */
5161 static inline SimdFloat gmx_simdcall atanSingleAccuracy(SimdFloat x)
5162 {
5163     return atan(x);
5164 }
5165
5166 /*! \brief SIMD float atan2(y,x), only targeting single accuracy.
5167  *
5168  * \param y Y component of vector, any quartile
5169  * \param x X component of vector, any quartile
5170  * \result Atan(y,x), same argument/value range as standard math library.
5171  *
5172  * \note This routine should provide correct results for all finite
5173  * non-zero or positive-zero arguments. However, negative zero arguments will
5174  * be treated as positive zero, which means the return value will deviate from
5175  * the standard math library atan2(y,x) for those cases. That should not be
5176  * of any concern in Gromacs, and in particular it will not affect calculations
5177  * of angles from vectors.
5178  */
5179 static inline SimdFloat gmx_simdcall atan2SingleAccuracy(SimdFloat y, SimdFloat x)
5180 {
5181     return atan2(y, x);
5182 }
5183
5184 /*! \brief SIMD Analytic PME force correction, only targeting single accuracy.
5185  *
5186  * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
5187  * \result Correction factor to coulomb force.
5188  */
5189 static inline SimdFloat gmx_simdcall pmeForceCorrectionSingleAccuracy(SimdFloat z2)
5190 {
5191     return pmeForceCorrection(z2);
5192 }
5193
5194 /*! \brief SIMD Analytic PME potential correction, only targeting single accuracy.
5195  *
5196  * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
5197  * \result Correction factor to coulomb force.
5198  */
5199 static inline SimdFloat gmx_simdcall pmePotentialCorrectionSingleAccuracy(SimdFloat z2)
5200 {
5201     return pmePotentialCorrection(z2);
5202 }
5203 #    endif // GMX_SIMD_HAVE_FLOAT
5204
5205 #    if GMX_SIMD4_HAVE_FLOAT
5206 /*! \brief Calculate 1/sqrt(x) for SIMD4 float, only targeting single accuracy.
5207  *
5208  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
5209  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
5210  *           For the single precision implementation this is obviously always
5211  *           true for positive values, but for double precision it adds an
5212  *           extra restriction since the first lookup step might have to be
5213  *           performed in single precision on some architectures. Note that the
5214  *           responsibility for checking falls on you - this routine does not
5215  *           check arguments.
5216  *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
5217  */
5218 static inline Simd4Float gmx_simdcall invsqrtSingleAccuracy(Simd4Float x)
5219 {
5220     return invsqrt(x);
5221 }
5222 #    endif // GMX_SIMD4_HAVE_FLOAT
5223
5224 /*! \}   end of addtogroup module_simd */
5225 /*! \endcond  end of condition libabl */
5226
5227 #endif // GMX_SIMD
5228
5229 } // namespace gmx
5230
5231 #endif // GMX_SIMD_SIMD_MATH_H