src/gromacs/simd/simd_math.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2017 by the GROMACS development team.
   5  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 #ifndef GMX_SIMD_SIMD_MATH_H
  37 #define GMX_SIMD_SIMD_MATH_H
  38
  39 /*! \libinternal \file
  40  *
  41  * \brief Math functions for SIMD datatypes.
  42  *
  43  * \attention This file is generic for all SIMD architectures, so you cannot
  44  * assume that any of the optional SIMD features (as defined in simd.h) are
  45  * present. In particular, this means you cannot assume support for integers,
  46  * logical operations (neither on floating-point nor integer values), shifts,
  47  * and the architecture might only have SIMD for either float or double.
  48  * Second, to keep this file clean and general, any additions to this file
  49  * must work for all possible SIMD architectures in both single and double
  50  * precision (if they support it), and you cannot make any assumptions about
  51  * SIMD width.
  52  *
  53  * \author Erik Lindahl <erik.lindahl@scilifelab.se>
  54  *
  55  * \inlibraryapi
  56  * \ingroup module_simd
  57  */
  58
  59 #include "config.h"
  60
  61 #include <cmath>
  62
  63 #include <limits>
  64
  65 #include "gromacs/math/utilities.h"
  66 #include "gromacs/simd/simd.h"
  67 #include "gromacs/utility/basedefinitions.h"
  68 #include "gromacs/utility/real.h"
  69
  70 namespace gmx
  71 {
  72
  73 #if GMX_SIMD
  74
  75 /*! \cond libapi */
  76 /*! \addtogroup module_simd */
  77 /*! \{ */
  78
  79 /*! \name Implementation accuracy settings
  80  *  \{
  81  */
  82
  83 /*! \} */
  84
  85 #    if GMX_SIMD_HAVE_FLOAT
  86
  87 /*! \name Single precision SIMD math functions
  88  *
  89  *  \note In most cases you should use the real-precision functions instead.
  90  *  \{
  91  */
  92
  93 /****************************************
  94  * SINGLE PRECISION SIMD MATH FUNCTIONS *
  95  ****************************************/
  96
  97 #        if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_FLOAT
  98 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
  99  *
 100  * \param x Values to set sign for
 101  * \param y Values used to set sign
 102  * \return  Magnitude of x, sign of y
 103  */
 104 static inline SimdFloat gmx_simdcall copysign(SimdFloat x, SimdFloat y)
 105 {
 106 #            if GMX_SIMD_HAVE_LOGICAL
 107     return abs(x) | (SimdFloat(GMX_FLOAT_NEGZERO) & y);
 108 #            else
 109     return blend(abs(x), -abs(x), y < setZero());
 110 #            endif
 111 }
 112 #        endif
 113
 114 #        if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_FLOAT
 115 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD float.
 116  *
 117  * This is a low-level routine that should only be used by SIMD math routine
 118  * that evaluates the inverse square root.
 119  *
 120  *  \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
 121  *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
 122  *  \return   An improved approximation with roughly twice as many bits of accuracy.
 123  */
 124 static inline SimdFloat gmx_simdcall rsqrtIter(SimdFloat lu, SimdFloat x)
 125 {
 126     SimdFloat tmp1 = x * lu;
 127     SimdFloat tmp2 = SimdFloat(-0.5F) * lu;
 128     tmp1           = fma(tmp1, lu, SimdFloat(-3.0F));
 129     return tmp1 * tmp2;
 130 }
 131 #        endif
 132
 133 /*! \brief Calculate 1/sqrt(x) for SIMD float.
 134  *
 135  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
 136  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
 137  *           For the single precision implementation this is obviously always
 138  *           true for positive values, but for double precision it adds an
 139  *           extra restriction since the first lookup step might have to be
 140  *           performed in single precision on some architectures. Note that the
 141  *           responsibility for checking falls on you - this routine does not
 142  *           check arguments.
 143  *
 144  *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
 145  */
 146 static inline SimdFloat gmx_simdcall invsqrt(SimdFloat x)
 147 {
 148     SimdFloat lu = rsqrt(x);
 149 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
 150     lu = rsqrtIter(lu, x);
 151 #        endif
 152 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 153     lu = rsqrtIter(lu, x);
 154 #        endif
 155 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 156     lu = rsqrtIter(lu, x);
 157 #        endif
 158     return lu;
 159 }
 160
 161 /*! \brief Calculate 1/sqrt(x) for two SIMD floats.
 162  *
 163  * \param x0  First set of arguments, x0 must be in single range (see below).
 164  * \param x1  Second set of arguments, x1 must be in single range (see below).
 165  * \param[out] out0  Result 1/sqrt(x0)
 166  * \param[out] out1  Result 1/sqrt(x1)
 167  *
 168  *  In particular for double precision we can sometimes calculate square root
 169  *  pairs slightly faster by using single precision until the very last step.
 170  *
 171  * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
 172  *       GMX_FLOAT_MAX, i.e. within the range of single precision.
 173  *       For the single precision implementation this is obviously always
 174  *       true for positive values, but for double precision it adds an
 175  *       extra restriction since the first lookup step might have to be
 176  *       performed in single precision on some architectures. Note that the
 177  *       responsibility for checking falls on you - this routine does not
 178  *       check arguments.
 179  */
 180 static inline void gmx_simdcall invsqrtPair(SimdFloat x0, SimdFloat x1, SimdFloat* out0, SimdFloat* out1)
 181 {
 182     *out0 = invsqrt(x0);
 183     *out1 = invsqrt(x1);
 184 }
 185
 186 #        if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_FLOAT
 187 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD float.
 188  *
 189  * This is a low-level routine that should only be used by SIMD math routine
 190  * that evaluates the reciprocal.
 191  *
 192  *  \param lu Approximation of 1/x, typically obtained from lookup.
 193  *  \param x  The reference (starting) value x for which we want 1/x.
 194  *  \return   An improved approximation with roughly twice as many bits of accuracy.
 195  */
 196 static inline SimdFloat gmx_simdcall rcpIter(SimdFloat lu, SimdFloat x)
 197 {
 198     return lu * fnma(lu, x, SimdFloat(2.0F));
 199 }
 200 #        endif
 201
 202 /*! \brief Calculate 1/x for SIMD float.
 203  *
 204  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
 205  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
 206  *           For the single precision implementation this is obviously always
 207  *           true for positive values, but for double precision it adds an
 208  *           extra restriction since the first lookup step might have to be
 209  *           performed in single precision on some architectures. Note that the
 210  *           responsibility for checking falls on you - this routine does not
 211  *           check arguments.
 212  *
 213  *  \return 1/x. Result is undefined if your argument was invalid.
 214  */
 215 static inline SimdFloat gmx_simdcall inv(SimdFloat x)
 216 {
 217     SimdFloat lu = rcp(x);
 218 #        if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
 219     lu = rcpIter(lu, x);
 220 #        endif
 221 #        if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 222     lu = rcpIter(lu, x);
 223 #        endif
 224 #        if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 225     lu = rcpIter(lu, x);
 226 #        endif
 227     return lu;
 228 }
 229
 230 /*! \brief Division for SIMD floats
 231  *
 232  * \param nom    Nominator
 233  * \param denom  Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
 234  *               For single precision this is equivalent to a nonzero argument,
 235  *               but in double precision it adds an extra restriction since
 236  *               the first lookup step might have to be performed in single
 237  *               precision on some architectures. Note that the responsibility
 238  *               for checking falls on you - this routine does not check arguments.
 239  *
 240  * \return nom/denom
 241  *
 242  * \note This function does not use any masking to avoid problems with
 243  *       zero values in the denominator.
 244  */
 245 static inline SimdFloat gmx_simdcall operator/(SimdFloat nom, SimdFloat denom)
 246 {
 247     return nom * inv(denom);
 248 }
 249
 250 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD float.
 251  *
 252  *  This routine only evaluates 1/sqrt(x) for elements for which mask is true.
 253  *  Illegal values in the masked-out elements will not lead to
 254  *  floating-point exceptions.
 255  *
 256  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
 257  *           GMX_FLOAT_MAX for masked-in entries.
 258  *           See \ref invsqrt for the discussion about argument restrictions.
 259  *  \param m Mask
 260  *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
 261  *          entry was not masked, and 0.0 for masked-out entries.
 262  */
 263 static inline SimdFloat maskzInvsqrt(SimdFloat x, SimdFBool m)
 264 {
 265     SimdFloat lu = maskzRsqrt(x, m);
 266 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
 267     lu = rsqrtIter(lu, x);
 268 #        endif
 269 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 270     lu = rsqrtIter(lu, x);
 271 #        endif
 272 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 273     lu = rsqrtIter(lu, x);
 274 #        endif
 275     return lu;
 276 }
 277
 278 /*! \brief Calculate 1/x for SIMD float, masked version.
 279  *
 280  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
 281  *           GMX_FLOAT_MAX for masked-in entries.
 282  *           See \ref invsqrt for the discussion about argument restrictions.
 283  *  \param m Mask
 284  *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
 285  */
 286 static inline SimdFloat gmx_simdcall maskzInv(SimdFloat x, SimdFBool m)
 287 {
 288     SimdFloat lu = maskzRcp(x, m);
 289 #        if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
 290     lu = rcpIter(lu, x);
 291 #        endif
 292 #        if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 293     lu = rcpIter(lu, x);
 294 #        endif
 295 #        if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
 296     lu = rcpIter(lu, x);
 297 #        endif
 298     return lu;
 299 }
 300
 301 /*! \brief Calculate sqrt(x) for SIMD floats
 302  *
 303  *  \tparam opt By default, this function checks if the input value is 0.0
 304  *              and masks this to return the correct result. If you are certain
 305  *              your argument will never be zero, and you know you need to save
 306  *              every single cycle you can, you can alternatively call the
 307  *              function as sqrt<MathOptimization::Unsafe>(x).
 308  *
 309  *  \param  x   Argument that must be in range 0 <=x <= GMX_FLOAT_MAX, since the
 310  *              lookup step often has to be implemented in single precision.
 311  *              Arguments smaller than GMX_FLOAT_MIN will always lead to a zero
 312  *              result, even in double precision. If you are using the unsafe
 313  *              math optimization parameter, the argument must be in the range
 314  *              GMX_FLOAT_MIN <= x <= GMX_FLOAT_MAX.
 315  *
 316  *  \return sqrt(x). The result is undefined if the input value does not fall
 317  *          in the allowed range specified for the argument.
 318  */
 319 template<MathOptimization opt = MathOptimization::Safe>
 320 static inline SimdFloat gmx_simdcall sqrt(SimdFloat x)
 321 {
 322     if (opt == MathOptimization::Safe)
 323     {
 324         SimdFloat res = maskzInvsqrt(x, setZero() < x);
 325         return res * x;
 326     }
 327     else
 328     {
 329         return x * invsqrt(x);
 330     }
 331 }
 332
 333 /*! \brief Cube root for SIMD floats
 334  *
 335  * \param x      Argument to calculate cube root of. Can be negative or zero,
 336  *               but NaN or Inf values are not supported. Denormal values will
 337  *               be treated as 0.0.
 338  * \return       Cube root of x.
 339  */
 340 static inline SimdFloat gmx_simdcall cbrt(SimdFloat x)
 341 {
 342     const SimdFloat signBit(GMX_FLOAT_NEGZERO);
 343     const SimdFloat minFloat(std::numeric_limits<float>::min());
 344     // Bias is 128-1 = 127, which is not divisible by 3. Since the largest-magnitude
 345     // negative exponent from frexp() is -126, we can subtract one more unit to get 126
 346     // as offset, which is divisible by 3 (result 42). To avoid clang warnings about fragile integer
 347     // division mixed with FP, we let the divided value (42) be the original constant.
 348     const std::int32_t offsetDiv3(42);
 349     const SimdFloat    c2(-0.191502161678719066F);
 350     const SimdFloat    c1(0.697570460207922770F);
 351     const SimdFloat    c0(0.492659620528969547F);
 352     const SimdFloat    one(1.0F);
 353     const SimdFloat    two(2.0F);
 354     const SimdFloat    three(3.0F);
 355     const SimdFloat    oneThird(1.0F / 3.0F);
 356     const SimdFloat    cbrt2(1.2599210498948731648F);
 357     const SimdFloat    sqrCbrt2(1.5874010519681994748F);
 358
 359     // To calculate cbrt(x) we first take the absolute value of x but save the sign,
 360     // since cbrt(-x) = -cbrt(x). Then we only need to consider positive values for
 361     // the main step.
 362     // A number x is represented in IEEE754 as fraction*2^e. We rewrite this as
 363     // x=fraction*2^(3*n)*2^m, where e=3*n+m, and m is a remainder.
 364     // The cube root can the be evaluated by calculating the cube root of the fraction
 365     // limited to the mantissa range, multiplied by 2^mod (which is either 1, +/-2^(1/3) or
 366     // +/-2^(2/3), and then we load this into a new IEEE754 fp number with the exponent 2^n, where
 367     // n is the integer part of the original exponent divided by 3.
 368
 369     SimdFloat xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
 370     SimdFloat xAbs     = andNot(signBit, x);   // select everthing but the sign bit => abs(x)
 371     SimdFBool xIsNonZero = (minFloat <= xAbs); // treat denormals as 0
 372
 373     SimdFInt32 exponent;
 374     SimdFloat  y = frexp(xAbs, &exponent);
 375     // For the mantissa (y) we will use a limited-range approximation of cbrt(y),
 376     // by first using a polynomial and then evaluating
 377     // Transform y to z = c2*y^2 + c1*y + c0, then w = z^3, and finally
 378     // evaluate the quotient q = z * (w + 2 * y) / (2 * w + y).
 379     SimdFloat z        = fma(fma(y, c2, c1), y, c0);
 380     SimdFloat w        = z * z * z;
 381     SimdFloat nom      = z * fma(two, y, w);
 382     SimdFloat invDenom = inv(fma(two, w, y));
 383
 384     // Handle the exponent. In principle there are beautiful ways to do this with custom 16-bit
 385     // division converted to multiplication... but we can't do that since our SIMD layer cannot
 386     // assume the presence of integer shift operations!
 387     // However, when I first worked with the integer algorithm I still came up with a neat
 388     // optimization, so I'll describe the full algorithm here in case we ever want to use it
 389     // in the future:
 390     //
 391     // Our dividend is signed, which is a complication, but let's consider the unsigned case
 392     // first: Division by 3 corresponds to multiplication by 1010101... Since we also know
 393     // our dividend is less than 16 bits (exponent range) we can accomplish this by
 394     // multiplying with 21845 (which is almost 2^16/3 - 21845.333 would be exact) and then
 395     // right-shifting by 16 bits to divide out the 2^16 part.
 396     // If we add 1 to the dividend to handle the extra 0.333, the integer result will be correct.
 397     // To handle the signed exponent one alternative would be to take absolute values, saving
 398     // signs, etc - but that gets a bit complicated with 2-complement integers.
 399     // Instead, we remember that we don't really want the exact division per se - what we're
 400     // really after is only rewriting e = 3*n+m. That will actually be *easier* to handle if
 401     // we require that m must be positive (fewer cases to handle) instead of having n as the
 402     // strict e/3.
 403     // To handle this we start by adding 127 to the exponent. This value corresponds to the
 404     // exponent bias, minus 1 because frexp() has a different standard for the value it returns,
 405     // but then we add 1 back to handle the extra 0.333 in 21845. So, we have offsetExp = e+127
 406     // and then multiply by 21845 to get a division result offsetExpDiv3.
 407     // A (signed) value for n is then recovered by subtracting 42 (bias-1)/3 from k.
 408     // To calculate a strict remainder we should evaluate offsetExp - 3*offsetExpDiv3 - 1, where
 409     // the extra 1 corrects for the value we added to the exponent to get correct division.
 410     // This remainder would have the value 0,1, or 2, but since we only use it to select
 411     // other numbers we can skip the last step and just handle the cases as 1,2 or 3 instead.
 412     //
 413     // OK; end of long detour. Here's how we actually do it in our implementation by using
 414     // floating-point for the exponent instead to avoid needing integer shifts:
 415     //
 416     // 1) Convert the exponent (obtained from frexp) to a float
 417     // 2) Calculate offsetExp = exp + offset. Note that we should not add the extra 1 here since we
 418     //    do floating-point division instead of our integer hack, so it's the exponent bias-1, or
 419     //    the largest exponent minus 2.
 420     // 3) Divide the float by 3 by multiplying with 1/3
 421     // 4) Truncate it to an integer to get the division result. This is potentially dangerous in
 422     //    combination with floating-point, because many integers cannot be represented exactly in
 423     //    floating point, and if we are just epsilon below the result might be truncated to a lower
 424     //    integer. I have not observed this on x86, but to have a safety margin we can add a small
 425     //    fraction - since we already know the fraction part should be either 0, 0.333..., or 0.666...
 426     //    We can even save this extra floating-point addition by adding a small fraction (0.1) when
 427     //    we introduce the exponent offset - that will correspond to a safety margin of 0.1/3, which is plenty.
 428     // 5) Get the remainder part by subtracting the truncated floating-point part.
 429     //    Here too we will have a plain division, so the remainder is a strict modulus
 430     //    and will have the values 0, 1 or 2.
 431     //
 432     // Before worrying about the few wasted cycles due to longer fp latency, this has the
 433     // additional advantage that we don't use a single integer operation, so the algorithm
 434     // will work just A-OK on all SIMD implementations, which avoids diverging code paths.
 435
 436     // The  0.1 here is the safety margin due to  truncation described in item 4 in the comments above.
 437     SimdFloat offsetExp = cvtI2R(exponent) + SimdFloat(static_cast<float>(3 * offsetDiv3) + 0.1);
 438
 439     SimdFloat offsetExpDiv3 =
 440             trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
 441
 442     SimdFInt32 expDiv3 = cvtR2I(offsetExpDiv3 - SimdFloat(static_cast<float>(offsetDiv3)));
 443
 444     SimdFloat remainder = offsetExp - offsetExpDiv3 * three;
 445
 446     // If remainder is 0 we should just have the factor 1.0,
 447     // so first pick 1.0 if it is below 0.5, and 2^(1/3) if it's above 0.5 (i.e., 1 or 2)
 448     SimdFloat factor = blend(one, cbrt2, SimdFloat(0.5) < remainder);
 449     // Second, we overwrite with 2^(2/3) if rem>1.5 (i.e., 2)
 450     factor = blend(factor, sqrCbrt2, SimdFloat(1.5) < remainder);
 451
 452     // Assemble the non-signed fraction, and add the sign back by xor
 453     SimdFloat fraction = (nom * invDenom * factor) ^ xSignBit;
 454     // Load to IEEE754 number, and set result to 0.0 if x was 0.0 or denormal
 455     SimdFloat result = selectByMask(ldexp(fraction, expDiv3), xIsNonZero);
 456
 457     return result;
 458 }
 459
 460 /*! \brief Inverse cube root for SIMD floats
 461  *
 462  * \param x      Argument to calculate cube root of. Can be positive or
 463  *               negative, but the magnitude cannot be lower than
 464  *               the smallest normal number.
 465  * \return       Cube root of x. Undefined for values that don't
 466  *               fulfill the restriction of abs(x) > minFloat.
 467  */
 468 static inline SimdFloat gmx_simdcall invcbrt(SimdFloat x)
 469 {
 470     const SimdFloat signBit(GMX_FLOAT_NEGZERO);
 471     const SimdFloat minFloat(std::numeric_limits<float>::min());
 472     // Bias is 128-1 = 127, which is not divisible by 3. Since the largest-magnitude
 473     // negative exponent from frexp() is -126, we can subtract one more unit to get 126
 474     // as offset, which is divisible by 3 (result 42). To avoid clang warnings about fragile integer
 475     // division mixed with FP, we let the divided value (42) be the original constant.
 476     const std::int32_t offsetDiv3(42);
 477     const SimdFloat    c2(-0.191502161678719066F);
 478     const SimdFloat    c1(0.697570460207922770F);
 479     const SimdFloat    c0(0.492659620528969547F);
 480     const SimdFloat    one(1.0F);
 481     const SimdFloat    two(2.0F);
 482     const SimdFloat    three(3.0F);
 483     const SimdFloat    oneThird(1.0F / 3.0F);
 484     const SimdFloat    invCbrt2(1.0F / 1.2599210498948731648F);
 485     const SimdFloat    invSqrCbrt2(1.0F / 1.5874010519681994748F);
 486
 487     // We use pretty much exactly the same implementation as for cbrt(x),
 488     // but to compute the inverse we swap the nominator/denominator
 489     // in the quotient, and also swap the sign of the exponent parts.
 490
 491     SimdFloat xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
 492     SimdFloat xAbs     = andNot(signBit, x); // select everthing but the sign bit => abs(x)
 493
 494     SimdFInt32 exponent;
 495     SimdFloat  y = frexp(xAbs, &exponent);
 496     // For the mantissa (y) we will use a limited-range approximation of cbrt(y),
 497     // by first using a polynomial and then evaluating
 498     // Transform y to z = c2*y^2 + c1*y + c0, then w = z^3, and finally
 499     // evaluate the quotient q = z * (w + 2 * y) / (2 * w + y).
 500     SimdFloat z        = fma(fma(y, c2, c1), y, c0);
 501     SimdFloat w        = z * z * z;
 502     SimdFloat nom      = fma(two, w, y);
 503     SimdFloat invDenom = inv(z * fma(two, y, w));
 504
 505     // The  0.1 here is the safety margin due to  truncation described in item 4 in the comments above.
 506     SimdFloat offsetExp = cvtI2R(exponent) + SimdFloat(static_cast<float>(3 * offsetDiv3) + 0.1);
 507     SimdFloat offsetExpDiv3 =
 508             trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
 509
 510     // We should swap the sign here, so we change order of the terms in the subtraction
 511     SimdFInt32 expDiv3 = cvtR2I(SimdFloat(static_cast<float>(offsetDiv3)) - offsetExpDiv3);
 512
 513     // Swap sign here too, so remainder is either 0, -1 or -2
 514     SimdFloat remainder = offsetExpDiv3 * three - offsetExp;
 515
 516     // If remainder is 0 we should just have the factor 1.0,
 517     // so first pick 1.0 if it is above -0.5, and 2^(-1/3) if it's below -0.5 (i.e., -1 or -2)
 518     SimdFloat factor = blend(one, invCbrt2, remainder < SimdFloat(-0.5));
 519     // Second, we overwrite with 2^(-2/3) if rem<-1.5 (i.e., -2)
 520     factor = blend(factor, invSqrCbrt2, remainder < SimdFloat(-1.5));
 521
 522     // Assemble the non-signed fraction, and add the sign back by xor
 523     SimdFloat fraction = (nom * invDenom * factor) ^ xSignBit;
 524     // Load to IEEE754 number, and set result to 0.0 if x was 0.0 or denormal
 525     SimdFloat result = ldexp(fraction, expDiv3);
 526
 527     return result;
 528 }
 529
 530 /*! \brief SIMD float log2(x). This is the base-2 logarithm.
 531  *
 532  * \param x Argument, should be >0.
 533  * \result The base-2 logarithm of x. Undefined if argument is invalid.
 534  */
 535 static inline SimdFloat gmx_simdcall log2(SimdFloat x)
 536 {
 537     // This implementation computes log2 by
 538     // 1) Extracting the exponent and adding it to...
 539     // 2) A 9th-order minimax approximation using only odd
 540     //    terms of (x-1)/(x+1), where x is the mantissa.
 541
 542 #        if GMX_SIMD_HAVE_NATIVE_LOG_FLOAT
 543     // Just rescale if native log2() is not present, but log() is.
 544     return log(x) * SimdFloat(std::log2(std::exp(1.0)));
 545 #        else
 546     const SimdFloat one(1.0F);
 547     const SimdFloat two(2.0F);
 548     const SimdFloat invsqrt2(1.0F / std::sqrt(2.0F));
 549     const SimdFloat CL9(0.342149508897807708152F);
 550     const SimdFloat CL7(0.411570606888219447939F);
 551     const SimdFloat CL5(0.577085979152320294183F);
 552     const SimdFloat CL3(0.961796550607099898222F);
 553     const SimdFloat CL1(2.885390081777926774009F);
 554     SimdFloat       fExp, x2, p;
 555     SimdFBool       m;
 556     SimdFInt32      iExp;
 557
 558     x    = frexp(x, &iExp);
 559     fExp = cvtI2R(iExp);
 560
 561     m = x < invsqrt2;
 562     // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
 563     fExp = fExp - selectByMask(one, m);
 564     x    = x * blend(one, two, m);
 565
 566     x  = (x - one) * inv(x + one);
 567     x2 = x * x;
 568
 569     p = fma(CL9, x2, CL7);
 570     p = fma(p, x2, CL5);
 571     p = fma(p, x2, CL3);
 572     p = fma(p, x2, CL1);
 573     p = fma(p, x, fExp);
 574
 575     return p;
 576 #        endif
 577 }
 578
 579 #        if !GMX_SIMD_HAVE_NATIVE_LOG_FLOAT
 580 /*! \brief SIMD float log(x). This is the natural logarithm.
 581  *
 582  * \param x Argument, should be >0.
 583  * \result The natural logarithm of x. Undefined if argument is invalid.
 584  */
 585 static inline SimdFloat gmx_simdcall log(SimdFloat x)
 586 {
 587     const SimdFloat one(1.0F);
 588     const SimdFloat two(2.0F);
 589     const SimdFloat invsqrt2(1.0F / std::sqrt(2.0F));
 590     const SimdFloat corr(0.693147180559945286226764F);
 591     const SimdFloat CL9(0.2371599674224853515625F);
 592     const SimdFloat CL7(0.285279005765914916992188F);
 593     const SimdFloat CL5(0.400005519390106201171875F);
 594     const SimdFloat CL3(0.666666567325592041015625F);
 595     const SimdFloat CL1(2.0F);
 596     SimdFloat       fExp, x2, p;
 597     SimdFBool       m;
 598     SimdFInt32      iExp;
 599
 600     x    = frexp(x, &iExp);
 601     fExp = cvtI2R(iExp);
 602
 603     m = x < invsqrt2;
 604     // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
 605     fExp = fExp - selectByMask(one, m);
 606     x    = x * blend(one, two, m);
 607
 608     x  = (x - one) * inv(x + one);
 609     x2 = x * x;
 610
 611     p = fma(CL9, x2, CL7);
 612     p = fma(p, x2, CL5);
 613     p = fma(p, x2, CL3);
 614     p = fma(p, x2, CL1);
 615     p = fma(p, x, corr * fExp);
 616
 617     return p;
 618 }
 619 #        endif
 620
 621 #        if !GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT
 622 /*! \brief SIMD float 2^x
 623  *
 624  * \tparam opt If this is changed from the default (safe) into the unsafe
 625  *             option, input values that would otherwise lead to zero-clamped
 626  *             results are not allowed and will lead to undefined results.
 627  *
 628  * \param x Argument. For the default (safe) function version this can be
 629  *          arbitrarily small value, but the routine might clamp the result to
 630  *          zero for arguments that would produce subnormal IEEE754-2008 results.
 631  *          This corresponds to inputs below -126 in single or -1022 in double,
 632  *          and it might overflow for arguments reaching 127 (single) or
 633  *          1023 (double). If you enable the unsafe math optimization,
 634  *          very small arguments will not necessarily be zero-clamped, but
 635  *          can produce undefined results.
 636  *
 637  * \result 2^x. The result is undefined for very large arguments that cause
 638  *          internal floating-point overflow. If unsafe optimizations are enabled,
 639  *          this is also true for very small values.
 640  *
 641  * \note    The definition range of this function is just-so-slightly smaller
 642  *          than the allowed IEEE exponents for many architectures. This is due
 643  *          to the implementation, which will hopefully improve in the future.
 644  *
 645  * \warning You cannot rely on this implementation returning inf for arguments
 646  *          that cause overflow. If you have some very large
 647  *          values and need to rely on getting a valid numerical output,
 648  *          take the minimum of your variable and the largest valid argument
 649  *          before calling this routine.
 650  */
 651 template<MathOptimization opt = MathOptimization::Safe>
 652 static inline SimdFloat gmx_simdcall exp2(SimdFloat x)
 653 {
 654     const SimdFloat CC6(0.0001534581200287996416911311F);
 655     const SimdFloat CC5(0.001339993121934088894618990F);
 656     const SimdFloat CC4(0.009618488957115180159497841F);
 657     const SimdFloat CC3(0.05550328776964726865751735F);
 658     const SimdFloat CC2(0.2402264689063408646490722F);
 659     const SimdFloat CC1(0.6931472057372680777553816F);
 660     const SimdFloat one(1.0F);
 661
 662     SimdFloat intpart;
 663     SimdFloat fexppart;
 664     SimdFloat p;
 665
 666     // Large negative values are valid arguments to exp2(), so there are two
 667     // things we need to account for:
 668     // 1. When the exponents reaches -127, the (biased) exponent field will be
 669     //    zero and we can no longer multiply with it. There are special IEEE
 670     //    formats to handle this range, but for now we have to accept that
 671     //    we cannot handle those arguments. If input value becomes even more
 672     //    negative, it will start to loop and we would end up with invalid
 673     //    exponents. Thus, we need to limit or mask this.
 674     // 2. For VERY large negative values, we will have problems that the
 675     //    subtraction to get the fractional part loses accuracy, and then we
 676     //    can end up with overflows in the polynomial.
 677     //
 678     // For now, we handle this by forwarding the math optimization setting to
 679     // ldexp, where the routine will return zero for very small arguments.
 680     //
 681     // However, before doing that we need to make sure we do not call cvtR2I
 682     // with an argument that is so negative it cannot be converted to an integer.
 683     if (opt == MathOptimization::Safe)
 684     {
 685         x = max(x, SimdFloat(std::numeric_limits<std::int32_t>::lowest()));
 686     }
 687
 688     fexppart = ldexp<opt>(one, cvtR2I(x));
 689     intpart  = round(x);
 690     x        = x - intpart;
 691
 692     p = fma(CC6, x, CC5);
 693     p = fma(p, x, CC4);
 694     p = fma(p, x, CC3);
 695     p = fma(p, x, CC2);
 696     p = fma(p, x, CC1);
 697     p = fma(p, x, one);
 698     x = p * fexppart;
 699     return x;
 700 }
 701 #        endif
 702
 703 #        if !GMX_SIMD_HAVE_NATIVE_EXP_FLOAT
 704 /*! \brief SIMD float exp(x).
 705  *
 706  * In addition to scaling the argument for 2^x this routine correctly does
 707  * extended precision arithmetics to improve accuracy.
 708  *
 709  * \tparam opt If this is changed from the default (safe) into the unsafe
 710  *             option, input values that would otherwise lead to zero-clamped
 711  *             results are not allowed and will lead to undefined results.
 712  *
 713  * \param x Argument. For the default (safe) function version this can be
 714  *          arbitrarily small value, but the routine might clamp the result to
 715  *          zero for arguments that would produce subnormal IEEE754-2008 results.
 716  *          This corresponds to input arguments reaching
 717  *          -126*ln(2)=-87.3 in single, or -1022*ln(2)=-708.4 (double).
 718  *          Similarly, it might overflow for arguments reaching
 719  *          127*ln(2)=88.0 (single) or 1023*ln(2)=709.1 (double). If the
 720  *          unsafe math optimizations are enabled, small input values that would
 721  *          result in zero-clamped output are not allowed.
 722  *
 723  * \result exp(x). Overflowing arguments are likely to either return 0 or inf,
 724  *         depending on the underlying implementation. If unsafe optimizations
 725  *         are enabled, this is also true for very small values.
 726  *
 727  * \note    The definition range of this function is just-so-slightly smaller
 728  *          than the allowed IEEE exponents for many architectures. This is due
 729  *          to the implementation, which will hopefully improve in the future.
 730  *
 731  * \warning You cannot rely on this implementation returning inf for arguments
 732  *          that cause overflow. If you have some very large
 733  *          values and need to rely on getting a valid numerical output,
 734  *          take the minimum of your variable and the largest valid argument
 735  *          before calling this routine.
 736  */
 737 template<MathOptimization opt = MathOptimization::Safe>
 738 static inline SimdFloat gmx_simdcall exp(SimdFloat x)
 739 {
 740     const SimdFloat argscale(1.44269504088896341F);
 741     const SimdFloat invargscale0(-0.693145751953125F);
 742     const SimdFloat invargscale1(-1.428606765330187045e-06F);
 743     const SimdFloat CC4(0.00136324646882712841033936F);
 744     const SimdFloat CC3(0.00836596917361021041870117F);
 745     const SimdFloat CC2(0.0416710823774337768554688F);
 746     const SimdFloat CC1(0.166665524244308471679688F);
 747     const SimdFloat CC0(0.499999850988388061523438F);
 748     const SimdFloat one(1.0F);
 749     SimdFloat       fexppart;
 750     SimdFloat       intpart;
 751     SimdFloat       y, p;
 752
 753     // Large negative values are valid arguments to exp2(), so there are two
 754     // things we need to account for:
 755     // 1. When the exponents reaches -127, the (biased) exponent field will be
 756     //    zero and we can no longer multiply with it. There are special IEEE
 757     //    formats to handle this range, but for now we have to accept that
 758     //    we cannot handle those arguments. If input value becomes even more
 759     //    negative, it will start to loop and we would end up with invalid
 760     //    exponents. Thus, we need to limit or mask this.
 761     // 2. For VERY large negative values, we will have problems that the
 762     //    subtraction to get the fractional part loses accuracy, and then we
 763     //    can end up with overflows in the polynomial.
 764     //
 765     // For now, we handle this by forwarding the math optimization setting to
 766     // ldexp, where the routine will return zero for very small arguments.
 767     //
 768     // However, before doing that we need to make sure we do not call cvtR2I
 769     // with an argument that is so negative it cannot be converted to an integer
 770     // after being multiplied by argscale.
 771
 772     if (opt == MathOptimization::Safe)
 773     {
 774         x = max(x, SimdFloat(std::numeric_limits<std::int32_t>::lowest()) / argscale);
 775     }
 776
 777     y = x * argscale;
 778
 779
 780     fexppart = ldexp<opt>(one, cvtR2I(y));
 781     intpart  = round(y);
 782
 783     // Extended precision arithmetics
 784     x = fma(invargscale0, intpart, x);
 785     x = fma(invargscale1, intpart, x);
 786
 787     p = fma(CC4, x, CC3);
 788     p = fma(p, x, CC2);
 789     p = fma(p, x, CC1);
 790     p = fma(p, x, CC0);
 791     p = fma(x * x, p, x);
 792 #            if GMX_SIMD_HAVE_FMA
 793     x = fma(p, fexppart, fexppart);
 794 #            else
 795     x = (p + one) * fexppart;
 796 #            endif
 797     return x;
 798 }
 799 #        endif
 800
 801 /*! \brief SIMD float pow(x,y)
 802  *
 803  * This returns x^y for SIMD values.
 804  *
 805  * \tparam opt If this is changed from the default (safe) into the unsafe
 806  *             option, there are no guarantees about correct results for x==0.
 807  *
 808  * \param x Base.
 809  *
 810  * \param y exponent.
 811
 812  * \result x^y. Overflowing arguments are likely to either return 0 or inf,
 813  *         depending on the underlying implementation. If unsafe optimizations
 814  *         are enabled, this is also true for x==0.
 815  *
 816  * \warning You cannot rely on this implementation returning inf for arguments
 817  *          that cause overflow. If you have some very large
 818  *          values and need to rely on getting a valid numerical output,
 819  *          take the minimum of your variable and the largest valid argument
 820  *          before calling this routine.
 821  */
 822 template<MathOptimization opt = MathOptimization::Safe>
 823 static inline SimdFloat gmx_simdcall pow(SimdFloat x, SimdFloat y)
 824 {
 825     SimdFloat xcorr;
 826
 827     if (opt == MathOptimization::Safe)
 828     {
 829         xcorr = max(x, SimdFloat(std::numeric_limits<float>::min()));
 830     }
 831     else
 832     {
 833         xcorr = x;
 834     }
 835
 836     SimdFloat result = exp2<opt>(y * log2(xcorr));
 837
 838     if (opt == MathOptimization::Safe)
 839     {
 840         // if x==0 and y>0 we explicitly set the result to 0.0
 841         // For any x with y==0, the result will already be 1.0 since we multiply by y (0.0) and call exp().
 842         result = blend(result, setZero(), x == setZero() && setZero() < y);
 843     }
 844
 845     return result;
 846 }
 847
 848
 849 /*! \brief SIMD float erf(x).
 850  *
 851  * \param x The value to calculate erf(x) for.
 852  * \result erf(x)
 853  *
 854  * This routine achieves very close to full precision, but we do not care about
 855  * the last bit or the subnormal result range.
 856  */
 857 static inline SimdFloat gmx_simdcall erf(SimdFloat x)
 858 {
 859     // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
 860     const SimdFloat CA6(7.853861353153693e-5F);
 861     const SimdFloat CA5(-8.010193625184903e-4F);
 862     const SimdFloat CA4(5.188327685732524e-3F);
 863     const SimdFloat CA3(-2.685381193529856e-2F);
 864     const SimdFloat CA2(1.128358514861418e-1F);
 865     const SimdFloat CA1(-3.761262582423300e-1F);
 866     const SimdFloat CA0(1.128379165726710F);
 867     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
 868     const SimdFloat CB9(-0.0018629930017603923F);
 869     const SimdFloat CB8(0.003909821287598495F);
 870     const SimdFloat CB7(-0.0052094582210355615F);
 871     const SimdFloat CB6(0.005685614362160572F);
 872     const SimdFloat CB5(-0.0025367682853477272F);
 873     const SimdFloat CB4(-0.010199799682318782F);
 874     const SimdFloat CB3(0.04369575504816542F);
 875     const SimdFloat CB2(-0.11884063474674492F);
 876     const SimdFloat CB1(0.2732120154030589F);
 877     const SimdFloat CB0(0.42758357702025784F);
 878     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
 879     const SimdFloat CC10(-0.0445555913112064F);
 880     const SimdFloat CC9(0.21376355144663348F);
 881     const SimdFloat CC8(-0.3473187200259257F);
 882     const SimdFloat CC7(0.016690861551248114F);
 883     const SimdFloat CC6(0.7560973182491192F);
 884     const SimdFloat CC5(-1.2137903600145787F);
 885     const SimdFloat CC4(0.8411872321232948F);
 886     const SimdFloat CC3(-0.08670413896296343F);
 887     const SimdFloat CC2(-0.27124782687240334F);
 888     const SimdFloat CC1(-0.0007502488047806069F);
 889     const SimdFloat CC0(0.5642114853803148F);
 890     const SimdFloat one(1.0F);
 891     const SimdFloat two(2.0F);
 892
 893     SimdFloat x2, x4, y;
 894     SimdFloat t, t2, w, w2;
 895     SimdFloat pA0, pA1, pB0, pB1, pC0, pC1;
 896     SimdFloat expmx2;
 897     SimdFloat res_erf, res_erfc, res;
 898     SimdFBool m, maskErf;
 899
 900     // Calculate erf()
 901     x2 = x * x;
 902     x4 = x2 * x2;
 903
 904     pA0 = fma(CA6, x4, CA4);
 905     pA1 = fma(CA5, x4, CA3);
 906     pA0 = fma(pA0, x4, CA2);
 907     pA1 = fma(pA1, x4, CA1);
 908     pA0 = pA0 * x4;
 909     pA0 = fma(pA1, x2, pA0);
 910     // Constant term must come last for precision reasons
 911     pA0 = pA0 + CA0;
 912
 913     res_erf = x * pA0;
 914
 915     // Calculate erfc
 916     y       = abs(x);
 917     maskErf = SimdFloat(0.75F) <= y;
 918     t       = maskzInv(y, maskErf);
 919     w       = t - one;
 920     t2      = t * t;
 921     w2      = w * w;
 922
 923     // No need for a floating-point sieve here (as in erfc), since erf()
 924     // will never return values that are extremely small for large args.
 925     expmx2 = exp(-y * y);
 926
 927     pB1 = fma(CB9, w2, CB7);
 928     pB0 = fma(CB8, w2, CB6);
 929     pB1 = fma(pB1, w2, CB5);
 930     pB0 = fma(pB0, w2, CB4);
 931     pB1 = fma(pB1, w2, CB3);
 932     pB0 = fma(pB0, w2, CB2);
 933     pB1 = fma(pB1, w2, CB1);
 934     pB0 = fma(pB0, w2, CB0);
 935     pB0 = fma(pB1, w, pB0);
 936
 937     pC0 = fma(CC10, t2, CC8);
 938     pC1 = fma(CC9, t2, CC7);
 939     pC0 = fma(pC0, t2, CC6);
 940     pC1 = fma(pC1, t2, CC5);
 941     pC0 = fma(pC0, t2, CC4);
 942     pC1 = fma(pC1, t2, CC3);
 943     pC0 = fma(pC0, t2, CC2);
 944     pC1 = fma(pC1, t2, CC1);
 945
 946     pC0 = fma(pC0, t2, CC0);
 947     pC0 = fma(pC1, t, pC0);
 948     pC0 = pC0 * t;
 949
 950     // Select pB0 or pC0 for erfc()
 951     m        = two < y;
 952     res_erfc = blend(pB0, pC0, m);
 953     res_erfc = res_erfc * expmx2;
 954
 955     // erfc(x<0) = 2-erfc(|x|)
 956     m        = x < setZero();
 957     res_erfc = blend(res_erfc, two - res_erfc, m);
 958
 959     // Select erf() or erfc()
 960     res = blend(res_erf, one - res_erfc, maskErf);
 961
 962     return res;
 963 }
 964
 965 /*! \brief SIMD float erfc(x).
 966  *
 967  * \param x The value to calculate erfc(x) for.
 968  * \result erfc(x)
 969  *
 970  * This routine achieves full precision (bar the last bit) over most of the
 971  * input range, but for large arguments where the result is getting close
 972  * to the minimum representable numbers we accept slightly larger errors
 973  * (think results that are in the ballpark of 10^-30 for single precision)
 974  * since that is not relevant for MD.
 975  */
 976 static inline SimdFloat gmx_simdcall erfc(SimdFloat x)
 977 {
 978     // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
 979     const SimdFloat CA6(7.853861353153693e-5F);
 980     const SimdFloat CA5(-8.010193625184903e-4F);
 981     const SimdFloat CA4(5.188327685732524e-3F);
 982     const SimdFloat CA3(-2.685381193529856e-2F);
 983     const SimdFloat CA2(1.128358514861418e-1F);
 984     const SimdFloat CA1(-3.761262582423300e-1F);
 985     const SimdFloat CA0(1.128379165726710F);
 986     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
 987     const SimdFloat CB9(-0.0018629930017603923F);
 988     const SimdFloat CB8(0.003909821287598495F);
 989     const SimdFloat CB7(-0.0052094582210355615F);
 990     const SimdFloat CB6(0.005685614362160572F);
 991     const SimdFloat CB5(-0.0025367682853477272F);
 992     const SimdFloat CB4(-0.010199799682318782F);
 993     const SimdFloat CB3(0.04369575504816542F);
 994     const SimdFloat CB2(-0.11884063474674492F);
 995     const SimdFloat CB1(0.2732120154030589F);
 996     const SimdFloat CB0(0.42758357702025784F);
 997     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
 998     const SimdFloat CC10(-0.0445555913112064F);
 999     const SimdFloat CC9(0.21376355144663348F);
1000     const SimdFloat CC8(-0.3473187200259257F);
1001     const SimdFloat CC7(0.016690861551248114F);
1002     const SimdFloat CC6(0.7560973182491192F);
1003     const SimdFloat CC5(-1.2137903600145787F);
1004     const SimdFloat CC4(0.8411872321232948F);
1005     const SimdFloat CC3(-0.08670413896296343F);
1006     const SimdFloat CC2(-0.27124782687240334F);
1007     const SimdFloat CC1(-0.0007502488047806069F);
1008     const SimdFloat CC0(0.5642114853803148F);
1009     // Coefficients for expansion of exp(x) in [0,0.1]
1010     // CD0 and CD1 are both 1.0, so no need to declare them separately
1011     const SimdFloat CD2(0.5000066608081202F);
1012     const SimdFloat CD3(0.1664795422874624F);
1013     const SimdFloat CD4(0.04379839977652482F);
1014     const SimdFloat one(1.0F);
1015     const SimdFloat two(2.0F);
1016
1017     /* We need to use a small trick here, since we cannot assume all SIMD
1018      * architectures support integers, and the flag we want (0xfffff000) would
1019      * evaluate to NaN (i.e., it cannot be expressed as a floating-point num).
1020      * Instead, we represent the flags 0xf0f0f000 and 0x0f0f0000 as valid
1021      * fp numbers, and perform a logical or. Since the expression is constant,
1022      * we can at least hope it is evaluated at compile-time.
1023      */
1024 #        if GMX_SIMD_HAVE_LOGICAL
1025     const SimdFloat sieve(SimdFloat(-5.965323564e+29F) | SimdFloat(7.05044434e-30F));
1026 #        else
1027     const int                         isieve = 0xFFFFF000;
1028     alignas(GMX_SIMD_ALIGNMENT) float mem[GMX_SIMD_FLOAT_WIDTH];
1029
1030     union {
1031         float f;
1032         int   i;
1033     } conv;
1034     int i;
1035 #        endif
1036
1037     SimdFloat x2, x4, y;
1038     SimdFloat q, z, t, t2, w, w2;
1039     SimdFloat pA0, pA1, pB0, pB1, pC0, pC1;
1040     SimdFloat expmx2, corr;
1041     SimdFloat res_erf, res_erfc, res;
1042     SimdFBool m, msk_erf;
1043
1044     // Calculate erf()
1045     x2 = x * x;
1046     x4 = x2 * x2;
1047
1048     pA0 = fma(CA6, x4, CA4);
1049     pA1 = fma(CA5, x4, CA3);
1050     pA0 = fma(pA0, x4, CA2);
1051     pA1 = fma(pA1, x4, CA1);
1052     pA1 = pA1 * x2;
1053     pA0 = fma(pA0, x4, pA1);
1054     // Constant term must come last for precision reasons
1055     pA0 = pA0 + CA0;
1056
1057     res_erf = x * pA0;
1058
1059     // Calculate erfc
1060     y       = abs(x);
1061     msk_erf = SimdFloat(0.75F) <= y;
1062     t       = maskzInv(y, msk_erf);
1063     w       = t - one;
1064     t2      = t * t;
1065     w2      = w * w;
1066     /*
1067      * We cannot simply calculate exp(-y2) directly in single precision, since
1068      * that will lose a couple of bits of precision due to the multiplication.
1069      * Instead, we introduce y=z+w, where the last 12 bits of precision are in w.
1070      * Then we get exp(-y2) = exp(-z2)*exp((z-y)*(z+y)).
1071      *
1072      * The only drawback with this is that it requires TWO separate exponential
1073      * evaluations, which would be horrible performance-wise. However, the argument
1074      * for the second exp() call is always small, so there we simply use a
1075      * low-order minimax expansion on [0,0.1].
1076      *
1077      * However, this neat idea requires support for logical ops (and) on
1078      * FP numbers, which some vendors decided isn't necessary in their SIMD
1079      * instruction sets (Hi, IBM VSX!). In principle we could use some tricks
1080      * in double, but we still need memory as a backup when that is not available,
1081      * and this case is rare enough that we go directly there...
1082      */
1083 #        if GMX_SIMD_HAVE_LOGICAL
1084     z = y & sieve;
1085 #        else
1086     store(mem, y);
1087     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1088     {
1089         conv.f = mem[i];
1090         conv.i = conv.i & isieve;
1091         mem[i] = conv.f;
1092     }
1093     z = load<SimdFloat>(mem);
1094 #        endif
1095     q    = (z - y) * (z + y);
1096     corr = fma(CD4, q, CD3);
1097     corr = fma(corr, q, CD2);
1098     corr = fma(corr, q, one);
1099     corr = fma(corr, q, one);
1100
1101     expmx2 = exp(-z * z);
1102     expmx2 = expmx2 * corr;
1103
1104     pB1 = fma(CB9, w2, CB7);
1105     pB0 = fma(CB8, w2, CB6);
1106     pB1 = fma(pB1, w2, CB5);
1107     pB0 = fma(pB0, w2, CB4);
1108     pB1 = fma(pB1, w2, CB3);
1109     pB0 = fma(pB0, w2, CB2);
1110     pB1 = fma(pB1, w2, CB1);
1111     pB0 = fma(pB0, w2, CB0);
1112     pB0 = fma(pB1, w, pB0);
1113
1114     pC0 = fma(CC10, t2, CC8);
1115     pC1 = fma(CC9, t2, CC7);
1116     pC0 = fma(pC0, t2, CC6);
1117     pC1 = fma(pC1, t2, CC5);
1118     pC0 = fma(pC0, t2, CC4);
1119     pC1 = fma(pC1, t2, CC3);
1120     pC0 = fma(pC0, t2, CC2);
1121     pC1 = fma(pC1, t2, CC1);
1122
1123     pC0 = fma(pC0, t2, CC0);
1124     pC0 = fma(pC1, t, pC0);
1125     pC0 = pC0 * t;
1126
1127     // Select pB0 or pC0 for erfc()
1128     m        = two < y;
1129     res_erfc = blend(pB0, pC0, m);
1130     res_erfc = res_erfc * expmx2;
1131
1132     // erfc(x<0) = 2-erfc(|x|)
1133     m        = x < setZero();
1134     res_erfc = blend(res_erfc, two - res_erfc, m);
1135
1136     // Select erf() or erfc()
1137     res = blend(one - res_erf, res_erfc, msk_erf);
1138
1139     return res;
1140 }
1141
1142 /*! \brief SIMD float sin \& cos.
1143  *
1144  * \param x The argument to evaluate sin/cos for
1145  * \param[out] sinval Sin(x)
1146  * \param[out] cosval Cos(x)
1147  *
1148  * This version achieves close to machine precision, but for very large
1149  * magnitudes of the argument we inherently begin to lose accuracy due to the
1150  * argument reduction, despite using extended precision arithmetics internally.
1151  */
1152 static inline void gmx_simdcall sincos(SimdFloat x, SimdFloat* sinval, SimdFloat* cosval)
1153 {
1154     // Constants to subtract Pi/4*x from y while minimizing precision loss
1155     const SimdFloat argred0(-1.5703125);
1156     const SimdFloat argred1(-4.83751296997070312500e-04F);
1157     const SimdFloat argred2(-7.54953362047672271729e-08F);
1158     const SimdFloat argred3(-2.56334406825708960298e-12F);
1159     const SimdFloat two_over_pi(static_cast<float>(2.0F / M_PI));
1160     const SimdFloat const_sin2(-1.9515295891e-4F);
1161     const SimdFloat const_sin1(8.3321608736e-3F);
1162     const SimdFloat const_sin0(-1.6666654611e-1F);
1163     const SimdFloat const_cos2(2.443315711809948e-5F);
1164     const SimdFloat const_cos1(-1.388731625493765e-3F);
1165     const SimdFloat const_cos0(4.166664568298827e-2F);
1166     const SimdFloat half(0.5F);
1167     const SimdFloat one(1.0F);
1168     SimdFloat       ssign, csign;
1169     SimdFloat       x2, y, z, psin, pcos, sss, ccc;
1170     SimdFBool       m;
1171
1172 #        if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
1173     const SimdFInt32 ione(1);
1174     const SimdFInt32 itwo(2);
1175     SimdFInt32       iy;
1176
1177     z  = x * two_over_pi;
1178     iy = cvtR2I(z);
1179     y  = round(z);
1180
1181     m     = cvtIB2B((iy & ione) == SimdFInt32(0));
1182     ssign = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), cvtIB2B((iy & itwo) == itwo));
1183     csign = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
1184 #        else
1185     const SimdFloat quarter(0.25f);
1186     const SimdFloat minusquarter(-0.25f);
1187     SimdFloat       q;
1188     SimdFBool       m1, m2, m3;
1189
1190     /* The most obvious way to find the arguments quadrant in the unit circle
1191      * to calculate the sign is to use integer arithmetic, but that is not
1192      * present in all SIMD implementations. As an alternative, we have devised a
1193      * pure floating-point algorithm that uses truncation for argument reduction
1194      * so that we get a new value 0<=q<1 over the unit circle, and then
1195      * do floating-point comparisons with fractions. This is likely to be
1196      * slightly slower (~10%) due to the longer latencies of floating-point, so
1197      * we only use it when integer SIMD arithmetic is not present.
1198      */
1199     ssign = x;
1200     x     = abs(x);
1201     // It is critical that half-way cases are rounded down
1202     z = fma(x, two_over_pi, half);
1203     y = trunc(z);
1204     q = z * quarter;
1205     q = q - trunc(q);
1206     /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
1207      * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
1208      * This removes the 2*Pi periodicity without using any integer arithmetic.
1209      * First check if y had the value 2 or 3, set csign if true.
1210      */
1211     q = q - half;
1212     /* If we have logical operations we can work directly on the signbit, which
1213      * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
1214      * Thus, if you are altering defines to debug alternative code paths, the
1215      * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
1216      * active or inactive - you will get errors if only one is used.
1217      */
1218 #            if GMX_SIMD_HAVE_LOGICAL
1219     ssign = ssign & SimdFloat(GMX_FLOAT_NEGZERO);
1220     csign = andNot(q, SimdFloat(GMX_FLOAT_NEGZERO));
1221     ssign = ssign ^ csign;
1222 #            else
1223     ssign = copysign(SimdFloat(1.0f), ssign);
1224     csign = copysign(SimdFloat(1.0f), q);
1225     csign = -csign;
1226     ssign = ssign * csign; // swap ssign if csign was set.
1227 #            endif
1228     // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
1229     m1 = (q < minusquarter);
1230     m2 = (setZero() <= q);
1231     m3 = (q < quarter);
1232     m2 = m2 && m3;
1233     m  = m1 || m2;
1234     // where mask is FALSE, swap sign.
1235     csign   = csign * blend(SimdFloat(-1.0f), one, m);
1236 #        endif
1237     x  = fma(y, argred0, x);
1238     x  = fma(y, argred1, x);
1239     x  = fma(y, argred2, x);
1240     x  = fma(y, argred3, x);
1241     x2 = x * x;
1242
1243     psin = fma(const_sin2, x2, const_sin1);
1244     psin = fma(psin, x2, const_sin0);
1245     psin = fma(psin, x * x2, x);
1246     pcos = fma(const_cos2, x2, const_cos1);
1247     pcos = fma(pcos, x2, const_cos0);
1248     pcos = fms(pcos, x2, half);
1249     pcos = fma(pcos, x2, one);
1250
1251     sss = blend(pcos, psin, m);
1252     ccc = blend(psin, pcos, m);
1253     // See comment for GMX_SIMD_HAVE_LOGICAL section above.
1254 #        if GMX_SIMD_HAVE_LOGICAL
1255     *sinval = sss ^ ssign;
1256     *cosval = ccc ^ csign;
1257 #        else
1258     *sinval = sss * ssign;
1259     *cosval = ccc * csign;
1260 #        endif
1261 }
1262
1263 /*! \brief SIMD float sin(x).
1264  *
1265  * \param x The argument to evaluate sin for
1266  * \result Sin(x)
1267  *
1268  * \attention Do NOT call both sin & cos if you need both results, since each of them
1269  * will then call \ref sincos and waste a factor 2 in performance.
1270  */
1271 static inline SimdFloat gmx_simdcall sin(SimdFloat x)
1272 {
1273     SimdFloat s, c;
1274     sincos(x, &s, &c);
1275     return s;
1276 }
1277
1278 /*! \brief SIMD float cos(x).
1279  *
1280  * \param x The argument to evaluate cos for
1281  * \result Cos(x)
1282  *
1283  * \attention Do NOT call both sin & cos if you need both results, since each of them
1284  * will then call \ref sincos and waste a factor 2 in performance.
1285  */
1286 static inline SimdFloat gmx_simdcall cos(SimdFloat x)
1287 {
1288     SimdFloat s, c;
1289     sincos(x, &s, &c);
1290     return c;
1291 }
1292
1293 /*! \brief SIMD float tan(x).
1294  *
1295  * \param x The argument to evaluate tan for
1296  * \result Tan(x)
1297  */
1298 static inline SimdFloat gmx_simdcall tan(SimdFloat x)
1299 {
1300     const SimdFloat argred0(-1.5703125);
1301     const SimdFloat argred1(-4.83751296997070312500e-04F);
1302     const SimdFloat argred2(-7.54953362047672271729e-08F);
1303     const SimdFloat argred3(-2.56334406825708960298e-12F);
1304     const SimdFloat two_over_pi(static_cast<float>(2.0F / M_PI));
1305     const SimdFloat CT6(0.009498288995810566122993911);
1306     const SimdFloat CT5(0.002895755790837379295226923);
1307     const SimdFloat CT4(0.02460087336161924491836265);
1308     const SimdFloat CT3(0.05334912882656359828045988);
1309     const SimdFloat CT2(0.1333989091464957704418495);
1310     const SimdFloat CT1(0.3333307599244198227797507);
1311
1312     SimdFloat x2, p, y, z;
1313     SimdFBool m;
1314
1315 #        if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
1316     SimdFInt32 iy;
1317     SimdFInt32 ione(1);
1318
1319     z  = x * two_over_pi;
1320     iy = cvtR2I(z);
1321     y  = round(z);
1322     m  = cvtIB2B((iy & ione) == ione);
1323
1324     x = fma(y, argred0, x);
1325     x = fma(y, argred1, x);
1326     x = fma(y, argred2, x);
1327     x = fma(y, argred3, x);
1328     x = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), m) ^ x;
1329 #        else
1330     const SimdFloat quarter(0.25f);
1331     const SimdFloat half(0.5f);
1332     const SimdFloat threequarter(0.75f);
1333     SimdFloat       w, q;
1334     SimdFBool       m1, m2, m3;
1335
1336     w     = abs(x);
1337     z     = fma(w, two_over_pi, half);
1338     y     = trunc(z);
1339     q     = z * quarter;
1340     q     = q - trunc(q);
1341     m1    = quarter <= q;
1342     m2    = q < half;
1343     m3    = threequarter <= q;
1344     m1    = m1 && m2;
1345     m     = m1 || m3;
1346     w     = fma(y, argred0, w);
1347     w     = fma(y, argred1, w);
1348     w     = fma(y, argred2, w);
1349     w     = fma(y, argred3, w);
1350     w     = blend(w, -w, m);
1351     x     = w * copysign(SimdFloat(1.0), x);
1352 #        endif
1353     x2 = x * x;
1354     p  = fma(CT6, x2, CT5);
1355     p  = fma(p, x2, CT4);
1356     p  = fma(p, x2, CT3);
1357     p  = fma(p, x2, CT2);
1358     p  = fma(p, x2, CT1);
1359     p  = fma(x2 * p, x, x);
1360
1361     p = blend(p, maskzInv(p, m), m);
1362     return p;
1363 }
1364
1365 /*! \brief SIMD float asin(x).
1366  *
1367  * \param x The argument to evaluate asin for
1368  * \result Asin(x)
1369  */
1370 static inline SimdFloat gmx_simdcall asin(SimdFloat x)
1371 {
1372     const SimdFloat limitlow(1e-4F);
1373     const SimdFloat half(0.5F);
1374     const SimdFloat one(1.0F);
1375     const SimdFloat halfpi(static_cast<float>(M_PI / 2.0F));
1376     const SimdFloat CC5(4.2163199048E-2F);
1377     const SimdFloat CC4(2.4181311049E-2F);
1378     const SimdFloat CC3(4.5470025998E-2F);
1379     const SimdFloat CC2(7.4953002686E-2F);
1380     const SimdFloat CC1(1.6666752422E-1F);
1381     SimdFloat       xabs;
1382     SimdFloat       z, z1, z2, q, q1, q2;
1383     SimdFloat       pA, pB;
1384     SimdFBool       m, m2;
1385
1386     xabs = abs(x);
1387     m    = half < xabs;
1388     z1   = half * (one - xabs);
1389     m2   = xabs < one;
1390     q1   = z1 * maskzInvsqrt(z1, m2);
1391     q2   = xabs;
1392     z2   = q2 * q2;
1393     z    = blend(z2, z1, m);
1394     q    = blend(q2, q1, m);
1395
1396     z2 = z * z;
1397     pA = fma(CC5, z2, CC3);
1398     pB = fma(CC4, z2, CC2);
1399     pA = fma(pA, z2, CC1);
1400     pA = pA * z;
1401     z  = fma(pB, z2, pA);
1402     z  = fma(z, q, q);
1403     q2 = halfpi - z;
1404     q2 = q2 - z;
1405     z  = blend(z, q2, m);
1406
1407     m = limitlow < xabs;
1408     z = blend(xabs, z, m);
1409     z = copysign(z, x);
1410
1411     return z;
1412 }
1413
1414 /*! \brief SIMD float acos(x).
1415  *
1416  * \param x The argument to evaluate acos for
1417  * \result Acos(x)
1418  */
1419 static inline SimdFloat gmx_simdcall acos(SimdFloat x)
1420 {
1421     const SimdFloat one(1.0F);
1422     const SimdFloat half(0.5F);
1423     const SimdFloat pi(static_cast<float>(M_PI));
1424     const SimdFloat halfpi(static_cast<float>(M_PI / 2.0F));
1425     SimdFloat       xabs;
1426     SimdFloat       z, z1, z2, z3;
1427     SimdFBool       m1, m2, m3;
1428
1429     xabs = abs(x);
1430     m1   = half < xabs;
1431     m2   = setZero() < x;
1432
1433     z  = fnma(half, xabs, half);
1434     m3 = xabs < one;
1435     z  = z * maskzInvsqrt(z, m3);
1436     z  = blend(x, z, m1);
1437     z  = asin(z);
1438
1439     z2 = z + z;
1440     z1 = pi - z2;
1441     z3 = halfpi - z;
1442     z  = blend(z1, z2, m2);
1443     z  = blend(z3, z, m1);
1444
1445     return z;
1446 }
1447
1448 /*! \brief SIMD float asin(x).
1449  *
1450  * \param x The argument to evaluate atan for
1451  * \result Atan(x), same argument/value range as standard math library.
1452  */
1453 static inline SimdFloat gmx_simdcall atan(SimdFloat x)
1454 {
1455     const SimdFloat halfpi(static_cast<float>(M_PI / 2.0F));
1456     const SimdFloat CA17(0.002823638962581753730774F);
1457     const SimdFloat CA15(-0.01595690287649631500244F);
1458     const SimdFloat CA13(0.04250498861074447631836F);
1459     const SimdFloat CA11(-0.07489009201526641845703F);
1460     const SimdFloat CA9(0.1063479334115982055664F);
1461     const SimdFloat CA7(-0.1420273631811141967773F);
1462     const SimdFloat CA5(0.1999269574880599975585F);
1463     const SimdFloat CA3(-0.3333310186862945556640F);
1464     const SimdFloat one(1.0F);
1465     SimdFloat       x2, x3, x4, pA, pB;
1466     SimdFBool       m, m2;
1467
1468     m  = x < setZero();
1469     x  = abs(x);
1470     m2 = one < x;
1471     x  = blend(x, maskzInv(x, m2), m2);
1472
1473     x2 = x * x;
1474     x3 = x2 * x;
1475     x4 = x2 * x2;
1476     pA = fma(CA17, x4, CA13);
1477     pB = fma(CA15, x4, CA11);
1478     pA = fma(pA, x4, CA9);
1479     pB = fma(pB, x4, CA7);
1480     pA = fma(pA, x4, CA5);
1481     pB = fma(pB, x4, CA3);
1482     pA = fma(pA, x2, pB);
1483     pA = fma(pA, x3, x);
1484
1485     pA = blend(pA, halfpi - pA, m2);
1486     pA = blend(pA, -pA, m);
1487
1488     return pA;
1489 }
1490
1491 /*! \brief SIMD float atan2(y,x).
1492  *
1493  * \param y Y component of vector, any quartile
1494  * \param x X component of vector, any quartile
1495  * \result Atan(y,x), same argument/value range as standard math library.
1496  *
1497  * \note This routine should provide correct results for all finite
1498  * non-zero or positive-zero arguments. However, negative zero arguments will
1499  * be treated as positive zero, which means the return value will deviate from
1500  * the standard math library atan2(y,x) for those cases. That should not be
1501  * of any concern in Gromacs, and in particular it will not affect calculations
1502  * of angles from vectors.
1503  */
1504 static inline SimdFloat gmx_simdcall atan2(SimdFloat y, SimdFloat x)
1505 {
1506     const SimdFloat pi(static_cast<float>(M_PI));
1507     const SimdFloat halfpi(static_cast<float>(M_PI / 2.0));
1508     SimdFloat       xinv, p, aoffset;
1509     SimdFBool       mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
1510
1511     mask_xnz  = x != setZero();
1512     mask_ynz  = y != setZero();
1513     mask_xlt0 = x < setZero();
1514     mask_ylt0 = y < setZero();
1515
1516     aoffset = selectByNotMask(halfpi, mask_xnz);
1517     aoffset = selectByMask(aoffset, mask_ynz);
1518
1519     aoffset = blend(aoffset, pi, mask_xlt0);
1520     aoffset = blend(aoffset, -aoffset, mask_ylt0);
1521
1522     xinv = maskzInv(x, mask_xnz);
1523     p    = y * xinv;
1524     p    = atan(p);
1525     p    = p + aoffset;
1526
1527     return p;
1528 }
1529
1530 /*! \brief Calculate the force correction due to PME analytically in SIMD float.
1531  *
1532  * \param z2 \f$(r \beta)^2\f$ - see below for details.
1533  * \result Correction factor to coulomb force - see below for details.
1534  *
1535  * This routine is meant to enable analytical evaluation of the
1536  * direct-space PME electrostatic force to avoid tables.
1537  *
1538  * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
1539  * are some problems evaluating that:
1540  *
1541  * First, the error function is difficult (read: expensive) to
1542  * approxmiate accurately for intermediate to large arguments, and
1543  * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
1544  * Second, we now try to avoid calculating potentials in Gromacs but
1545  * use forces directly.
1546  *
1547  * We can simply things slight by noting that the PME part is really
1548  * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
1549  * \f[
1550  * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
1551  * \f]
1552  * The first term we already have from the inverse square root, so
1553  * that we can leave out of this routine.
1554  *
1555  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1556  * the argument \f$beta r\f$ will be in the range 0.15 to ~4, which is
1557  * the range used for the minimax fit. Use your favorite plotting program
1558  * to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is in this range!
1559  *
1560  * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
1561  * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
1562  * then only use even powers. This is another minor optimization, since
1563  * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
1564  * the vector between the two atoms to get the vectorial force. The
1565  * fastest flops are the ones we can avoid calculating!
1566  *
1567  * So, here's how it should be used:
1568  *
1569  * 1. Calculate \f$r^2\f$.
1570  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
1571  * 3. Evaluate this routine with \f$z^2\f$ as the argument.
1572  * 4. The return value is the expression:
1573  *
1574  * \f[
1575  *    \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
1576  * \f]
1577  *
1578  * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
1579  *
1580  *  \f[
1581  *    \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
1582  *  \f]
1583  *
1584  *    or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
1585  *
1586  *  \f[
1587  *    \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
1588  *  \f]
1589  *
1590  *    With a bit of math exercise you should be able to confirm that
1591  *    this is exactly
1592  *
1593  *  \f[
1594  *   \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
1595  *  \f]
1596  *
1597  * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
1598  *    and you have your force (divided by \f$r\f$). A final multiplication
1599  *    with the vector connecting the two particles and you have your
1600  *    vectorial force to add to the particles.
1601  *
1602  * This approximation achieves an error slightly lower than 1e-6
1603  * in single precision and 1e-11 in double precision
1604  * for arguments smaller than 16 (\f$\beta r \leq 4 \f$);
1605  * when added to \f$1/r\f$ the error will be insignificant.
1606  * For \f$\beta r \geq 7206\f$ the return value can be inf or NaN.
1607  *
1608  */
1609 static inline SimdFloat gmx_simdcall pmeForceCorrection(SimdFloat z2)
1610 {
1611     const SimdFloat FN6(-1.7357322914161492954e-8F);
1612     const SimdFloat FN5(1.4703624142580877519e-6F);
1613     const SimdFloat FN4(-0.000053401640219807709149F);
1614     const SimdFloat FN3(0.0010054721316683106153F);
1615     const SimdFloat FN2(-0.019278317264888380590F);
1616     const SimdFloat FN1(0.069670166153766424023F);
1617     const SimdFloat FN0(-0.75225204789749321333F);
1618
1619     const SimdFloat FD4(0.0011193462567257629232F);
1620     const SimdFloat FD3(0.014866955030185295499F);
1621     const SimdFloat FD2(0.11583842382862377919F);
1622     const SimdFloat FD1(0.50736591960530292870F);
1623     const SimdFloat FD0(1.0F);
1624
1625     SimdFloat z4;
1626     SimdFloat polyFN0, polyFN1, polyFD0, polyFD1;
1627
1628     z4 = z2 * z2;
1629
1630     polyFD0 = fma(FD4, z4, FD2);
1631     polyFD1 = fma(FD3, z4, FD1);
1632     polyFD0 = fma(polyFD0, z4, FD0);
1633     polyFD0 = fma(polyFD1, z2, polyFD0);
1634
1635     polyFD0 = inv(polyFD0);
1636
1637     polyFN0 = fma(FN6, z4, FN4);
1638     polyFN1 = fma(FN5, z4, FN3);
1639     polyFN0 = fma(polyFN0, z4, FN2);
1640     polyFN1 = fma(polyFN1, z4, FN1);
1641     polyFN0 = fma(polyFN0, z4, FN0);
1642     polyFN0 = fma(polyFN1, z2, polyFN0);
1643
1644     return polyFN0 * polyFD0;
1645 }
1646
1647
1648 /*! \brief Calculate the potential correction due to PME analytically in SIMD float.
1649  *
1650  * \param z2 \f$(r \beta)^2\f$ - see below for details.
1651  * \result Correction factor to coulomb potential - see below for details.
1652  *
1653  * See \ref pmeForceCorrection for details about the approximation.
1654  *
1655  * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
1656  * as the input argument.
1657  *
1658  * Here's how it should be used:
1659  *
1660  * 1. Calculate \f$r^2\f$.
1661  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
1662  * 3. Evaluate this routine with z^2 as the argument.
1663  * 4. The return value is the expression:
1664  *
1665  *  \f[
1666  *   \frac{\mbox{erf}(z)}{z}
1667  *  \f]
1668  *
1669  * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
1670  *
1671  *  \f[
1672  *    \frac{\mbox{erf}(r \beta)}{r}
1673  *  \f]
1674  *
1675  * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
1676  *    and you have your potential.
1677  *
1678  * This approximation achieves an error slightly lower than 1e-6
1679  * in single precision and 4e-11 in double precision
1680  * for arguments smaller than 16 (\f$ 0.15 \leq \beta r \leq 4 \f$);
1681  * for \f$ \beta r \leq 0.15\f$ the error can be twice as high;
1682  * when added to \f$1/r\f$ the error will be insignificant.
1683  * For \f$\beta r \geq 7142\f$ the return value can be inf or NaN.
1684  */
1685 static inline SimdFloat gmx_simdcall pmePotentialCorrection(SimdFloat z2)
1686 {
1687     const SimdFloat VN6(1.9296833005951166339e-8F);
1688     const SimdFloat VN5(-1.4213390571557850962e-6F);
1689     const SimdFloat VN4(0.000041603292906656984871F);
1690     const SimdFloat VN3(-0.00013134036773265025626F);
1691     const SimdFloat VN2(0.038657983986041781264F);
1692     const SimdFloat VN1(0.11285044772717598220F);
1693     const SimdFloat VN0(1.1283802385263030286F);
1694
1695     const SimdFloat VD3(0.0066752224023576045451F);
1696     const SimdFloat VD2(0.078647795836373922256F);
1697     const SimdFloat VD1(0.43336185284710920150F);
1698     const SimdFloat VD0(1.0F);
1699
1700     SimdFloat z4;
1701     SimdFloat polyVN0, polyVN1, polyVD0, polyVD1;
1702
1703     z4 = z2 * z2;
1704
1705     polyVD1 = fma(VD3, z4, VD1);
1706     polyVD0 = fma(VD2, z4, VD0);
1707     polyVD0 = fma(polyVD1, z2, polyVD0);
1708
1709     polyVD0 = inv(polyVD0);
1710
1711     polyVN0 = fma(VN6, z4, VN4);
1712     polyVN1 = fma(VN5, z4, VN3);
1713     polyVN0 = fma(polyVN0, z4, VN2);
1714     polyVN1 = fma(polyVN1, z4, VN1);
1715     polyVN0 = fma(polyVN0, z4, VN0);
1716     polyVN0 = fma(polyVN1, z2, polyVN0);
1717
1718     return polyVN0 * polyVD0;
1719 }
1720 #    endif
1721
1722 /*! \} */
1723
1724 #    if GMX_SIMD_HAVE_DOUBLE
1725
1726
1727 /*! \name Double precision SIMD math functions
1728  *
1729  *  \note In most cases you should use the real-precision functions instead.
1730  *  \{
1731  */
1732
1733 /****************************************
1734  * DOUBLE PRECISION SIMD MATH FUNCTIONS *
1735  ****************************************/
1736
1737 #        if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_DOUBLE
1738 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
1739  *
1740  * \param x Values to set sign for
1741  * \param y Values used to set sign
1742  * \return  Magnitude of x, sign of y
1743  */
1744 static inline SimdDouble gmx_simdcall copysign(SimdDouble x, SimdDouble y)
1745 {
1746 #            if GMX_SIMD_HAVE_LOGICAL
1747     return abs(x) | (SimdDouble(GMX_DOUBLE_NEGZERO) & y);
1748 #            else
1749     return blend(abs(x), -abs(x), (y < setZero()));
1750 #            endif
1751 }
1752 #        endif
1753
1754 #        if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_DOUBLE
1755 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD double.
1756  *
1757  * This is a low-level routine that should only be used by SIMD math routine
1758  * that evaluates the inverse square root.
1759  *
1760  *  \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
1761  *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
1762  *  \return   An improved approximation with roughly twice as many bits of accuracy.
1763  */
1764 static inline SimdDouble gmx_simdcall rsqrtIter(SimdDouble lu, SimdDouble x)
1765 {
1766     SimdDouble tmp1 = x * lu;
1767     SimdDouble tmp2 = SimdDouble(-0.5) * lu;
1768     tmp1            = fma(tmp1, lu, SimdDouble(-3.0));
1769     return tmp1 * tmp2;
1770 }
1771 #        endif
1772
1773 /*! \brief Calculate 1/sqrt(x) for SIMD double.
1774  *
1775  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
1776  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
1777  *           For the single precision implementation this is obviously always
1778  *           true for positive values, but for double precision it adds an
1779  *           extra restriction since the first lookup step might have to be
1780  *           performed in single precision on some architectures. Note that the
1781  *           responsibility for checking falls on you - this routine does not
1782  *           check arguments.
1783  *
1784  *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
1785  */
1786 static inline SimdDouble gmx_simdcall invsqrt(SimdDouble x)
1787 {
1788     SimdDouble lu = rsqrt(x);
1789 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1790     lu = rsqrtIter(lu, x);
1791 #        endif
1792 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1793     lu = rsqrtIter(lu, x);
1794 #        endif
1795 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1796     lu = rsqrtIter(lu, x);
1797 #        endif
1798 #        if (GMX_SIMD_RSQRT_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1799     lu = rsqrtIter(lu, x);
1800 #        endif
1801     return lu;
1802 }
1803
1804 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles.
1805  *
1806  * \param x0  First set of arguments, x0 must be in single range (see below).
1807  * \param x1  Second set of arguments, x1 must be in single range (see below).
1808  * \param[out] out0  Result 1/sqrt(x0)
1809  * \param[out] out1  Result 1/sqrt(x1)
1810  *
1811  *  In particular for double precision we can sometimes calculate square root
1812  *  pairs slightly faster by using single precision until the very last step.
1813  *
1814  * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
1815  *       GMX_FLOAT_MAX, i.e. within the range of single precision.
1816  *       For the single precision implementation this is obviously always
1817  *       true for positive values, but for double precision it adds an
1818  *       extra restriction since the first lookup step might have to be
1819  *       performed in single precision on some architectures. Note that the
1820  *       responsibility for checking falls on you - this routine does not
1821  *       check arguments.
1822  */
1823 static inline void gmx_simdcall invsqrtPair(SimdDouble x0, SimdDouble x1, SimdDouble* out0, SimdDouble* out1)
1824 {
1825 #        if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2 * GMX_SIMD_DOUBLE_WIDTH) \
1826                 && (GMX_SIMD_RSQRT_BITS < 22)
1827     SimdFloat  xf  = cvtDD2F(x0, x1);
1828     SimdFloat  luf = rsqrt(xf);
1829     SimdDouble lu0, lu1;
1830     // Intermediate target is single - mantissa+1 bits
1831 #            if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
1832     luf = rsqrtIter(luf, xf);
1833 #            endif
1834 #            if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1835     luf = rsqrtIter(luf, xf);
1836 #            endif
1837 #            if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1838     luf = rsqrtIter(luf, xf);
1839 #            endif
1840     cvtF2DD(luf, &lu0, &lu1);
1841     // Last iteration(s) performed in double - if we had 22 bits, this gets us to 44 (~1e-15)
1842 #            if (GMX_SIMD_ACCURACY_BITS_SINGLE < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1843     lu0 = rsqrtIter(lu0, x0);
1844     lu1 = rsqrtIter(lu1, x1);
1845 #            endif
1846 #            if (GMX_SIMD_ACCURACY_BITS_SINGLE * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1847     lu0 = rsqrtIter(lu0, x0);
1848     lu1 = rsqrtIter(lu1, x1);
1849 #            endif
1850     *out0 = lu0;
1851     *out1 = lu1;
1852 #        else
1853     *out0 = invsqrt(x0);
1854     *out1 = invsqrt(x1);
1855 #        endif
1856 }
1857
1858 #        if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_DOUBLE
1859 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD double.
1860  *
1861  * This is a low-level routine that should only be used by SIMD math routine
1862  * that evaluates the reciprocal.
1863  *
1864  *  \param lu Approximation of 1/x, typically obtained from lookup.
1865  *  \param x  The reference (starting) value x for which we want 1/x.
1866  *  \return   An improved approximation with roughly twice as many bits of accuracy.
1867  */
1868 static inline SimdDouble gmx_simdcall rcpIter(SimdDouble lu, SimdDouble x)
1869 {
1870     return lu * fnma(lu, x, SimdDouble(2.0));
1871 }
1872 #        endif
1873
1874 /*! \brief Calculate 1/x for SIMD double.
1875  *
1876  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
1877  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
1878  *           For the single precision implementation this is obviously always
1879  *           true for positive values, but for double precision it adds an
1880  *           extra restriction since the first lookup step might have to be
1881  *           performed in single precision on some architectures. Note that the
1882  *           responsibility for checking falls on you - this routine does not
1883  *           check arguments.
1884  *
1885  *  \return 1/x. Result is undefined if your argument was invalid.
1886  */
1887 static inline SimdDouble gmx_simdcall inv(SimdDouble x)
1888 {
1889     SimdDouble lu = rcp(x);
1890 #        if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1891     lu = rcpIter(lu, x);
1892 #        endif
1893 #        if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1894     lu = rcpIter(lu, x);
1895 #        endif
1896 #        if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1897     lu = rcpIter(lu, x);
1898 #        endif
1899 #        if (GMX_SIMD_RCP_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1900     lu = rcpIter(lu, x);
1901 #        endif
1902     return lu;
1903 }
1904
1905 /*! \brief Division for SIMD doubles
1906  *
1907  * \param nom    Nominator
1908  * \param denom  Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
1909  *               For single precision this is equivalent to a nonzero argument,
1910  *               but in double precision it adds an extra restriction since
1911  *               the first lookup step might have to be performed in single
1912  *               precision on some architectures. Note that the responsibility
1913  *               for checking falls on you - this routine does not check arguments.
1914  *
1915  * \return nom/denom
1916  *
1917  * \note This function does not use any masking to avoid problems with
1918  *       zero values in the denominator.
1919  */
1920 static inline SimdDouble gmx_simdcall operator/(SimdDouble nom, SimdDouble denom)
1921 {
1922     return nom * inv(denom);
1923 }
1924
1925
1926 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD double.
1927  *
1928  *  This routine only evaluates 1/sqrt(x) for elements for which mask is true.
1929  *  Illegal values in the masked-out elements will not lead to
1930  *  floating-point exceptions.
1931  *
1932  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
1933  *           GMX_FLOAT_MAX for masked-in entries.
1934  *           See \ref invsqrt for the discussion about argument restrictions.
1935  *  \param m Mask
1936  *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
1937  *          entry was not masked, and 0.0 for masked-out entries.
1938  */
1939 static inline SimdDouble maskzInvsqrt(SimdDouble x, SimdDBool m)
1940 {
1941     SimdDouble lu = maskzRsqrt(x, m);
1942 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1943     lu = rsqrtIter(lu, x);
1944 #        endif
1945 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1946     lu = rsqrtIter(lu, x);
1947 #        endif
1948 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1949     lu = rsqrtIter(lu, x);
1950 #        endif
1951 #        if (GMX_SIMD_RSQRT_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1952     lu = rsqrtIter(lu, x);
1953 #        endif
1954     return lu;
1955 }
1956
1957 /*! \brief Calculate 1/x for SIMD double, masked version.
1958  *
1959  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
1960  *           GMX_FLOAT_MAX for masked-in entries.
1961  *           See \ref invsqrt for the discussion about argument restrictions.
1962  *  \param m Mask
1963  *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
1964  */
1965 static inline SimdDouble gmx_simdcall maskzInv(SimdDouble x, SimdDBool m)
1966 {
1967     SimdDouble lu = maskzRcp(x, m);
1968 #        if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1969     lu = rcpIter(lu, x);
1970 #        endif
1971 #        if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1972     lu = rcpIter(lu, x);
1973 #        endif
1974 #        if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1975     lu = rcpIter(lu, x);
1976 #        endif
1977 #        if (GMX_SIMD_RCP_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1978     lu = rcpIter(lu, x);
1979 #        endif
1980     return lu;
1981 }
1982
1983
1984 /*! \brief Calculate sqrt(x) for SIMD doubles.
1985  *
1986  *  \copydetails sqrt(SimdFloat)
1987  */
1988 template<MathOptimization opt = MathOptimization::Safe>
1989 static inline SimdDouble gmx_simdcall sqrt(SimdDouble x)
1990 {
1991     if (opt == MathOptimization::Safe)
1992     {
1993         // As we might use a float version of rsqrt, we mask out small values
1994         SimdDouble res = maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) < x);
1995         return res * x;
1996     }
1997     else
1998     {
1999         return x * invsqrt(x);
2000     }
2001 }
2002
2003 /*! \brief Cube root for SIMD doubles
2004  *
2005  * \param x      Argument to calculate cube root of. Can be negative or zero,
2006  *               but NaN or Inf values are not supported. Denormal values will
2007  *               be treated as 0.0.
2008  * \return       Cube root of x.
2009  */
2010 static inline SimdDouble gmx_simdcall cbrt(SimdDouble x)
2011 {
2012     const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
2013     const SimdDouble minDouble(std::numeric_limits<double>::min());
2014     // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
2015     // To avoid clang warnings about fragile integer division mixed with FP, we let
2016     // the divided value (1023/3=341) be the original constant.
2017     const std::int32_t offsetDiv3(341);
2018     const SimdDouble   c6(-0.145263899385486377);
2019     const SimdDouble   c5(0.784932344976639262);
2020     const SimdDouble   c4(-1.83469277483613086);
2021     const SimdDouble   c3(2.44693122563534430);
2022     const SimdDouble   c2(-2.11499494167371287);
2023     const SimdDouble   c1(1.50819193781584896);
2024     const SimdDouble   c0(0.354895765043919860);
2025     const SimdDouble   one(1.0);
2026     const SimdDouble   two(2.0);
2027     const SimdDouble   three(3.0);
2028     const SimdDouble   oneThird(1.0 / 3.0);
2029     const SimdDouble   cbrt2(1.2599210498948731648);
2030     const SimdDouble   sqrCbrt2(1.5874010519681994748);
2031
2032     // See the single precision routines for documentation of the algorithm
2033
2034     SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
2035     SimdDouble xAbs     = andNot(signBit, x);    // select everthing but the sign bit => abs(x)
2036     SimdDBool  xIsNonZero = (minDouble <= xAbs); // treat denormals as 0
2037
2038     SimdDInt32 exponent;
2039     SimdDouble y        = frexp(xAbs, &exponent);
2040     SimdDouble z        = fma(y, c6, c5);
2041     z                   = fma(z, y, c4);
2042     z                   = fma(z, y, c3);
2043     z                   = fma(z, y, c2);
2044     z                   = fma(z, y, c1);
2045     z                   = fma(z, y, c0);
2046     SimdDouble w        = z * z * z;
2047     SimdDouble nom      = z * fma(two, y, w);
2048     SimdDouble invDenom = inv(fma(two, w, y));
2049
2050     SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
2051     SimdDouble offsetExpDiv3 =
2052             trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
2053     SimdDInt32 expDiv3   = cvtR2I(offsetExpDiv3 - SimdDouble(static_cast<double>(offsetDiv3)));
2054     SimdDouble remainder = offsetExp - offsetExpDiv3 * three;
2055     SimdDouble factor    = blend(one, cbrt2, SimdDouble(0.5) < remainder);
2056     factor               = blend(factor, sqrCbrt2, SimdDouble(1.5) < remainder);
2057     SimdDouble fraction  = (nom * invDenom * factor) ^ xSignBit;
2058     SimdDouble result    = selectByMask(ldexp(fraction, expDiv3), xIsNonZero);
2059     return result;
2060 }
2061
2062 /*! \brief Inverse cube root for SIMD doubles.
2063  *
2064  * \param x      Argument to calculate cube root of. Can be positive or
2065  *               negative, but the magnitude cannot be lower than
2066  *               the smallest normal number.
2067  * \return       Cube root of x. Undefined for values that don't
2068  *               fulfill the restriction of abs(x) > minDouble.
2069  */
2070 static inline SimdDouble gmx_simdcall invcbrt(SimdDouble x)
2071 {
2072     const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
2073     // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
2074     // To avoid clang warnings about fragile integer division mixed with FP, we let
2075     // the divided value (1023/3=341) be the original constant.
2076     const std::int32_t offsetDiv3(341);
2077     const SimdDouble   c6(-0.145263899385486377);
2078     const SimdDouble   c5(0.784932344976639262);
2079     const SimdDouble   c4(-1.83469277483613086);
2080     const SimdDouble   c3(2.44693122563534430);
2081     const SimdDouble   c2(-2.11499494167371287);
2082     const SimdDouble   c1(1.50819193781584896);
2083     const SimdDouble   c0(0.354895765043919860);
2084     const SimdDouble   one(1.0);
2085     const SimdDouble   two(2.0);
2086     const SimdDouble   three(3.0);
2087     const SimdDouble   oneThird(1.0 / 3.0);
2088     const SimdDouble   invCbrt2(1.0 / 1.2599210498948731648);
2089     const SimdDouble   invSqrCbrt2(1.0F / 1.5874010519681994748);
2090
2091     // See the single precision routines for documentation of the algorithm
2092
2093     SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
2094     SimdDouble xAbs     = andNot(signBit, x); // select everthing but the sign bit => abs(x)
2095
2096     SimdDInt32 exponent;
2097     SimdDouble y         = frexp(xAbs, &exponent);
2098     SimdDouble z         = fma(y, c6, c5);
2099     z                    = fma(z, y, c4);
2100     z                    = fma(z, y, c3);
2101     z                    = fma(z, y, c2);
2102     z                    = fma(z, y, c1);
2103     z                    = fma(z, y, c0);
2104     SimdDouble w         = z * z * z;
2105     SimdDouble nom       = fma(two, w, y);
2106     SimdDouble invDenom  = inv(z * fma(two, y, w));
2107     SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
2108     SimdDouble offsetExpDiv3 =
2109             trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
2110     SimdDInt32 expDiv3   = cvtR2I(SimdDouble(static_cast<double>(offsetDiv3)) - offsetExpDiv3);
2111     SimdDouble remainder = offsetExpDiv3 * three - offsetExp;
2112     SimdDouble factor    = blend(one, invCbrt2, remainder < SimdDouble(-0.5));
2113     factor               = blend(factor, invSqrCbrt2, remainder < SimdDouble(-1.5));
2114     SimdDouble fraction  = (nom * invDenom * factor) ^ xSignBit;
2115     SimdDouble result    = ldexp(fraction, expDiv3);
2116     return result;
2117 }
2118
2119 /*! \brief SIMD double log2(x). This is the base-2 logarithm.
2120  *
2121  * \param x Argument, should be >0.
2122  * \result The base-2 logarithm of x. Undefined if argument is invalid.
2123  */
2124 static inline SimdDouble gmx_simdcall log2(SimdDouble x)
2125 {
2126 #        if GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
2127     // Just rescale if native log2() is not present, but log is.
2128     return log(x) * SimdDouble(std::log2(std::exp(1.0)));
2129 #        else
2130     const SimdDouble one(1.0);
2131     const SimdDouble two(2.0);
2132     const SimdDouble invsqrt2(1.0 / std::sqrt(2.0));
2133     const SimdDouble CL15(0.2138031565795550370534528);
2134     const SimdDouble CL13(0.2208884091496370882801159);
2135     const SimdDouble CL11(0.2623358279761824340958754);
2136     const SimdDouble CL9(0.3205984930182496084327681);
2137     const SimdDouble CL7(0.4121985864521960363227038);
2138     const SimdDouble CL5(0.5770780163410746954610886);
2139     const SimdDouble CL3(0.9617966939260027547931031);
2140     const SimdDouble CL1(2.885390081777926774009302);
2141     SimdDouble       fExp, x2, p;
2142     SimdDBool        m;
2143     SimdDInt32       iExp;
2144
2145     x    = frexp(x, &iExp);
2146     fExp = cvtI2R(iExp);
2147
2148     m = x < invsqrt2;
2149     // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
2150     fExp = fExp - selectByMask(one, m);
2151     x    = x * blend(one, two, m);
2152
2153     x  = (x - one) * inv(x + one);
2154     x2 = x * x;
2155
2156     p = fma(CL15, x2, CL13);
2157     p = fma(p, x2, CL11);
2158     p = fma(p, x2, CL9);
2159     p = fma(p, x2, CL7);
2160     p = fma(p, x2, CL5);
2161     p = fma(p, x2, CL3);
2162     p = fma(p, x2, CL1);
2163     p = fma(p, x, fExp);
2164
2165     return p;
2166 #        endif
2167 }
2168
2169 #        if !GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
2170 /*! \brief SIMD double log(x). This is the natural logarithm.
2171  *
2172  * \param x Argument, should be >0.
2173  * \result The natural logarithm of x. Undefined if argument is invalid.
2174  */
2175 static inline SimdDouble gmx_simdcall log(SimdDouble x)
2176 {
2177     const SimdDouble one(1.0);
2178     const SimdDouble two(2.0);
2179     const SimdDouble invsqrt2(1.0 / std::sqrt(2.0));
2180     const SimdDouble corr(0.693147180559945286226764);
2181     const SimdDouble CL15(0.148197055177935105296783);
2182     const SimdDouble CL13(0.153108178020442575739679);
2183     const SimdDouble CL11(0.181837339521549679055568);
2184     const SimdDouble CL9(0.22222194152736701733275);
2185     const SimdDouble CL7(0.285714288030134544449368);
2186     const SimdDouble CL5(0.399999999989941956712869);
2187     const SimdDouble CL3(0.666666666666685503450651);
2188     const SimdDouble CL1(2.0);
2189     SimdDouble       fExp, x2, p;
2190     SimdDBool        m;
2191     SimdDInt32       iExp;
2192
2193     x    = frexp(x, &iExp);
2194     fExp = cvtI2R(iExp);
2195
2196     m = x < invsqrt2;
2197     // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
2198     fExp = fExp - selectByMask(one, m);
2199     x    = x * blend(one, two, m);
2200
2201     x  = (x - one) * inv(x + one);
2202     x2 = x * x;
2203
2204     p = fma(CL15, x2, CL13);
2205     p = fma(p, x2, CL11);
2206     p = fma(p, x2, CL9);
2207     p = fma(p, x2, CL7);
2208     p = fma(p, x2, CL5);
2209     p = fma(p, x2, CL3);
2210     p = fma(p, x2, CL1);
2211     p = fma(p, x, corr * fExp);
2212
2213     return p;
2214 }
2215 #        endif
2216
2217 #        if !GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE
2218 /*! \brief SIMD double 2^x.
2219  *
2220  * \copydetails exp2(SimdFloat)
2221  */
2222 template<MathOptimization opt = MathOptimization::Safe>
2223 static inline SimdDouble gmx_simdcall exp2(SimdDouble x)
2224 {
2225     const SimdDouble CE11(4.435280790452730022081181e-10);
2226     const SimdDouble CE10(7.074105630863314448024247e-09);
2227     const SimdDouble CE9(1.017819803432096698472621e-07);
2228     const SimdDouble CE8(1.321543308956718799557863e-06);
2229     const SimdDouble CE7(0.00001525273348995851746990884);
2230     const SimdDouble CE6(0.0001540353046251466849082632);
2231     const SimdDouble CE5(0.001333355814678995257307880);
2232     const SimdDouble CE4(0.009618129107588335039176502);
2233     const SimdDouble CE3(0.05550410866481992147457793);
2234     const SimdDouble CE2(0.2402265069591015620470894);
2235     const SimdDouble CE1(0.6931471805599453304615075);
2236     const SimdDouble one(1.0);
2237
2238     SimdDouble intpart;
2239     SimdDouble fexppart;
2240     SimdDouble p;
2241
2242     // Large negative values are valid arguments to exp2(), so there are two
2243     // things we need to account for:
2244     // 1. When the exponents reaches -1023, the (biased) exponent field will be
2245     //    zero and we can no longer multiply with it. There are special IEEE
2246     //    formats to handle this range, but for now we have to accept that
2247     //    we cannot handle those arguments. If input value becomes even more
2248     //    negative, it will start to loop and we would end up with invalid
2249     //    exponents. Thus, we need to limit or mask this.
2250     // 2. For VERY large negative values, we will have problems that the
2251     //    subtraction to get the fractional part loses accuracy, and then we
2252     //    can end up with overflows in the polynomial.
2253     //
2254     // For now, we handle this by forwarding the math optimization setting to
2255     // ldexp, where the routine will return zero for very small arguments.
2256     //
2257     // However, before doing that we need to make sure we do not call cvtR2I
2258     // with an argument that is so negative it cannot be converted to an integer.
2259     if (opt == MathOptimization::Safe)
2260     {
2261         x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()));
2262     }
2263
2264     fexppart = ldexp<opt>(one, cvtR2I(x));
2265     intpart  = round(x);
2266     x        = x - intpart;
2267
2268     p = fma(CE11, x, CE10);
2269     p = fma(p, x, CE9);
2270     p = fma(p, x, CE8);
2271     p = fma(p, x, CE7);
2272     p = fma(p, x, CE6);
2273     p = fma(p, x, CE5);
2274     p = fma(p, x, CE4);
2275     p = fma(p, x, CE3);
2276     p = fma(p, x, CE2);
2277     p = fma(p, x, CE1);
2278     p = fma(p, x, one);
2279     x = p * fexppart;
2280     return x;
2281 }
2282 #        endif
2283
2284 #        if !GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE
2285 /*! \brief SIMD double exp(x).
2286  *
2287  * \copydetails exp(SimdFloat)
2288  */
2289 template<MathOptimization opt = MathOptimization::Safe>
2290 static inline SimdDouble gmx_simdcall exp(SimdDouble x)
2291 {
2292     const SimdDouble argscale(1.44269504088896340735992468100);
2293     const SimdDouble invargscale0(-0.69314718055966295651160180568695068359375);
2294     const SimdDouble invargscale1(-2.8235290563031577122588448175013436025525412068e-13);
2295     const SimdDouble CE12(2.078375306791423699350304e-09);
2296     const SimdDouble CE11(2.518173854179933105218635e-08);
2297     const SimdDouble CE10(2.755842049600488770111608e-07);
2298     const SimdDouble CE9(2.755691815216689746619849e-06);
2299     const SimdDouble CE8(2.480158383706245033920920e-05);
2300     const SimdDouble CE7(0.0001984127043518048611841321);
2301     const SimdDouble CE6(0.001388888889360258341755930);
2302     const SimdDouble CE5(0.008333333332907368102819109);
2303     const SimdDouble CE4(0.04166666666663836745814631);
2304     const SimdDouble CE3(0.1666666666666796929434570);
2305     const SimdDouble CE2(0.5);
2306     const SimdDouble one(1.0);
2307     SimdDouble       fexppart;
2308     SimdDouble       intpart;
2309     SimdDouble       y, p;
2310
2311     // Large negative values are valid arguments to exp2(), so there are two
2312     // things we need to account for:
2313     // 1. When the exponents reaches -1023, the (biased) exponent field will be
2314     //    zero and we can no longer multiply with it. There are special IEEE
2315     //    formats to handle this range, but for now we have to accept that
2316     //    we cannot handle those arguments. If input value becomes even more
2317     //    negative, it will start to loop and we would end up with invalid
2318     //    exponents. Thus, we need to limit or mask this.
2319     // 2. For VERY large negative values, we will have problems that the
2320     //    subtraction to get the fractional part loses accuracy, and then we
2321     //    can end up with overflows in the polynomial.
2322     //
2323     // For now, we handle this by forwarding the math optimization setting to
2324     // ldexp, where the routine will return zero for very small arguments.
2325     //
2326     // However, before doing that we need to make sure we do not call cvtR2I
2327     // with an argument that is so negative it cannot be converted to an integer
2328     // after being multiplied by argscale.
2329
2330     if (opt == MathOptimization::Safe)
2331     {
2332         x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()) / argscale);
2333     }
2334
2335     y = x * argscale;
2336
2337     fexppart = ldexp<opt>(one, cvtR2I(y));
2338     intpart  = round(y);
2339
2340     // Extended precision arithmetics
2341     x = fma(invargscale0, intpart, x);
2342     x = fma(invargscale1, intpart, x);
2343
2344     p = fma(CE12, x, CE11);
2345     p = fma(p, x, CE10);
2346     p = fma(p, x, CE9);
2347     p = fma(p, x, CE8);
2348     p = fma(p, x, CE7);
2349     p = fma(p, x, CE6);
2350     p = fma(p, x, CE5);
2351     p = fma(p, x, CE4);
2352     p = fma(p, x, CE3);
2353     p = fma(p, x, CE2);
2354     p = fma(p, x * x, x);
2355 #            if GMX_SIMD_HAVE_FMA
2356     x = fma(p, fexppart, fexppart);
2357 #            else
2358     x = (p + one) * fexppart;
2359 #            endif
2360
2361     return x;
2362 }
2363 #        endif
2364
2365 /*! \brief SIMD double pow(x,y)
2366  *
2367  * This returns x^y for SIMD values.
2368  *
2369  * \tparam opt If this is changed from the default (safe) into the unsafe
2370  *             option, there are no guarantees about correct results for x==0.
2371  *
2372  * \param x Base.
2373  *
2374  * \param y exponent.
2375  *
2376  * \result x^y. Overflowing arguments are likely to either return 0 or inf,
2377  *         depending on the underlying implementation. If unsafe optimizations
2378  *         are enabled, this is also true for x==0.
2379  *
2380  * \warning You cannot rely on this implementation returning inf for arguments
2381  *          that cause overflow. If you have some very large
2382  *          values and need to rely on getting a valid numerical output,
2383  *          take the minimum of your variable and the largest valid argument
2384  *          before calling this routine.
2385  */
2386 template<MathOptimization opt = MathOptimization::Safe>
2387 static inline SimdDouble gmx_simdcall pow(SimdDouble x, SimdDouble y)
2388 {
2389     SimdDouble xcorr;
2390
2391     if (opt == MathOptimization::Safe)
2392     {
2393         xcorr = max(x, SimdDouble(std::numeric_limits<double>::min()));
2394     }
2395     else
2396     {
2397         xcorr = x;
2398     }
2399
2400     SimdDouble result = exp2<opt>(y * log2(xcorr));
2401
2402     if (opt == MathOptimization::Safe)
2403     {
2404         // if x==0 and y>0 we explicitly set the result to 0.0
2405         // For any x with y==0, the result will already be 1.0 since we multiply by y (0.0) and call exp().
2406         result = blend(result, setZero(), x == setZero() && setZero() < y);
2407     }
2408
2409     return result;
2410 }
2411
2412
2413 /*! \brief SIMD double erf(x).
2414  *
2415  * \param x The value to calculate erf(x) for.
2416  * \result erf(x)
2417  *
2418  * This routine achieves very close to full precision, but we do not care about
2419  * the last bit or the subnormal result range.
2420  */
2421 static inline SimdDouble gmx_simdcall erf(SimdDouble x)
2422 {
2423     // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
2424     const SimdDouble CAP4(-0.431780540597889301512e-4);
2425     const SimdDouble CAP3(-0.00578562306260059236059);
2426     const SimdDouble CAP2(-0.028593586920219752446);
2427     const SimdDouble CAP1(-0.315924962948621698209);
2428     const SimdDouble CAP0(0.14952975608477029151);
2429
2430     const SimdDouble CAQ5(-0.374089300177174709737e-5);
2431     const SimdDouble CAQ4(0.00015126584532155383535);
2432     const SimdDouble CAQ3(0.00536692680669480725423);
2433     const SimdDouble CAQ2(0.0668686825594046122636);
2434     const SimdDouble CAQ1(0.402604990869284362773);
2435     // CAQ0 == 1.0
2436     const SimdDouble CAoffset(0.9788494110107421875);
2437
2438     // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
2439     const SimdDouble CBP6(2.49650423685462752497647637088e-10);
2440     const SimdDouble CBP5(0.00119770193298159629350136085658);
2441     const SimdDouble CBP4(0.0164944422378370965881008942733);
2442     const SimdDouble CBP3(0.0984581468691775932063932439252);
2443     const SimdDouble CBP2(0.317364595806937763843589437418);
2444     const SimdDouble CBP1(0.554167062641455850932670067075);
2445     const SimdDouble CBP0(0.427583576155807163756925301060);
2446     const SimdDouble CBQ7(0.00212288829699830145976198384930);
2447     const SimdDouble CBQ6(0.0334810979522685300554606393425);
2448     const SimdDouble CBQ5(0.2361713785181450957579508850717);
2449     const SimdDouble CBQ4(0.955364736493055670530981883072);
2450     const SimdDouble CBQ3(2.36815675631420037315349279199);
2451     const SimdDouble CBQ2(3.55261649184083035537184223542);
2452     const SimdDouble CBQ1(2.93501136050160872574376997993);
2453     // CBQ0 == 1.0
2454
2455     // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
2456     const SimdDouble CCP6(-2.8175401114513378771);
2457     const SimdDouble CCP5(-3.22729451764143718517);
2458     const SimdDouble CCP4(-2.5518551727311523996);
2459     const SimdDouble CCP3(-0.687717681153649930619);
2460     const SimdDouble CCP2(-0.212652252872804219852);
2461     const SimdDouble CCP1(0.0175389834052493308818);
2462     const SimdDouble CCP0(0.00628057170626964891937);
2463
2464     const SimdDouble CCQ6(5.48409182238641741584);
2465     const SimdDouble CCQ5(13.5064170191802889145);
2466     const SimdDouble CCQ4(22.9367376522880577224);
2467     const SimdDouble CCQ3(15.930646027911794143);
2468     const SimdDouble CCQ2(11.0567237927800161565);
2469     const SimdDouble CCQ1(2.79257750980575282228);
2470     // CCQ0 == 1.0
2471     const SimdDouble CCoffset(0.5579090118408203125);
2472
2473     const SimdDouble one(1.0);
2474     const SimdDouble two(2.0);
2475     const SimdDouble minFloat(std::numeric_limits<float>::min());
2476
2477     SimdDouble xabs, x2, x4, t, t2, w, w2;
2478     SimdDouble PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
2479     SimdDouble PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
2480     SimdDouble PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
2481     SimdDouble res_erf, res_erfcB, res_erfcC, res_erfc, res;
2482     SimdDouble expmx2;
2483     SimdDBool  mask, mask_erf, notmask_erf;
2484
2485     // Calculate erf()
2486     xabs        = abs(x);
2487     mask_erf    = (xabs < one);
2488     notmask_erf = (one <= xabs);
2489     x2          = x * x;
2490     x4          = x2 * x2;
2491
2492     PolyAP0 = fma(CAP4, x4, CAP2);
2493     PolyAP1 = fma(CAP3, x4, CAP1);
2494     PolyAP0 = fma(PolyAP0, x4, CAP0);
2495     PolyAP0 = fma(PolyAP1, x2, PolyAP0);
2496
2497     PolyAQ1 = fma(CAQ5, x4, CAQ3);
2498     PolyAQ0 = fma(CAQ4, x4, CAQ2);
2499     PolyAQ1 = fma(PolyAQ1, x4, CAQ1);
2500     PolyAQ0 = fma(PolyAQ0, x4, one);
2501     PolyAQ0 = fma(PolyAQ1, x2, PolyAQ0);
2502
2503     res_erf = PolyAP0 * maskzInv(PolyAQ0, mask_erf && (minFloat <= abs(PolyAQ0)));
2504     res_erf = CAoffset + res_erf;
2505     res_erf = x * res_erf;
2506
2507     // Calculate erfc() in range [1,4.5]
2508     t  = xabs - one;
2509     t2 = t * t;
2510
2511     PolyBP0 = fma(CBP6, t2, CBP4);
2512     PolyBP1 = fma(CBP5, t2, CBP3);
2513     PolyBP0 = fma(PolyBP0, t2, CBP2);
2514     PolyBP1 = fma(PolyBP1, t2, CBP1);
2515     PolyBP0 = fma(PolyBP0, t2, CBP0);
2516     PolyBP0 = fma(PolyBP1, t, PolyBP0);
2517
2518     PolyBQ1 = fma(CBQ7, t2, CBQ5);
2519     PolyBQ0 = fma(CBQ6, t2, CBQ4);
2520     PolyBQ1 = fma(PolyBQ1, t2, CBQ3);
2521     PolyBQ0 = fma(PolyBQ0, t2, CBQ2);
2522     PolyBQ1 = fma(PolyBQ1, t2, CBQ1);
2523     PolyBQ0 = fma(PolyBQ0, t2, one);
2524     PolyBQ0 = fma(PolyBQ1, t, PolyBQ0);
2525
2526     // The denominator polynomial can be zero outside the range
2527     res_erfcB = PolyBP0 * maskzInv(PolyBQ0, notmask_erf && (minFloat <= abs(PolyBQ0)));
2528
2529     res_erfcB = res_erfcB * xabs;
2530
2531     // Calculate erfc() in range [4.5,inf]
2532     w  = maskzInv(xabs, notmask_erf && (minFloat <= xabs));
2533     w2 = w * w;
2534
2535     PolyCP0 = fma(CCP6, w2, CCP4);
2536     PolyCP1 = fma(CCP5, w2, CCP3);
2537     PolyCP0 = fma(PolyCP0, w2, CCP2);
2538     PolyCP1 = fma(PolyCP1, w2, CCP1);
2539     PolyCP0 = fma(PolyCP0, w2, CCP0);
2540     PolyCP0 = fma(PolyCP1, w, PolyCP0);
2541
2542     PolyCQ0 = fma(CCQ6, w2, CCQ4);
2543     PolyCQ1 = fma(CCQ5, w2, CCQ3);
2544     PolyCQ0 = fma(PolyCQ0, w2, CCQ2);
2545     PolyCQ1 = fma(PolyCQ1, w2, CCQ1);
2546     PolyCQ0 = fma(PolyCQ0, w2, one);
2547     PolyCQ0 = fma(PolyCQ1, w, PolyCQ0);
2548
2549     expmx2 = exp(-x2);
2550
2551     // The denominator polynomial can be zero outside the range
2552     res_erfcC = PolyCP0 * maskzInv(PolyCQ0, notmask_erf && (minFloat <= abs(PolyCQ0)));
2553     res_erfcC = res_erfcC + CCoffset;
2554     res_erfcC = res_erfcC * w;
2555
2556     mask     = (SimdDouble(4.5) < xabs);
2557     res_erfc = blend(res_erfcB, res_erfcC, mask);
2558
2559     res_erfc = res_erfc * expmx2;
2560
2561     // erfc(x<0) = 2-erfc(|x|)
2562     mask     = (x < setZero());
2563     res_erfc = blend(res_erfc, two - res_erfc, mask);
2564
2565     // Select erf() or erfc()
2566     res = blend(one - res_erfc, res_erf, mask_erf);
2567
2568     return res;
2569 }
2570
2571 /*! \brief SIMD double erfc(x).
2572  *
2573  * \param x The value to calculate erfc(x) for.
2574  * \result erfc(x)
2575  *
2576  * This routine achieves full precision (bar the last bit) over most of the
2577  * input range, but for large arguments where the result is getting close
2578  * to the minimum representable numbers we accept slightly larger errors
2579  * (think results that are in the ballpark of 10^-200 for double)
2580  * since that is not relevant for MD.
2581  */
2582 static inline SimdDouble gmx_simdcall erfc(SimdDouble x)
2583 {
2584     // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
2585     const SimdDouble CAP4(-0.431780540597889301512e-4);
2586     const SimdDouble CAP3(-0.00578562306260059236059);
2587     const SimdDouble CAP2(-0.028593586920219752446);
2588     const SimdDouble CAP1(-0.315924962948621698209);
2589     const SimdDouble CAP0(0.14952975608477029151);
2590
2591     const SimdDouble CAQ5(-0.374089300177174709737e-5);
2592     const SimdDouble CAQ4(0.00015126584532155383535);
2593     const SimdDouble CAQ3(0.00536692680669480725423);
2594     const SimdDouble CAQ2(0.0668686825594046122636);
2595     const SimdDouble CAQ1(0.402604990869284362773);
2596     // CAQ0 == 1.0
2597     const SimdDouble CAoffset(0.9788494110107421875);
2598
2599     // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
2600     const SimdDouble CBP6(2.49650423685462752497647637088e-10);
2601     const SimdDouble CBP5(0.00119770193298159629350136085658);
2602     const SimdDouble CBP4(0.0164944422378370965881008942733);
2603     const SimdDouble CBP3(0.0984581468691775932063932439252);
2604     const SimdDouble CBP2(0.317364595806937763843589437418);
2605     const SimdDouble CBP1(0.554167062641455850932670067075);
2606     const SimdDouble CBP0(0.427583576155807163756925301060);
2607     const SimdDouble CBQ7(0.00212288829699830145976198384930);
2608     const SimdDouble CBQ6(0.0334810979522685300554606393425);
2609     const SimdDouble CBQ5(0.2361713785181450957579508850717);
2610     const SimdDouble CBQ4(0.955364736493055670530981883072);
2611     const SimdDouble CBQ3(2.36815675631420037315349279199);
2612     const SimdDouble CBQ2(3.55261649184083035537184223542);
2613     const SimdDouble CBQ1(2.93501136050160872574376997993);
2614     // CBQ0 == 1.0
2615
2616     // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
2617     const SimdDouble CCP6(-2.8175401114513378771);
2618     const SimdDouble CCP5(-3.22729451764143718517);
2619     const SimdDouble CCP4(-2.5518551727311523996);
2620     const SimdDouble CCP3(-0.687717681153649930619);
2621     const SimdDouble CCP2(-0.212652252872804219852);
2622     const SimdDouble CCP1(0.0175389834052493308818);
2623     const SimdDouble CCP0(0.00628057170626964891937);
2624
2625     const SimdDouble CCQ6(5.48409182238641741584);
2626     const SimdDouble CCQ5(13.5064170191802889145);
2627     const SimdDouble CCQ4(22.9367376522880577224);
2628     const SimdDouble CCQ3(15.930646027911794143);
2629     const SimdDouble CCQ2(11.0567237927800161565);
2630     const SimdDouble CCQ1(2.79257750980575282228);
2631     // CCQ0 == 1.0
2632     const SimdDouble CCoffset(0.5579090118408203125);
2633
2634     const SimdDouble one(1.0);
2635     const SimdDouble two(2.0);
2636     const SimdDouble minFloat(std::numeric_limits<float>::min());
2637
2638     SimdDouble xabs, x2, x4, t, t2, w, w2;
2639     SimdDouble PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
2640     SimdDouble PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
2641     SimdDouble PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
2642     SimdDouble res_erf, res_erfcB, res_erfcC, res_erfc, res;
2643     SimdDouble expmx2;
2644     SimdDBool  mask, mask_erf, notmask_erf;
2645
2646     // Calculate erf()
2647     xabs        = abs(x);
2648     mask_erf    = (xabs < one);
2649     notmask_erf = (one <= xabs);
2650     x2          = x * x;
2651     x4          = x2 * x2;
2652
2653     PolyAP0 = fma(CAP4, x4, CAP2);
2654     PolyAP1 = fma(CAP3, x4, CAP1);
2655     PolyAP0 = fma(PolyAP0, x4, CAP0);
2656     PolyAP0 = fma(PolyAP1, x2, PolyAP0);
2657     PolyAQ1 = fma(CAQ5, x4, CAQ3);
2658     PolyAQ0 = fma(CAQ4, x4, CAQ2);
2659     PolyAQ1 = fma(PolyAQ1, x4, CAQ1);
2660     PolyAQ0 = fma(PolyAQ0, x4, one);
2661     PolyAQ0 = fma(PolyAQ1, x2, PolyAQ0);
2662
2663     res_erf = PolyAP0 * maskzInv(PolyAQ0, mask_erf && (minFloat <= abs(PolyAQ0)));
2664     res_erf = CAoffset + res_erf;
2665     res_erf = x * res_erf;
2666
2667     // Calculate erfc() in range [1,4.5]
2668     t  = xabs - one;
2669     t2 = t * t;
2670
2671     PolyBP0 = fma(CBP6, t2, CBP4);
2672     PolyBP1 = fma(CBP5, t2, CBP3);
2673     PolyBP0 = fma(PolyBP0, t2, CBP2);
2674     PolyBP1 = fma(PolyBP1, t2, CBP1);
2675     PolyBP0 = fma(PolyBP0, t2, CBP0);
2676     PolyBP0 = fma(PolyBP1, t, PolyBP0);
2677
2678     PolyBQ1 = fma(CBQ7, t2, CBQ5);
2679     PolyBQ0 = fma(CBQ6, t2, CBQ4);
2680     PolyBQ1 = fma(PolyBQ1, t2, CBQ3);
2681     PolyBQ0 = fma(PolyBQ0, t2, CBQ2);
2682     PolyBQ1 = fma(PolyBQ1, t2, CBQ1);
2683     PolyBQ0 = fma(PolyBQ0, t2, one);
2684     PolyBQ0 = fma(PolyBQ1, t, PolyBQ0);
2685
2686     // The denominator polynomial can be zero outside the range
2687     res_erfcB = PolyBP0 * maskzInv(PolyBQ0, notmask_erf && (minFloat <= abs(PolyBQ0)));
2688
2689     res_erfcB = res_erfcB * xabs;
2690
2691     // Calculate erfc() in range [4.5,inf]
2692     // Note that 1/x can only handle single precision!
2693     w  = maskzInv(xabs, minFloat <= xabs);
2694     w2 = w * w;
2695
2696     PolyCP0 = fma(CCP6, w2, CCP4);
2697     PolyCP1 = fma(CCP5, w2, CCP3);
2698     PolyCP0 = fma(PolyCP0, w2, CCP2);
2699     PolyCP1 = fma(PolyCP1, w2, CCP1);
2700     PolyCP0 = fma(PolyCP0, w2, CCP0);
2701     PolyCP0 = fma(PolyCP1, w, PolyCP0);
2702
2703     PolyCQ0 = fma(CCQ6, w2, CCQ4);
2704     PolyCQ1 = fma(CCQ5, w2, CCQ3);
2705     PolyCQ0 = fma(PolyCQ0, w2, CCQ2);
2706     PolyCQ1 = fma(PolyCQ1, w2, CCQ1);
2707     PolyCQ0 = fma(PolyCQ0, w2, one);
2708     PolyCQ0 = fma(PolyCQ1, w, PolyCQ0);
2709
2710     expmx2 = exp(-x2);
2711
2712     // The denominator polynomial can be zero outside the range
2713     res_erfcC = PolyCP0 * maskzInv(PolyCQ0, notmask_erf && (minFloat <= abs(PolyCQ0)));
2714     res_erfcC = res_erfcC + CCoffset;
2715     res_erfcC = res_erfcC * w;
2716
2717     mask     = (SimdDouble(4.5) < xabs);
2718     res_erfc = blend(res_erfcB, res_erfcC, mask);
2719
2720     res_erfc = res_erfc * expmx2;
2721
2722     // erfc(x<0) = 2-erfc(|x|)
2723     mask     = (x < setZero());
2724     res_erfc = blend(res_erfc, two - res_erfc, mask);
2725
2726     // Select erf() or erfc()
2727     res = blend(res_erfc, one - res_erf, mask_erf);
2728
2729     return res;
2730 }
2731
2732 /*! \brief SIMD double sin \& cos.
2733  *
2734  * \param x The argument to evaluate sin/cos for
2735  * \param[out] sinval Sin(x)
2736  * \param[out] cosval Cos(x)
2737  *
2738  * This version achieves close to machine precision, but for very large
2739  * magnitudes of the argument we inherently begin to lose accuracy due to the
2740  * argument reduction, despite using extended precision arithmetics internally.
2741  */
2742 static inline void gmx_simdcall sincos(SimdDouble x, SimdDouble* sinval, SimdDouble* cosval)
2743 {
2744     // Constants to subtract Pi/4*x from y while minimizing precision loss
2745     const SimdDouble argred0(-2 * 0.78539816290140151978);
2746     const SimdDouble argred1(-2 * 4.9604678871439933374e-10);
2747     const SimdDouble argred2(-2 * 1.1258708853173288931e-18);
2748     const SimdDouble argred3(-2 * 1.7607799325916000908e-27);
2749     const SimdDouble two_over_pi(2.0 / M_PI);
2750     const SimdDouble const_sin5(1.58938307283228937328511e-10);
2751     const SimdDouble const_sin4(-2.50506943502539773349318e-08);
2752     const SimdDouble const_sin3(2.75573131776846360512547e-06);
2753     const SimdDouble const_sin2(-0.000198412698278911770864914);
2754     const SimdDouble const_sin1(0.0083333333333191845961746);
2755     const SimdDouble const_sin0(-0.166666666666666130709393);
2756
2757     const SimdDouble const_cos7(-1.13615350239097429531523e-11);
2758     const SimdDouble const_cos6(2.08757471207040055479366e-09);
2759     const SimdDouble const_cos5(-2.75573144028847567498567e-07);
2760     const SimdDouble const_cos4(2.48015872890001867311915e-05);
2761     const SimdDouble const_cos3(-0.00138888888888714019282329);
2762     const SimdDouble const_cos2(0.0416666666666665519592062);
2763     const SimdDouble half(0.5);
2764     const SimdDouble one(1.0);
2765     SimdDouble       ssign, csign;
2766     SimdDouble       x2, y, z, psin, pcos, sss, ccc;
2767     SimdDBool        mask;
2768 #        if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2769     const SimdDInt32 ione(1);
2770     const SimdDInt32 itwo(2);
2771     SimdDInt32       iy;
2772
2773     z  = x * two_over_pi;
2774     iy = cvtR2I(z);
2775     y  = round(z);
2776
2777     mask  = cvtIB2B((iy & ione) == setZero());
2778     ssign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B((iy & itwo) == itwo));
2779     csign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
2780 #        else
2781     const SimdDouble quarter(0.25);
2782     const SimdDouble minusquarter(-0.25);
2783     SimdDouble       q;
2784     SimdDBool        m1, m2, m3;
2785
2786     /* The most obvious way to find the arguments quadrant in the unit circle
2787      * to calculate the sign is to use integer arithmetic, but that is not
2788      * present in all SIMD implementations. As an alternative, we have devised a
2789      * pure floating-point algorithm that uses truncation for argument reduction
2790      * so that we get a new value 0<=q<1 over the unit circle, and then
2791      * do floating-point comparisons with fractions. This is likely to be
2792      * slightly slower (~10%) due to the longer latencies of floating-point, so
2793      * we only use it when integer SIMD arithmetic is not present.
2794      */
2795     ssign = x;
2796     x     = abs(x);
2797     // It is critical that half-way cases are rounded down
2798     z = fma(x, two_over_pi, half);
2799     y = trunc(z);
2800     q = z * quarter;
2801     q = q - trunc(q);
2802     /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
2803      * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
2804      * This removes the 2*Pi periodicity without using any integer arithmetic.
2805      * First check if y had the value 2 or 3, set csign if true.
2806      */
2807     q = q - half;
2808     /* If we have logical operations we can work directly on the signbit, which
2809      * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
2810      * Thus, if you are altering defines to debug alternative code paths, the
2811      * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
2812      * active or inactive - you will get errors if only one is used.
2813      */
2814 #            if GMX_SIMD_HAVE_LOGICAL
2815     ssign = ssign & SimdDouble(GMX_DOUBLE_NEGZERO);
2816     csign = andNot(q, SimdDouble(GMX_DOUBLE_NEGZERO));
2817     ssign = ssign ^ csign;
2818 #            else
2819     ssign = copysign(SimdDouble(1.0), ssign);
2820     csign = copysign(SimdDouble(1.0), q);
2821     csign = -csign;
2822     ssign = ssign * csign; // swap ssign if csign was set.
2823 #            endif
2824     // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
2825     m1   = (q < minusquarter);
2826     m2   = (setZero() <= q);
2827     m3   = (q < quarter);
2828     m2   = m2 && m3;
2829     mask = m1 || m2;
2830     // where mask is FALSE, swap sign.
2831     csign   = csign * blend(SimdDouble(-1.0), one, mask);
2832 #        endif
2833     x  = fma(y, argred0, x);
2834     x  = fma(y, argred1, x);
2835     x  = fma(y, argred2, x);
2836     x  = fma(y, argred3, x);
2837     x2 = x * x;
2838
2839     psin = fma(const_sin5, x2, const_sin4);
2840     psin = fma(psin, x2, const_sin3);
2841     psin = fma(psin, x2, const_sin2);
2842     psin = fma(psin, x2, const_sin1);
2843     psin = fma(psin, x2, const_sin0);
2844     psin = fma(psin, x2 * x, x);
2845
2846     pcos = fma(const_cos7, x2, const_cos6);
2847     pcos = fma(pcos, x2, const_cos5);
2848     pcos = fma(pcos, x2, const_cos4);
2849     pcos = fma(pcos, x2, const_cos3);
2850     pcos = fma(pcos, x2, const_cos2);
2851     pcos = fms(pcos, x2, half);
2852     pcos = fma(pcos, x2, one);
2853
2854     sss = blend(pcos, psin, mask);
2855     ccc = blend(psin, pcos, mask);
2856     // See comment for GMX_SIMD_HAVE_LOGICAL section above.
2857 #        if GMX_SIMD_HAVE_LOGICAL
2858     *sinval = sss ^ ssign;
2859     *cosval = ccc ^ csign;
2860 #        else
2861     *sinval = sss * ssign;
2862     *cosval = ccc * csign;
2863 #        endif
2864 }
2865
2866 /*! \brief SIMD double sin(x).
2867  *
2868  * \param x The argument to evaluate sin for
2869  * \result Sin(x)
2870  *
2871  * \attention Do NOT call both sin & cos if you need both results, since each of them
2872  * will then call \ref sincos and waste a factor 2 in performance.
2873  */
2874 static inline SimdDouble gmx_simdcall sin(SimdDouble x)
2875 {
2876     SimdDouble s, c;
2877     sincos(x, &s, &c);
2878     return s;
2879 }
2880
2881 /*! \brief SIMD double cos(x).
2882  *
2883  * \param x The argument to evaluate cos for
2884  * \result Cos(x)
2885  *
2886  * \attention Do NOT call both sin & cos if you need both results, since each of them
2887  * will then call \ref sincos and waste a factor 2 in performance.
2888  */
2889 static inline SimdDouble gmx_simdcall cos(SimdDouble x)
2890 {
2891     SimdDouble s, c;
2892     sincos(x, &s, &c);
2893     return c;
2894 }
2895
2896 /*! \brief SIMD double tan(x).
2897  *
2898  * \param x The argument to evaluate tan for
2899  * \result Tan(x)
2900  */
2901 static inline SimdDouble gmx_simdcall tan(SimdDouble x)
2902 {
2903     const SimdDouble argred0(-2 * 0.78539816290140151978);
2904     const SimdDouble argred1(-2 * 4.9604678871439933374e-10);
2905     const SimdDouble argred2(-2 * 1.1258708853173288931e-18);
2906     const SimdDouble argred3(-2 * 1.7607799325916000908e-27);
2907     const SimdDouble two_over_pi(2.0 / M_PI);
2908     const SimdDouble CT15(1.01419718511083373224408e-05);
2909     const SimdDouble CT14(-2.59519791585924697698614e-05);
2910     const SimdDouble CT13(5.23388081915899855325186e-05);
2911     const SimdDouble CT12(-3.05033014433946488225616e-05);
2912     const SimdDouble CT11(7.14707504084242744267497e-05);
2913     const SimdDouble CT10(8.09674518280159187045078e-05);
2914     const SimdDouble CT9(0.000244884931879331847054404);
2915     const SimdDouble CT8(0.000588505168743587154904506);
2916     const SimdDouble CT7(0.00145612788922812427978848);
2917     const SimdDouble CT6(0.00359208743836906619142924);
2918     const SimdDouble CT5(0.00886323944362401618113356);
2919     const SimdDouble CT4(0.0218694882853846389592078);
2920     const SimdDouble CT3(0.0539682539781298417636002);
2921     const SimdDouble CT2(0.133333333333125941821962);
2922     const SimdDouble CT1(0.333333333333334980164153);
2923     const SimdDouble minFloat(std::numeric_limits<float>::min());
2924
2925     SimdDouble x2, p, y, z;
2926     SimdDBool  m;
2927
2928 #        if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2929     SimdDInt32 iy;
2930     SimdDInt32 ione(1);
2931
2932     z  = x * two_over_pi;
2933     iy = cvtR2I(z);
2934     y  = round(z);
2935     m  = cvtIB2B((iy & ione) == ione);
2936
2937     x = fma(y, argred0, x);
2938     x = fma(y, argred1, x);
2939     x = fma(y, argred2, x);
2940     x = fma(y, argred3, x);
2941     x = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), m) ^ x;
2942 #        else
2943     const SimdDouble quarter(0.25);
2944     const SimdDouble half(0.5);
2945     const SimdDouble threequarter(0.75);
2946     const SimdDouble minFloat(std::numeric_limits<float>::min());
2947     SimdDouble       w, q;
2948     SimdDBool        m1, m2, m3;
2949
2950     w  = abs(x);
2951     z  = fma(w, two_over_pi, half);
2952     y  = trunc(z);
2953     q  = z * quarter;
2954     q  = q - trunc(q);
2955     m1 = (quarter <= q);
2956     m2 = (q < half);
2957     m3 = (threequarter <= q);
2958     m1 = m1 && m2;
2959     m  = m1 || m3;
2960     w  = fma(y, argred0, w);
2961     w  = fma(y, argred1, w);
2962     w  = fma(y, argred2, w);
2963     w  = fma(y, argred3, w);
2964
2965     w     = blend(w, -w, m);
2966     x     = w * copysign(SimdDouble(1.0), x);
2967 #        endif
2968     x2 = x * x;
2969     p  = fma(CT15, x2, CT14);
2970     p  = fma(p, x2, CT13);
2971     p  = fma(p, x2, CT12);
2972     p  = fma(p, x2, CT11);
2973     p  = fma(p, x2, CT10);
2974     p  = fma(p, x2, CT9);
2975     p  = fma(p, x2, CT8);
2976     p  = fma(p, x2, CT7);
2977     p  = fma(p, x2, CT6);
2978     p  = fma(p, x2, CT5);
2979     p  = fma(p, x2, CT4);
2980     p  = fma(p, x2, CT3);
2981     p  = fma(p, x2, CT2);
2982     p  = fma(p, x2, CT1);
2983     p  = fma(x2, p * x, x);
2984
2985     p = blend(p, maskzInv(p, m && (minFloat < abs(p))), m);
2986     return p;
2987 }
2988
2989 /*! \brief SIMD double asin(x).
2990  *
2991  * \param x The argument to evaluate asin for
2992  * \result Asin(x)
2993  */
2994 static inline SimdDouble gmx_simdcall asin(SimdDouble x)
2995 {
2996     // Same algorithm as cephes library
2997     const SimdDouble limit1(0.625);
2998     const SimdDouble limit2(1e-8);
2999     const SimdDouble one(1.0);
3000     const SimdDouble quarterpi(M_PI / 4.0);
3001     const SimdDouble morebits(6.123233995736765886130e-17);
3002
3003     const SimdDouble P5(4.253011369004428248960e-3);
3004     const SimdDouble P4(-6.019598008014123785661e-1);
3005     const SimdDouble P3(5.444622390564711410273e0);
3006     const SimdDouble P2(-1.626247967210700244449e1);
3007     const SimdDouble P1(1.956261983317594739197e1);
3008     const SimdDouble P0(-8.198089802484824371615e0);
3009
3010     const SimdDouble Q4(-1.474091372988853791896e1);
3011     const SimdDouble Q3(7.049610280856842141659e1);
3012     const SimdDouble Q2(-1.471791292232726029859e2);
3013     const SimdDouble Q1(1.395105614657485689735e2);
3014     const SimdDouble Q0(-4.918853881490881290097e1);
3015
3016     const SimdDouble R4(2.967721961301243206100e-3);
3017     const SimdDouble R3(-5.634242780008963776856e-1);
3018     const SimdDouble R2(6.968710824104713396794e0);
3019     const SimdDouble R1(-2.556901049652824852289e1);
3020     const SimdDouble R0(2.853665548261061424989e1);
3021
3022     const SimdDouble S3(-2.194779531642920639778e1);
3023     const SimdDouble S2(1.470656354026814941758e2);
3024     const SimdDouble S1(-3.838770957603691357202e2);
3025     const SimdDouble S0(3.424398657913078477438e2);
3026
3027     SimdDouble xabs;
3028     SimdDouble zz, ww, z, q, w, zz2, ww2;
3029     SimdDouble PA, PB;
3030     SimdDouble QA, QB;
3031     SimdDouble RA, RB;
3032     SimdDouble SA, SB;
3033     SimdDouble nom, denom;
3034     SimdDBool  mask, mask2;
3035
3036     xabs = abs(x);
3037
3038     mask = (limit1 < xabs);
3039
3040     zz  = one - xabs;
3041     ww  = xabs * xabs;
3042     zz2 = zz * zz;
3043     ww2 = ww * ww;
3044
3045     // R
3046     RA = fma(R4, zz2, R2);
3047     RB = fma(R3, zz2, R1);
3048     RA = fma(RA, zz2, R0);
3049     RA = fma(RB, zz, RA);
3050
3051     // S, SA = zz2
3052     SB = fma(S3, zz2, S1);
3053     SA = zz2 + S2;
3054     SA = fma(SA, zz2, S0);
3055     SA = fma(SB, zz, SA);
3056
3057     // P
3058     PA = fma(P5, ww2, P3);
3059     PB = fma(P4, ww2, P2);
3060     PA = fma(PA, ww2, P1);
3061     PB = fma(PB, ww2, P0);
3062     PA = fma(PA, ww, PB);
3063
3064     // Q, QA = ww2
3065     QB = fma(Q4, ww2, Q2);
3066     QA = ww2 + Q3;
3067     QA = fma(QA, ww2, Q1);
3068     QB = fma(QB, ww2, Q0);
3069     QA = fma(QA, ww, QB);
3070
3071     RA = RA * zz;
3072     PA = PA * ww;
3073
3074     nom   = blend(PA, RA, mask);
3075     denom = blend(QA, SA, mask);
3076
3077     mask2 = (limit2 < xabs);
3078     q     = nom * maskzInv(denom, mask2);
3079
3080     zz = zz + zz;
3081     zz = sqrt(zz);
3082     z  = quarterpi - zz;
3083     zz = fms(zz, q, morebits);
3084     z  = z - zz;
3085     z  = z + quarterpi;
3086
3087     w = xabs * q;
3088     w = w + xabs;
3089
3090     z = blend(w, z, mask);
3091
3092     z = blend(xabs, z, mask2);
3093
3094     z = copysign(z, x);
3095
3096     return z;
3097 }
3098
3099 /*! \brief SIMD double acos(x).
3100  *
3101  * \param x The argument to evaluate acos for
3102  * \result Acos(x)
3103  */
3104 static inline SimdDouble gmx_simdcall acos(SimdDouble x)
3105 {
3106     const SimdDouble one(1.0);
3107     const SimdDouble half(0.5);
3108     const SimdDouble quarterpi0(7.85398163397448309616e-1);
3109     const SimdDouble quarterpi1(6.123233995736765886130e-17);
3110
3111     SimdDBool  mask1;
3112     SimdDouble z, z1, z2;
3113
3114     mask1 = (half < x);
3115     z1    = half * (one - x);
3116     z1    = sqrt(z1);
3117     z     = blend(x, z1, mask1);
3118
3119     z = asin(z);
3120
3121     z1 = z + z;
3122
3123     z2 = quarterpi0 - z;
3124     z2 = z2 + quarterpi1;
3125     z2 = z2 + quarterpi0;
3126
3127     z = blend(z2, z1, mask1);
3128
3129     return z;
3130 }
3131
3132 /*! \brief SIMD double asin(x).
3133  *
3134  * \param x The argument to evaluate atan for
3135  * \result Atan(x), same argument/value range as standard math library.
3136  */
3137 static inline SimdDouble gmx_simdcall atan(SimdDouble x)
3138 {
3139     // Same algorithm as cephes library
3140     const SimdDouble limit1(0.66);
3141     const SimdDouble limit2(2.41421356237309504880);
3142     const SimdDouble quarterpi(M_PI / 4.0);
3143     const SimdDouble halfpi(M_PI / 2.0);
3144     const SimdDouble mone(-1.0);
3145     const SimdDouble morebits1(0.5 * 6.123233995736765886130E-17);
3146     const SimdDouble morebits2(6.123233995736765886130E-17);
3147
3148     const SimdDouble P4(-8.750608600031904122785E-1);
3149     const SimdDouble P3(-1.615753718733365076637E1);
3150     const SimdDouble P2(-7.500855792314704667340E1);
3151     const SimdDouble P1(-1.228866684490136173410E2);
3152     const SimdDouble P0(-6.485021904942025371773E1);
3153
3154     const SimdDouble Q4(2.485846490142306297962E1);
3155     const SimdDouble Q3(1.650270098316988542046E2);
3156     const SimdDouble Q2(4.328810604912902668951E2);
3157     const SimdDouble Q1(4.853903996359136964868E2);
3158     const SimdDouble Q0(1.945506571482613964425E2);
3159
3160     SimdDouble y, xabs, t1, t2;
3161     SimdDouble z, z2;
3162     SimdDouble P_A, P_B, Q_A, Q_B;
3163     SimdDBool  mask1, mask2;
3164
3165     xabs = abs(x);
3166
3167     mask1 = (limit1 < xabs);
3168     mask2 = (limit2 < xabs);
3169
3170     t1 = (xabs + mone) * maskzInv(xabs - mone, mask1);
3171     t2 = mone * maskzInv(xabs, mask2);
3172
3173     y    = selectByMask(quarterpi, mask1);
3174     y    = blend(y, halfpi, mask2);
3175     xabs = blend(xabs, t1, mask1);
3176     xabs = blend(xabs, t2, mask2);
3177
3178     z  = xabs * xabs;
3179     z2 = z * z;
3180
3181     P_A = fma(P4, z2, P2);
3182     P_B = fma(P3, z2, P1);
3183     P_A = fma(P_A, z2, P0);
3184     P_A = fma(P_B, z, P_A);
3185
3186     // Q_A = z2
3187     Q_B = fma(Q4, z2, Q2);
3188     Q_A = z2 + Q3;
3189     Q_A = fma(Q_A, z2, Q1);
3190     Q_B = fma(Q_B, z2, Q0);
3191     Q_A = fma(Q_A, z, Q_B);
3192
3193     z = z * P_A;
3194     z = z * inv(Q_A);
3195     z = fma(z, xabs, xabs);
3196
3197     t1 = selectByMask(morebits1, mask1);
3198     t1 = blend(t1, morebits2, mask2);
3199
3200     z = z + t1;
3201     y = y + z;
3202
3203     y = copysign(y, x);
3204
3205     return y;
3206 }
3207
3208 /*! \brief SIMD double atan2(y,x).
3209  *
3210  * \param y Y component of vector, any quartile
3211  * \param x X component of vector, any quartile
3212  * \result Atan(y,x), same argument/value range as standard math library.
3213  *
3214  * \note This routine should provide correct results for all finite
3215  * non-zero or positive-zero arguments. However, negative zero arguments will
3216  * be treated as positive zero, which means the return value will deviate from
3217  * the standard math library atan2(y,x) for those cases. That should not be
3218  * of any concern in Gromacs, and in particular it will not affect calculations
3219  * of angles from vectors.
3220  */
3221 static inline SimdDouble gmx_simdcall atan2(SimdDouble y, SimdDouble x)
3222 {
3223     const SimdDouble pi(M_PI);
3224     const SimdDouble halfpi(M_PI / 2.0);
3225     const SimdDouble minFloat(std::numeric_limits<float>::min());
3226     SimdDouble       xinv, p, aoffset;
3227     SimdDBool        mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
3228
3229     mask_xnz  = x != setZero();
3230     mask_ynz  = y != setZero();
3231     mask_xlt0 = (x < setZero());
3232     mask_ylt0 = (y < setZero());
3233
3234     aoffset = selectByNotMask(halfpi, mask_xnz);
3235     aoffset = selectByMask(aoffset, mask_ynz);
3236
3237     aoffset = blend(aoffset, pi, mask_xlt0);
3238     aoffset = blend(aoffset, -aoffset, mask_ylt0);
3239
3240     xinv = maskzInv(x, mask_xnz && (minFloat <= abs(x)));
3241     p    = y * xinv;
3242     p    = atan(p);
3243     p    = p + aoffset;
3244
3245     return p;
3246 }
3247
3248
3249 /*! \brief Calculate the force correction due to PME analytically in SIMD double.
3250  *
3251  * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
3252  *           interaction distance and beta the ewald splitting parameters.
3253  * \result Correction factor to coulomb force.
3254  *
3255  * This routine is meant to enable analytical evaluation of the
3256  * direct-space PME electrostatic force to avoid tables. For details, see the
3257  * single precision function.
3258  */
3259 static inline SimdDouble gmx_simdcall pmeForceCorrection(SimdDouble z2)
3260 {
3261     const SimdDouble FN10(-8.0072854618360083154e-14);
3262     const SimdDouble FN9(1.1859116242260148027e-11);
3263     const SimdDouble FN8(-8.1490406329798423616e-10);
3264     const SimdDouble FN7(3.4404793543907847655e-8);
3265     const SimdDouble FN6(-9.9471420832602741006e-7);
3266     const SimdDouble FN5(0.000020740315999115847456);
3267     const SimdDouble FN4(-0.00031991745139313364005);
3268     const SimdDouble FN3(0.0035074449373659008203);
3269     const SimdDouble FN2(-0.031750380176100813405);
3270     const SimdDouble FN1(0.13884101728898463426);
3271     const SimdDouble FN0(-0.75225277815249618847);
3272
3273     const SimdDouble FD5(0.000016009278224355026701);
3274     const SimdDouble FD4(0.00051055686934806966046);
3275     const SimdDouble FD3(0.0081803507497974289008);
3276     const SimdDouble FD2(0.077181146026670287235);
3277     const SimdDouble FD1(0.41543303143712535988);
3278     const SimdDouble FD0(1.0);
3279
3280     SimdDouble z4;
3281     SimdDouble polyFN0, polyFN1, polyFD0, polyFD1;
3282
3283     z4 = z2 * z2;
3284
3285     polyFD1 = fma(FD5, z4, FD3);
3286     polyFD1 = fma(polyFD1, z4, FD1);
3287     polyFD1 = polyFD1 * z2;
3288     polyFD0 = fma(FD4, z4, FD2);
3289     polyFD0 = fma(polyFD0, z4, FD0);
3290     polyFD0 = polyFD0 + polyFD1;
3291
3292     polyFD0 = inv(polyFD0);
3293
3294     polyFN0 = fma(FN10, z4, FN8);
3295     polyFN0 = fma(polyFN0, z4, FN6);
3296     polyFN0 = fma(polyFN0, z4, FN4);
3297     polyFN0 = fma(polyFN0, z4, FN2);
3298     polyFN0 = fma(polyFN0, z4, FN0);
3299     polyFN1 = fma(FN9, z4, FN7);
3300     polyFN1 = fma(polyFN1, z4, FN5);
3301     polyFN1 = fma(polyFN1, z4, FN3);
3302     polyFN1 = fma(polyFN1, z4, FN1);
3303     polyFN0 = fma(polyFN1, z2, polyFN0);
3304
3305
3306     return polyFN0 * polyFD0;
3307 }
3308
3309
3310 /*! \brief Calculate the potential correction due to PME analytically in SIMD double.
3311  *
3312  * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
3313  *           interaction distance and beta the ewald splitting parameters.
3314  * \result Correction factor to coulomb force.
3315  *
3316  * This routine is meant to enable analytical evaluation of the
3317  * direct-space PME electrostatic potential to avoid tables. For details, see the
3318  * single precision function.
3319  */
3320 static inline SimdDouble gmx_simdcall pmePotentialCorrection(SimdDouble z2)
3321 {
3322     const SimdDouble VN9(-9.3723776169321855475e-13);
3323     const SimdDouble VN8(1.2280156762674215741e-10);
3324     const SimdDouble VN7(-7.3562157912251309487e-9);
3325     const SimdDouble VN6(2.6215886208032517509e-7);
3326     const SimdDouble VN5(-4.9532491651265819499e-6);
3327     const SimdDouble VN4(0.00025907400778966060389);
3328     const SimdDouble VN3(0.0010585044856156469792);
3329     const SimdDouble VN2(0.045247661136833092885);
3330     const SimdDouble VN1(0.11643931522926034421);
3331     const SimdDouble VN0(1.1283791671726767970);
3332
3333     const SimdDouble VD5(0.000021784709867336150342);
3334     const SimdDouble VD4(0.00064293662010911388448);
3335     const SimdDouble VD3(0.0096311444822588683504);
3336     const SimdDouble VD2(0.085608012351550627051);
3337     const SimdDouble VD1(0.43652499166614811084);
3338     const SimdDouble VD0(1.0);
3339
3340     SimdDouble z4;
3341     SimdDouble polyVN0, polyVN1, polyVD0, polyVD1;
3342
3343     z4 = z2 * z2;
3344
3345     polyVD1 = fma(VD5, z4, VD3);
3346     polyVD0 = fma(VD4, z4, VD2);
3347     polyVD1 = fma(polyVD1, z4, VD1);
3348     polyVD0 = fma(polyVD0, z4, VD0);
3349     polyVD0 = fma(polyVD1, z2, polyVD0);
3350
3351     polyVD0 = inv(polyVD0);
3352
3353     polyVN1 = fma(VN9, z4, VN7);
3354     polyVN0 = fma(VN8, z4, VN6);
3355     polyVN1 = fma(polyVN1, z4, VN5);
3356     polyVN0 = fma(polyVN0, z4, VN4);
3357     polyVN1 = fma(polyVN1, z4, VN3);
3358     polyVN0 = fma(polyVN0, z4, VN2);
3359     polyVN1 = fma(polyVN1, z4, VN1);
3360     polyVN0 = fma(polyVN0, z4, VN0);
3361     polyVN0 = fma(polyVN1, z2, polyVN0);
3362
3363     return polyVN0 * polyVD0;
3364 }
3365
3366 /*! \} */
3367
3368
3369 /*! \name SIMD math functions for double prec. data, single prec. accuracy
3370  *
3371  *  \note In some cases we do not need full double accuracy of individual
3372  *        SIMD math functions, although the data is stored in double precision
3373  *        SIMD registers. This might be the case for special algorithms, or
3374  *        if the architecture does not support single precision.
3375  *        Since the full double precision evaluation of math functions
3376  *        typically require much more expensive polynomial approximations
3377  *        these functions implement the algorithms used in the single precision
3378  *        SIMD math functions, but they operate on double precision
3379  *        SIMD variables.
3380  *
3381  *  \{
3382  */
3383
3384 /*********************************************************************
3385  * SIMD MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
3386  *********************************************************************/
3387
3388 /*! \brief Calculate 1/sqrt(x) for SIMD double, but in single accuracy.
3389  *
3390  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
3391  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
3392  *           For the single precision implementation this is obviously always
3393  *           true for positive values, but for double precision it adds an
3394  *           extra restriction since the first lookup step might have to be
3395  *           performed in single precision on some architectures. Note that the
3396  *           responsibility for checking falls on you - this routine does not
3397  *           check arguments.
3398  *
3399  *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
3400  */
3401 static inline SimdDouble gmx_simdcall invsqrtSingleAccuracy(SimdDouble x)
3402 {
3403     SimdDouble lu = rsqrt(x);
3404 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3405     lu = rsqrtIter(lu, x);
3406 #        endif
3407 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3408     lu = rsqrtIter(lu, x);
3409 #        endif
3410 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3411     lu = rsqrtIter(lu, x);
3412 #        endif
3413     return lu;
3414 }
3415
3416 /*! \brief 1/sqrt(x) for masked-in entries of SIMD double, but in single accuracy.
3417  *
3418  *  This routine only evaluates 1/sqrt(x) for elements for which mask is true.
3419  *  Illegal values in the masked-out elements will not lead to
3420  *  floating-point exceptions.
3421  *
3422  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
3423  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
3424  *           For the single precision implementation this is obviously always
3425  *           true for positive values, but for double precision it adds an
3426  *           extra restriction since the first lookup step might have to be
3427  *           performed in single precision on some architectures. Note that the
3428  *           responsibility for checking falls on you - this routine does not
3429  *           check arguments.
3430  *
3431  *  \param m Mask
3432  *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
3433  *          entry was not masked, and 0.0 for masked-out entries.
3434  */
3435 static inline SimdDouble maskzInvsqrtSingleAccuracy(SimdDouble x, SimdDBool m)
3436 {
3437     SimdDouble lu = maskzRsqrt(x, m);
3438 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3439     lu = rsqrtIter(lu, x);
3440 #        endif
3441 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3442     lu = rsqrtIter(lu, x);
3443 #        endif
3444 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3445     lu = rsqrtIter(lu, x);
3446 #        endif
3447     return lu;
3448 }
3449
3450 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles, but single accuracy.
3451  *
3452  * \param x0  First set of arguments, x0 must be in single range (see below).
3453  * \param x1  Second set of arguments, x1 must be in single range (see below).
3454  * \param[out] out0  Result 1/sqrt(x0)
3455  * \param[out] out1  Result 1/sqrt(x1)
3456  *
3457  *  In particular for double precision we can sometimes calculate square root
3458  *  pairs slightly faster by using single precision until the very last step.
3459  *
3460  * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
3461  *       GMX_FLOAT_MAX, i.e. within the range of single precision.
3462  *       For the single precision implementation this is obviously always
3463  *       true for positive values, but for double precision it adds an
3464  *       extra restriction since the first lookup step might have to be
3465  *       performed in single precision on some architectures. Note that the
3466  *       responsibility for checking falls on you - this routine does not
3467  *       check arguments.
3468  */
3469 static inline void gmx_simdcall invsqrtPairSingleAccuracy(SimdDouble  x0,
3470                                                           SimdDouble  x1,
3471                                                           SimdDouble* out0,
3472                                                           SimdDouble* out1)
3473 {
3474 #        if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2 * GMX_SIMD_DOUBLE_WIDTH) \
3475                 && (GMX_SIMD_RSQRT_BITS < 22)
3476     SimdFloat  xf  = cvtDD2F(x0, x1);
3477     SimdFloat  luf = rsqrt(xf);
3478     SimdDouble lu0, lu1;
3479     // Intermediate target is single - mantissa+1 bits
3480 #            if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3481     luf = rsqrtIter(luf, xf);
3482 #            endif
3483 #            if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3484     luf = rsqrtIter(luf, xf);
3485 #            endif
3486 #            if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3487     luf = rsqrtIter(luf, xf);
3488 #            endif
3489     cvtF2DD(luf, &lu0, &lu1);
3490     // We now have single-precision accuracy values in lu0/lu1
3491     *out0 = lu0;
3492     *out1 = lu1;
3493 #        else
3494     *out0 = invsqrtSingleAccuracy(x0);
3495     *out1 = invsqrtSingleAccuracy(x1);
3496 #        endif
3497 }
3498
3499 /*! \brief Calculate 1/x for SIMD double, but in single accuracy.
3500  *
3501  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
3502  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
3503  *           For the single precision implementation this is obviously always
3504  *           true for positive values, but for double precision it adds an
3505  *           extra restriction since the first lookup step might have to be
3506  *           performed in single precision on some architectures. Note that the
3507  *           responsibility for checking falls on you - this routine does not
3508  *           check arguments.
3509  *
3510  *  \return 1/x. Result is undefined if your argument was invalid.
3511  */
3512 static inline SimdDouble gmx_simdcall invSingleAccuracy(SimdDouble x)
3513 {
3514     SimdDouble lu = rcp(x);
3515 #        if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3516     lu = rcpIter(lu, x);
3517 #        endif
3518 #        if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3519     lu = rcpIter(lu, x);
3520 #        endif
3521 #        if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3522     lu = rcpIter(lu, x);
3523 #        endif
3524     return lu;
3525 }
3526
3527 /*! \brief 1/x for masked entries of SIMD double, single accuracy.
3528  *
3529  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
3530  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
3531  *           For the single precision implementation this is obviously always
3532  *           true for positive values, but for double precision it adds an
3533  *           extra restriction since the first lookup step might have to be
3534  *           performed in single precision on some architectures. Note that the
3535  *           responsibility for checking falls on you - this routine does not
3536  *           check arguments.
3537  *
3538  *  \param m Mask
3539  *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
3540  */
3541 static inline SimdDouble gmx_simdcall maskzInvSingleAccuracy(SimdDouble x, SimdDBool m)
3542 {
3543     SimdDouble lu = maskzRcp(x, m);
3544 #        if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3545     lu = rcpIter(lu, x);
3546 #        endif
3547 #        if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3548     lu = rcpIter(lu, x);
3549 #        endif
3550 #        if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3551     lu = rcpIter(lu, x);
3552 #        endif
3553     return lu;
3554 }
3555
3556
3557 /*! \brief Calculate sqrt(x) (correct for 0.0) for SIMD double, with single accuracy.
3558  *
3559  *  \copydetails sqrt(SimdFloat)
3560  */
3561 template<MathOptimization opt = MathOptimization::Safe>
3562 static inline SimdDouble gmx_simdcall sqrtSingleAccuracy(SimdDouble x)
3563 {
3564     if (opt == MathOptimization::Safe)
3565     {
3566         SimdDouble res = maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) < x);
3567         return res * x;
3568     }
3569     else
3570     {
3571         return x * invsqrtSingleAccuracy(x);
3572     }
3573 }
3574
3575 /*! \brief Cube root for SIMD doubles, single accuracy.
3576  *
3577  * \param x      Argument to calculate cube root of. Can be negative or zero,
3578  *               but NaN or Inf values are not supported. Denormal values will
3579  *               be treated as 0.0.
3580  * \return       Cube root of x.
3581  */
3582 static inline SimdDouble gmx_simdcall cbrtSingleAccuracy(SimdDouble x)
3583 {
3584     const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
3585     const SimdDouble minDouble(std::numeric_limits<double>::min());
3586     // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
3587     // Use the divided value as original constant to avoid division warnings.
3588     const std::int32_t offsetDiv3(341);
3589     const SimdDouble   c2(-0.191502161678719066);
3590     const SimdDouble   c1(0.697570460207922770);
3591     const SimdDouble   c0(0.492659620528969547);
3592     const SimdDouble   one(1.0);
3593     const SimdDouble   two(2.0);
3594     const SimdDouble   three(3.0);
3595     const SimdDouble   oneThird(1.0 / 3.0);
3596     const SimdDouble   cbrt2(1.2599210498948731648);
3597     const SimdDouble   sqrCbrt2(1.5874010519681994748);
3598
3599     // See the single precision routines for documentation of the algorithm
3600
3601     SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
3602     SimdDouble xAbs     = andNot(signBit, x);    // select everthing but the sign bit => abs(x)
3603     SimdDBool  xIsNonZero = (minDouble <= xAbs); // treat denormals as 0
3604
3605     SimdDInt32 exponent;
3606     SimdDouble y        = frexp(xAbs, &exponent);
3607     SimdDouble z        = fma(fma(y, c2, c1), y, c0);
3608     SimdDouble w        = z * z * z;
3609     SimdDouble nom      = z * fma(two, y, w);
3610     SimdDouble invDenom = inv(fma(two, w, y));
3611
3612     SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
3613     SimdDouble offsetExpDiv3 =
3614             trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
3615     SimdDInt32 expDiv3   = cvtR2I(offsetExpDiv3 - SimdDouble(static_cast<double>(offsetDiv3)));
3616     SimdDouble remainder = offsetExp - offsetExpDiv3 * three;
3617     SimdDouble factor    = blend(one, cbrt2, SimdDouble(0.5) < remainder);
3618     factor               = blend(factor, sqrCbrt2, SimdDouble(1.5) < remainder);
3619     SimdDouble fraction  = (nom * invDenom * factor) ^ xSignBit;
3620     SimdDouble result    = selectByMask(ldexp(fraction, expDiv3), xIsNonZero);
3621     return result;
3622 }
3623
3624 /*! \brief Inverse cube root for SIMD doubles, single accuracy.
3625  *
3626  * \param x      Argument to calculate cube root of. Can be positive or
3627  *               negative, but the magnitude cannot be lower than
3628  *               the smallest normal number.
3629  * \return       Cube root of x. Undefined for values that don't
3630  *               fulfill the restriction of abs(x) > minDouble.
3631  */
3632 static inline SimdDouble gmx_simdcall invcbrtSingleAccuracy(SimdDouble x)
3633 {
3634     const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
3635     // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
3636     // Use the divided value as original constant to avoid division warnings.
3637     const std::int32_t offsetDiv3(341);
3638     const SimdDouble   c2(-0.191502161678719066);
3639     const SimdDouble   c1(0.697570460207922770);
3640     const SimdDouble   c0(0.492659620528969547);
3641     const SimdDouble   one(1.0);
3642     const SimdDouble   two(2.0);
3643     const SimdDouble   three(3.0);
3644     const SimdDouble   oneThird(1.0 / 3.0);
3645     const SimdDouble   invCbrt2(1.0 / 1.2599210498948731648);
3646     const SimdDouble   invSqrCbrt2(1.0F / 1.5874010519681994748);
3647
3648     // See the single precision routines for documentation of the algorithm
3649
3650     SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
3651     SimdDouble xAbs     = andNot(signBit, x); // select everthing but the sign bit => abs(x)
3652
3653     SimdDInt32 exponent;
3654     SimdDouble y         = frexp(xAbs, &exponent);
3655     SimdDouble z         = fma(fma(y, c2, c1), y, c0);
3656     SimdDouble w         = z * z * z;
3657     SimdDouble nom       = fma(two, w, y);
3658     SimdDouble invDenom  = inv(z * fma(two, y, w));
3659     SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
3660     SimdDouble offsetExpDiv3 =
3661             trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
3662     SimdDInt32 expDiv3   = cvtR2I(SimdDouble(static_cast<double>(offsetDiv3)) - offsetExpDiv3);
3663     SimdDouble remainder = offsetExpDiv3 * three - offsetExp;
3664     SimdDouble factor    = blend(one, invCbrt2, remainder < SimdDouble(-0.5));
3665     factor               = blend(factor, invSqrCbrt2, remainder < SimdDouble(-1.5));
3666     SimdDouble fraction  = (nom * invDenom * factor) ^ xSignBit;
3667     SimdDouble result    = ldexp(fraction, expDiv3);
3668     return result;
3669 }
3670
3671 /*! \brief SIMD log2(x). Double precision SIMD data, single accuracy.
3672  *
3673  * \param x Argument, should be >0.
3674  * \result The base 2 logarithm of x. Undefined if argument is invalid.
3675  */
3676 static inline SimdDouble gmx_simdcall log2SingleAccuracy(SimdDouble x)
3677 {
3678 #        if GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
3679     return log(x) * SimdDouble(std::log2(std::exp(1.0)));
3680 #        else
3681     const SimdDouble one(1.0);
3682     const SimdDouble two(2.0);
3683     const SimdDouble sqrt2(std::sqrt(2.0));
3684     const SimdDouble CL9(0.342149508897807708152F);
3685     const SimdDouble CL7(0.411570606888219447939F);
3686     const SimdDouble CL5(0.577085979152320294183F);
3687     const SimdDouble CL3(0.961796550607099898222F);
3688     const SimdDouble CL1(2.885390081777926774009F);
3689     SimdDouble       fexp, x2, p;
3690     SimdDInt32       iexp;
3691     SimdDBool        mask;
3692
3693     x    = frexp(x, &iexp);
3694     fexp = cvtI2R(iexp);
3695
3696     mask = (x < sqrt2);
3697     // Adjust to non-IEEE format for x<sqrt(2): exponent -= 1, mantissa *= 2.0
3698     fexp = fexp - selectByMask(one, mask);
3699     x    = x * blend(one, two, mask);
3700
3701     x  = (x - one) * invSingleAccuracy(x + one);
3702     x2 = x * x;
3703
3704     p = fma(CL9, x2, CL7);
3705     p = fma(p, x2, CL5);
3706     p = fma(p, x2, CL3);
3707     p = fma(p, x2, CL1);
3708     p = fma(p, x, fexp);
3709
3710     return p;
3711 #        endif
3712 }
3713
3714 /*! \brief SIMD log(x). Double precision SIMD data, single accuracy.
3715  *
3716  * \param x Argument, should be >0.
3717  * \result The natural logarithm of x. Undefined if argument is invalid.
3718  */
3719 static inline SimdDouble gmx_simdcall logSingleAccuracy(SimdDouble x)
3720 {
3721 #        if GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
3722     return log(x);
3723 #        else
3724     const SimdDouble one(1.0);
3725     const SimdDouble two(2.0);
3726     const SimdDouble invsqrt2(1.0 / std::sqrt(2.0));
3727     const SimdDouble corr(0.693147180559945286226764);
3728     const SimdDouble CL9(0.2371599674224853515625);
3729     const SimdDouble CL7(0.285279005765914916992188);
3730     const SimdDouble CL5(0.400005519390106201171875);
3731     const SimdDouble CL3(0.666666567325592041015625);
3732     const SimdDouble CL1(2.0);
3733     SimdDouble       fexp, x2, p;
3734     SimdDInt32       iexp;
3735     SimdDBool        mask;
3736
3737     x    = frexp(x, &iexp);
3738     fexp = cvtI2R(iexp);
3739
3740     mask = x < invsqrt2;
3741     // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
3742     fexp = fexp - selectByMask(one, mask);
3743     x    = x * blend(one, two, mask);
3744
3745     x  = (x - one) * invSingleAccuracy(x + one);
3746     x2 = x * x;
3747
3748     p = fma(CL9, x2, CL7);
3749     p = fma(p, x2, CL5);
3750     p = fma(p, x2, CL3);
3751     p = fma(p, x2, CL1);
3752     p = fma(p, x, corr * fexp);
3753
3754     return p;
3755 #        endif
3756 }
3757
3758 /*! \brief SIMD 2^x. Double precision SIMD, single accuracy.
3759  *
3760  * \copydetails exp2(SimdFloat)
3761  */
3762 template<MathOptimization opt = MathOptimization::Safe>
3763 static inline SimdDouble gmx_simdcall exp2SingleAccuracy(SimdDouble x)
3764 {
3765 #        if GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE
3766     return exp2(x);
3767 #        else
3768     const SimdDouble CC6(0.0001534581200287996416911311);
3769     const SimdDouble CC5(0.001339993121934088894618990);
3770     const SimdDouble CC4(0.009618488957115180159497841);
3771     const SimdDouble CC3(0.05550328776964726865751735);
3772     const SimdDouble CC2(0.2402264689063408646490722);
3773     const SimdDouble CC1(0.6931472057372680777553816);
3774     const SimdDouble one(1.0);
3775
3776     SimdDouble intpart;
3777     SimdDouble p;
3778     SimdDInt32 ix;
3779
3780     // Large negative values are valid arguments to exp2(), so there are two
3781     // things we need to account for:
3782     // 1. When the exponents reaches -1023, the (biased) exponent field will be
3783     //    zero and we can no longer multiply with it. There are special IEEE
3784     //    formats to handle this range, but for now we have to accept that
3785     //    we cannot handle those arguments. If input value becomes even more
3786     //    negative, it will start to loop and we would end up with invalid
3787     //    exponents. Thus, we need to limit or mask this.
3788     // 2. For VERY large negative values, we will have problems that the
3789     //    subtraction to get the fractional part loses accuracy, and then we
3790     //    can end up with overflows in the polynomial.
3791     //
3792     // For now, we handle this by forwarding the math optimization setting to
3793     // ldexp, where the routine will return zero for very small arguments.
3794     //
3795     // However, before doing that we need to make sure we do not call cvtR2I
3796     // with an argument that is so negative it cannot be converted to an integer.
3797     if (opt == MathOptimization::Safe)
3798     {
3799         x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()));
3800     }
3801
3802     ix      = cvtR2I(x);
3803     intpart = round(x);
3804     x       = x - intpart;
3805
3806     p = fma(CC6, x, CC5);
3807     p = fma(p, x, CC4);
3808     p = fma(p, x, CC3);
3809     p = fma(p, x, CC2);
3810     p = fma(p, x, CC1);
3811     p = fma(p, x, one);
3812     x = ldexp<opt>(p, ix);
3813
3814     return x;
3815 #        endif
3816 }
3817
3818
3819 /*! \brief SIMD exp(x). Double precision SIMD, single accuracy.
3820  *
3821  * \copydetails exp(SimdFloat)
3822  */
3823 template<MathOptimization opt = MathOptimization::Safe>
3824 static inline SimdDouble gmx_simdcall expSingleAccuracy(SimdDouble x)
3825 {
3826 #        if GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE
3827     return exp(x);
3828 #        else
3829     const SimdDouble argscale(1.44269504088896341);
3830     // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
3831     const SimdDouble smallArgLimit(-709.0895657128);
3832     const SimdDouble invargscale(-0.69314718055994528623);
3833     const SimdDouble CC4(0.00136324646882712841033936);
3834     const SimdDouble CC3(0.00836596917361021041870117);
3835     const SimdDouble CC2(0.0416710823774337768554688);
3836     const SimdDouble CC1(0.166665524244308471679688);
3837     const SimdDouble CC0(0.499999850988388061523438);
3838     const SimdDouble one(1.0);
3839     SimdDouble       intpart;
3840     SimdDouble       y, p;
3841     SimdDInt32       iy;
3842
3843     // Large negative values are valid arguments to exp2(), so there are two
3844     // things we need to account for:
3845     // 1. When the exponents reaches -1023, the (biased) exponent field will be
3846     //    zero and we can no longer multiply with it. There are special IEEE
3847     //    formats to handle this range, but for now we have to accept that
3848     //    we cannot handle those arguments. If input value becomes even more
3849     //    negative, it will start to loop and we would end up with invalid
3850     //    exponents. Thus, we need to limit or mask this.
3851     // 2. For VERY large negative values, we will have problems that the
3852     //    subtraction to get the fractional part loses accuracy, and then we
3853     //    can end up with overflows in the polynomial.
3854     //
3855     // For now, we handle this by forwarding the math optimization setting to
3856     // ldexp, where the routine will return zero for very small arguments.
3857     //
3858     // However, before doing that we need to make sure we do not call cvtR2I
3859     // with an argument that is so negative it cannot be converted to an integer
3860     // after being multiplied by argscale.
3861
3862     if (opt == MathOptimization::Safe)
3863     {
3864         x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()) / argscale);
3865     }
3866
3867     y = x * argscale;
3868
3869     iy      = cvtR2I(y);
3870     intpart = round(y); // use same rounding algorithm here
3871
3872     // Extended precision arithmetics not needed since
3873     // we have double precision and only need single accuracy.
3874     x = fma(invargscale, intpart, x);
3875
3876     p = fma(CC4, x, CC3);
3877     p = fma(p, x, CC2);
3878     p = fma(p, x, CC1);
3879     p = fma(p, x, CC0);
3880     p = fma(x * x, p, x);
3881     p = p + one;
3882     x = ldexp<opt>(p, iy);
3883     return x;
3884 #        endif
3885 }
3886
3887 /*! \brief SIMD pow(x,y). Double precision SIMD data, single accuracy.
3888  *
3889  * This returns x^y for SIMD values.
3890  *
3891  * \tparam opt If this is changed from the default (safe) into the unsafe
3892  *             option, there are no guarantees about correct results for x==0.
3893  *
3894  * \param x Base.
3895  *
3896  * \param y exponent.
3897
3898  * \result x^y. Overflowing arguments are likely to either return 0 or inf,
3899  *         depending on the underlying implementation. If unsafe optimizations
3900  *         are enabled, this is also true for x==0.
3901  *
3902  * \warning You cannot rely on this implementation returning inf for arguments
3903  *          that cause overflow. If you have some very large
3904  *          values and need to rely on getting a valid numerical output,
3905  *          take the minimum of your variable and the largest valid argument
3906  *          before calling this routine.
3907  */
3908 template<MathOptimization opt = MathOptimization::Safe>
3909 static inline SimdDouble gmx_simdcall powSingleAccuracy(SimdDouble x, SimdDouble y)
3910 {
3911     SimdDouble xcorr;
3912
3913     if (opt == MathOptimization::Safe)
3914     {
3915         xcorr = max(x, SimdDouble(std::numeric_limits<double>::min()));
3916     }
3917     else
3918     {
3919         xcorr = x;
3920     }
3921
3922     SimdDouble result = exp2SingleAccuracy<opt>(y * log2SingleAccuracy(xcorr));
3923
3924     if (opt == MathOptimization::Safe)
3925     {
3926         // if x==0 and y>0 we explicitly set the result to 0.0
3927         // For any x with y==0, the result will already be 1.0 since we multiply by y (0.0) and call exp().
3928         result = blend(result, setZero(), x == setZero() && setZero() < y);
3929     }
3930
3931     return result;
3932 }
3933
3934 /*! \brief SIMD erf(x). Double precision SIMD data, single accuracy.
3935  *
3936  * \param x The value to calculate erf(x) for.
3937  * \result erf(x)
3938  *
3939  * This routine achieves very close to single precision, but we do not care about
3940  * the last bit or the subnormal result range.
3941  */
3942 static inline SimdDouble gmx_simdcall erfSingleAccuracy(SimdDouble x)
3943 {
3944     // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
3945     const SimdDouble CA6(7.853861353153693e-5);
3946     const SimdDouble CA5(-8.010193625184903e-4);
3947     const SimdDouble CA4(5.188327685732524e-3);
3948     const SimdDouble CA3(-2.685381193529856e-2);
3949     const SimdDouble CA2(1.128358514861418e-1);
3950     const SimdDouble CA1(-3.761262582423300e-1);
3951     const SimdDouble CA0(1.128379165726710);
3952     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
3953     const SimdDouble CB9(-0.0018629930017603923);
3954     const SimdDouble CB8(0.003909821287598495);
3955     const SimdDouble CB7(-0.0052094582210355615);
3956     const SimdDouble CB6(0.005685614362160572);
3957     const SimdDouble CB5(-0.0025367682853477272);
3958     const SimdDouble CB4(-0.010199799682318782);
3959     const SimdDouble CB3(0.04369575504816542);
3960     const SimdDouble CB2(-0.11884063474674492);
3961     const SimdDouble CB1(0.2732120154030589);
3962     const SimdDouble CB0(0.42758357702025784);
3963     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
3964     const SimdDouble CC10(-0.0445555913112064);
3965     const SimdDouble CC9(0.21376355144663348);
3966     const SimdDouble CC8(-0.3473187200259257);
3967     const SimdDouble CC7(0.016690861551248114);
3968     const SimdDouble CC6(0.7560973182491192);
3969     const SimdDouble CC5(-1.2137903600145787);
3970     const SimdDouble CC4(0.8411872321232948);
3971     const SimdDouble CC3(-0.08670413896296343);
3972     const SimdDouble CC2(-0.27124782687240334);
3973     const SimdDouble CC1(-0.0007502488047806069);
3974     const SimdDouble CC0(0.5642114853803148);
3975     const SimdDouble one(1.0);
3976     const SimdDouble two(2.0);
3977
3978     SimdDouble x2, x4, y;
3979     SimdDouble t, t2, w, w2;
3980     SimdDouble pA0, pA1, pB0, pB1, pC0, pC1;
3981     SimdDouble expmx2;
3982     SimdDouble res_erf, res_erfc, res;
3983     SimdDBool  mask, msk_erf;
3984
3985     // Calculate erf()
3986     x2 = x * x;
3987     x4 = x2 * x2;
3988
3989     pA0 = fma(CA6, x4, CA4);
3990     pA1 = fma(CA5, x4, CA3);
3991     pA0 = fma(pA0, x4, CA2);
3992     pA1 = fma(pA1, x4, CA1);
3993     pA0 = pA0 * x4;
3994     pA0 = fma(pA1, x2, pA0);
3995     // Constant term must come last for precision reasons
3996     pA0 = pA0 + CA0;
3997
3998     res_erf = x * pA0;
3999
4000     // Calculate erfc
4001     y       = abs(x);
4002     msk_erf = (SimdDouble(0.75) <= y);
4003     t       = maskzInvSingleAccuracy(y, msk_erf);
4004     w       = t - one;
4005     t2      = t * t;
4006     w2      = w * w;
4007
4008     expmx2 = expSingleAccuracy(-y * y);
4009
4010     pB1 = fma(CB9, w2, CB7);
4011     pB0 = fma(CB8, w2, CB6);
4012     pB1 = fma(pB1, w2, CB5);
4013     pB0 = fma(pB0, w2, CB4);
4014     pB1 = fma(pB1, w2, CB3);
4015     pB0 = fma(pB0, w2, CB2);
4016     pB1 = fma(pB1, w2, CB1);
4017     pB0 = fma(pB0, w2, CB0);
4018     pB0 = fma(pB1, w, pB0);
4019
4020     pC0 = fma(CC10, t2, CC8);
4021     pC1 = fma(CC9, t2, CC7);
4022     pC0 = fma(pC0, t2, CC6);
4023     pC1 = fma(pC1, t2, CC5);
4024     pC0 = fma(pC0, t2, CC4);
4025     pC1 = fma(pC1, t2, CC3);
4026     pC0 = fma(pC0, t2, CC2);
4027     pC1 = fma(pC1, t2, CC1);
4028
4029     pC0 = fma(pC0, t2, CC0);
4030     pC0 = fma(pC1, t, pC0);
4031     pC0 = pC0 * t;
4032
4033     // Select pB0 or pC0 for erfc()
4034     mask     = (two < y);
4035     res_erfc = blend(pB0, pC0, mask);
4036     res_erfc = res_erfc * expmx2;
4037
4038     // erfc(x<0) = 2-erfc(|x|)
4039     mask     = (x < setZero());
4040     res_erfc = blend(res_erfc, two - res_erfc, mask);
4041
4042     // Select erf() or erfc()
4043     mask = (y < SimdDouble(0.75));
4044     res  = blend(one - res_erfc, res_erf, mask);
4045
4046     return res;
4047 }
4048
4049 /*! \brief SIMD erfc(x). Double precision SIMD data, single accuracy.
4050  *
4051  * \param x The value to calculate erfc(x) for.
4052  * \result erfc(x)
4053  *
4054  * This routine achieves singleprecision (bar the last bit) over most of the
4055  * input range, but for large arguments where the result is getting close
4056  * to the minimum representable numbers we accept slightly larger errors
4057  * (think results that are in the ballpark of 10^-30) since that is not
4058  * relevant for MD.
4059  */
4060 static inline SimdDouble gmx_simdcall erfcSingleAccuracy(SimdDouble x)
4061 {
4062     // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
4063     const SimdDouble CA6(7.853861353153693e-5);
4064     const SimdDouble CA5(-8.010193625184903e-4);
4065     const SimdDouble CA4(5.188327685732524e-3);
4066     const SimdDouble CA3(-2.685381193529856e-2);
4067     const SimdDouble CA2(1.128358514861418e-1);
4068     const SimdDouble CA1(-3.761262582423300e-1);
4069     const SimdDouble CA0(1.128379165726710);
4070     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
4071     const SimdDouble CB9(-0.0018629930017603923);
4072     const SimdDouble CB8(0.003909821287598495);
4073     const SimdDouble CB7(-0.0052094582210355615);
4074     const SimdDouble CB6(0.005685614362160572);
4075     const SimdDouble CB5(-0.0025367682853477272);
4076     const SimdDouble CB4(-0.010199799682318782);
4077     const SimdDouble CB3(0.04369575504816542);
4078     const SimdDouble CB2(-0.11884063474674492);
4079     const SimdDouble CB1(0.2732120154030589);
4080     const SimdDouble CB0(0.42758357702025784);
4081     // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
4082     const SimdDouble CC10(-0.0445555913112064);
4083     const SimdDouble CC9(0.21376355144663348);
4084     const SimdDouble CC8(-0.3473187200259257);
4085     const SimdDouble CC7(0.016690861551248114);
4086     const SimdDouble CC6(0.7560973182491192);
4087     const SimdDouble CC5(-1.2137903600145787);
4088     const SimdDouble CC4(0.8411872321232948);
4089     const SimdDouble CC3(-0.08670413896296343);
4090     const SimdDouble CC2(-0.27124782687240334);
4091     const SimdDouble CC1(-0.0007502488047806069);
4092     const SimdDouble CC0(0.5642114853803148);
4093     const SimdDouble one(1.0);
4094     const SimdDouble two(2.0);
4095
4096     SimdDouble x2, x4, y;
4097     SimdDouble t, t2, w, w2;
4098     SimdDouble pA0, pA1, pB0, pB1, pC0, pC1;
4099     SimdDouble expmx2;
4100     SimdDouble res_erf, res_erfc, res;
4101     SimdDBool  mask, msk_erf;
4102
4103     // Calculate erf()
4104     x2 = x * x;
4105     x4 = x2 * x2;
4106
4107     pA0 = fma(CA6, x4, CA4);
4108     pA1 = fma(CA5, x4, CA3);
4109     pA0 = fma(pA0, x4, CA2);
4110     pA1 = fma(pA1, x4, CA1);
4111     pA1 = pA1 * x2;
4112     pA0 = fma(pA0, x4, pA1);
4113     // Constant term must come last for precision reasons
4114     pA0 = pA0 + CA0;
4115
4116     res_erf = x * pA0;
4117
4118     // Calculate erfc
4119     y       = abs(x);
4120     msk_erf = (SimdDouble(0.75) <= y);
4121     t       = maskzInvSingleAccuracy(y, msk_erf);
4122     w       = t - one;
4123     t2      = t * t;
4124     w2      = w * w;
4125
4126     expmx2 = expSingleAccuracy(-y * y);
4127
4128     pB1 = fma(CB9, w2, CB7);
4129     pB0 = fma(CB8, w2, CB6);
4130     pB1 = fma(pB1, w2, CB5);
4131     pB0 = fma(pB0, w2, CB4);
4132     pB1 = fma(pB1, w2, CB3);
4133     pB0 = fma(pB0, w2, CB2);
4134     pB1 = fma(pB1, w2, CB1);
4135     pB0 = fma(pB0, w2, CB0);
4136     pB0 = fma(pB1, w, pB0);
4137
4138     pC0 = fma(CC10, t2, CC8);
4139     pC1 = fma(CC9, t2, CC7);
4140     pC0 = fma(pC0, t2, CC6);
4141     pC1 = fma(pC1, t2, CC5);
4142     pC0 = fma(pC0, t2, CC4);
4143     pC1 = fma(pC1, t2, CC3);
4144     pC0 = fma(pC0, t2, CC2);
4145     pC1 = fma(pC1, t2, CC1);
4146
4147     pC0 = fma(pC0, t2, CC0);
4148     pC0 = fma(pC1, t, pC0);
4149     pC0 = pC0 * t;
4150
4151     // Select pB0 or pC0 for erfc()
4152     mask     = (two < y);
4153     res_erfc = blend(pB0, pC0, mask);
4154     res_erfc = res_erfc * expmx2;
4155
4156     // erfc(x<0) = 2-erfc(|x|)
4157     mask     = (x < setZero());
4158     res_erfc = blend(res_erfc, two - res_erfc, mask);
4159
4160     // Select erf() or erfc()
4161     mask = (y < SimdDouble(0.75));
4162     res  = blend(res_erfc, one - res_erf, mask);
4163
4164     return res;
4165 }
4166
4167 /*! \brief SIMD sin \& cos. Double precision SIMD data, single accuracy.
4168  *
4169  * \param x The argument to evaluate sin/cos for
4170  * \param[out] sinval Sin(x)
4171  * \param[out] cosval Cos(x)
4172  */
4173 static inline void gmx_simdcall sinCosSingleAccuracy(SimdDouble x, SimdDouble* sinval, SimdDouble* cosval)
4174 {
4175     // Constants to subtract Pi/4*x from y while minimizing precision loss
4176     const SimdDouble argred0(2 * 0.78539816290140151978);
4177     const SimdDouble argred1(2 * 4.9604678871439933374e-10);
4178     const SimdDouble argred2(2 * 1.1258708853173288931e-18);
4179     const SimdDouble two_over_pi(2.0 / M_PI);
4180     const SimdDouble const_sin2(-1.9515295891e-4);
4181     const SimdDouble const_sin1(8.3321608736e-3);
4182     const SimdDouble const_sin0(-1.6666654611e-1);
4183     const SimdDouble const_cos2(2.443315711809948e-5);
4184     const SimdDouble const_cos1(-1.388731625493765e-3);
4185     const SimdDouble const_cos0(4.166664568298827e-2);
4186
4187     const SimdDouble half(0.5);
4188     const SimdDouble one(1.0);
4189     SimdDouble       ssign, csign;
4190     SimdDouble       x2, y, z, psin, pcos, sss, ccc;
4191     SimdDBool        mask;
4192
4193 #        if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
4194     const SimdDInt32 ione(1);
4195     const SimdDInt32 itwo(2);
4196     SimdDInt32       iy;
4197
4198     z  = x * two_over_pi;
4199     iy = cvtR2I(z);
4200     y  = round(z);
4201
4202     mask  = cvtIB2B((iy & ione) == setZero());
4203     ssign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B((iy & itwo) == itwo));
4204     csign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
4205 #        else
4206     const SimdDouble quarter(0.25);
4207     const SimdDouble minusquarter(-0.25);
4208     SimdDouble       q;
4209     SimdDBool        m1, m2, m3;
4210
4211     /* The most obvious way to find the arguments quadrant in the unit circle
4212      * to calculate the sign is to use integer arithmetic, but that is not
4213      * present in all SIMD implementations. As an alternative, we have devised a
4214      * pure floating-point algorithm that uses truncation for argument reduction
4215      * so that we get a new value 0<=q<1 over the unit circle, and then
4216      * do floating-point comparisons with fractions. This is likely to be
4217      * slightly slower (~10%) due to the longer latencies of floating-point, so
4218      * we only use it when integer SIMD arithmetic is not present.
4219      */
4220     ssign = x;
4221     x     = abs(x);
4222     // It is critical that half-way cases are rounded down
4223     z = fma(x, two_over_pi, half);
4224     y = trunc(z);
4225     q = z * quarter;
4226     q = q - trunc(q);
4227     /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
4228      * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
4229      * This removes the 2*Pi periodicity without using any integer arithmetic.
4230      * First check if y had the value 2 or 3, set csign if true.
4231      */
4232     q = q - half;
4233     /* If we have logical operations we can work directly on the signbit, which
4234      * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
4235      * Thus, if you are altering defines to debug alternative code paths, the
4236      * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
4237      * active or inactive - you will get errors if only one is used.
4238      */
4239 #            if GMX_SIMD_HAVE_LOGICAL
4240     ssign = ssign & SimdDouble(GMX_DOUBLE_NEGZERO);
4241     csign = andNot(q, SimdDouble(GMX_DOUBLE_NEGZERO));
4242     ssign = ssign ^ csign;
4243 #            else
4244     ssign = copysign(SimdDouble(1.0), ssign);
4245     csign = copysign(SimdDouble(1.0), q);
4246     csign = -csign;
4247     ssign = ssign * csign; // swap ssign if csign was set.
4248 #            endif
4249     // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
4250     m1   = (q < minusquarter);
4251     m2   = (setZero() <= q);
4252     m3   = (q < quarter);
4253     m2   = m2 && m3;
4254     mask = m1 || m2;
4255     // where mask is FALSE, swap sign.
4256     csign   = csign * blend(SimdDouble(-1.0), one, mask);
4257 #        endif
4258     x  = fnma(y, argred0, x);
4259     x  = fnma(y, argred1, x);
4260     x  = fnma(y, argred2, x);
4261     x2 = x * x;
4262
4263     psin = fma(const_sin2, x2, const_sin1);
4264     psin = fma(psin, x2, const_sin0);
4265     psin = fma(psin, x * x2, x);
4266     pcos = fma(const_cos2, x2, const_cos1);
4267     pcos = fma(pcos, x2, const_cos0);
4268     pcos = fms(pcos, x2, half);
4269     pcos = fma(pcos, x2, one);
4270
4271     sss = blend(pcos, psin, mask);
4272     ccc = blend(psin, pcos, mask);
4273     // See comment for GMX_SIMD_HAVE_LOGICAL section above.
4274 #        if GMX_SIMD_HAVE_LOGICAL
4275     *sinval = sss ^ ssign;
4276     *cosval = ccc ^ csign;
4277 #        else
4278     *sinval = sss * ssign;
4279     *cosval = ccc * csign;
4280 #        endif
4281 }
4282
4283 /*! \brief SIMD sin(x). Double precision SIMD data, single accuracy.
4284  *
4285  * \param x The argument to evaluate sin for
4286  * \result Sin(x)
4287  *
4288  * \attention Do NOT call both sin & cos if you need both results, since each of them
4289  * will then call \ref sincos and waste a factor 2 in performance.
4290  */
4291 static inline SimdDouble gmx_simdcall sinSingleAccuracy(SimdDouble x)
4292 {
4293     SimdDouble s, c;
4294     sinCosSingleAccuracy(x, &s, &c);
4295     return s;
4296 }
4297
4298 /*! \brief SIMD cos(x). Double precision SIMD data, single accuracy.
4299  *
4300  * \param x The argument to evaluate cos for
4301  * \result Cos(x)
4302  *
4303  * \attention Do NOT call both sin & cos if you need both results, since each of them
4304  * will then call \ref sincos and waste a factor 2 in performance.
4305  */
4306 static inline SimdDouble gmx_simdcall cosSingleAccuracy(SimdDouble x)
4307 {
4308     SimdDouble s, c;
4309     sinCosSingleAccuracy(x, &s, &c);
4310     return c;
4311 }
4312
4313 /*! \brief SIMD tan(x). Double precision SIMD data, single accuracy.
4314  *
4315  * \param x The argument to evaluate tan for
4316  * \result Tan(x)
4317  */
4318 static inline SimdDouble gmx_simdcall tanSingleAccuracy(SimdDouble x)
4319 {
4320     const SimdDouble argred0(2 * 0.78539816290140151978);
4321     const SimdDouble argred1(2 * 4.9604678871439933374e-10);
4322     const SimdDouble argred2(2 * 1.1258708853173288931e-18);
4323     const SimdDouble two_over_pi(2.0 / M_PI);
4324     const SimdDouble CT6(0.009498288995810566122993911);
4325     const SimdDouble CT5(0.002895755790837379295226923);
4326     const SimdDouble CT4(0.02460087336161924491836265);
4327     const SimdDouble CT3(0.05334912882656359828045988);
4328     const SimdDouble CT2(0.1333989091464957704418495);
4329     const SimdDouble CT1(0.3333307599244198227797507);
4330
4331     SimdDouble x2, p, y, z;
4332     SimdDBool  mask;
4333
4334 #        if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
4335     SimdDInt32 iy;
4336     SimdDInt32 ione(1);
4337
4338     z    = x * two_over_pi;
4339     iy   = cvtR2I(z);
4340     y    = round(z);
4341     mask = cvtIB2B((iy & ione) == ione);
4342
4343     x = fnma(y, argred0, x);
4344     x = fnma(y, argred1, x);
4345     x = fnma(y, argred2, x);
4346     x = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), mask) ^ x;
4347 #        else
4348     const SimdDouble quarter(0.25);
4349     const SimdDouble half(0.5);
4350     const SimdDouble threequarter(0.75);
4351     SimdDouble       w, q;
4352     SimdDBool        m1, m2, m3;
4353
4354     w    = abs(x);
4355     z    = fma(w, two_over_pi, half);
4356     y    = trunc(z);
4357     q    = z * quarter;
4358     q    = q - trunc(q);
4359     m1   = (quarter <= q);
4360     m2   = (q < half);
4361     m3   = (threequarter <= q);
4362     m1   = m1 && m2;
4363     mask = m1 || m3;
4364     w    = fnma(y, argred0, w);
4365     w    = fnma(y, argred1, w);
4366     w    = fnma(y, argred2, w);
4367
4368     w = blend(w, -w, mask);
4369     x = w * copysign(SimdDouble(1.0), x);
4370 #        endif
4371     x2 = x * x;
4372     p  = fma(CT6, x2, CT5);
4373     p  = fma(p, x2, CT4);
4374     p  = fma(p, x2, CT3);
4375     p  = fma(p, x2, CT2);
4376     p  = fma(p, x2, CT1);
4377     p  = fma(x2, p * x, x);
4378
4379     p = blend(p, maskzInvSingleAccuracy(p, mask), mask);
4380     return p;
4381 }
4382
4383 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
4384  *
4385  * \param x The argument to evaluate asin for
4386  * \result Asin(x)
4387  */
4388 static inline SimdDouble gmx_simdcall asinSingleAccuracy(SimdDouble x)
4389 {
4390     const SimdDouble limitlow(1e-4);
4391     const SimdDouble half(0.5);
4392     const SimdDouble one(1.0);
4393     const SimdDouble halfpi(M_PI / 2.0);
4394     const SimdDouble CC5(4.2163199048E-2);
4395     const SimdDouble CC4(2.4181311049E-2);
4396     const SimdDouble CC3(4.5470025998E-2);
4397     const SimdDouble CC2(7.4953002686E-2);
4398     const SimdDouble CC1(1.6666752422E-1);
4399     SimdDouble       xabs;
4400     SimdDouble       z, z1, z2, q, q1, q2;
4401     SimdDouble       pA, pB;
4402     SimdDBool        mask, mask2;
4403
4404     xabs  = abs(x);
4405     mask  = (half < xabs);
4406     z1    = half * (one - xabs);
4407     mask2 = (xabs < one);
4408     q1    = z1 * maskzInvsqrtSingleAccuracy(z1, mask2);
4409     q2    = xabs;
4410     z2    = q2 * q2;
4411     z     = blend(z2, z1, mask);
4412     q     = blend(q2, q1, mask);
4413
4414     z2 = z * z;
4415     pA = fma(CC5, z2, CC3);
4416     pB = fma(CC4, z2, CC2);
4417     pA = fma(pA, z2, CC1);
4418     pA = pA * z;
4419     z  = fma(pB, z2, pA);
4420     z  = fma(z, q, q);
4421     q2 = halfpi - z;
4422     q2 = q2 - z;
4423     z  = blend(z, q2, mask);
4424
4425     mask = (limitlow < xabs);
4426     z    = blend(xabs, z, mask);
4427     z    = copysign(z, x);
4428
4429     return z;
4430 }
4431
4432 /*! \brief SIMD acos(x). Double precision SIMD data, single accuracy.
4433  *
4434  * \param x The argument to evaluate acos for
4435  * \result Acos(x)
4436  */
4437 static inline SimdDouble gmx_simdcall acosSingleAccuracy(SimdDouble x)
4438 {
4439     const SimdDouble one(1.0);
4440     const SimdDouble half(0.5);
4441     const SimdDouble pi(M_PI);
4442     const SimdDouble halfpi(M_PI / 2.0);
4443     SimdDouble       xabs;
4444     SimdDouble       z, z1, z2, z3;
4445     SimdDBool        mask1, mask2, mask3;
4446
4447     xabs  = abs(x);
4448     mask1 = (half < xabs);
4449     mask2 = (setZero() < x);
4450
4451     z     = half * (one - xabs);
4452     mask3 = (xabs < one);
4453     z     = z * maskzInvsqrtSingleAccuracy(z, mask3);
4454     z     = blend(x, z, mask1);
4455     z     = asinSingleAccuracy(z);
4456
4457     z2 = z + z;
4458     z1 = pi - z2;
4459     z3 = halfpi - z;
4460     z  = blend(z1, z2, mask2);
4461     z  = blend(z3, z, mask1);
4462
4463     return z;
4464 }
4465
4466 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
4467  *
4468  * \param x The argument to evaluate atan for
4469  * \result Atan(x), same argument/value range as standard math library.
4470  */
4471 static inline SimdDouble gmx_simdcall atanSingleAccuracy(SimdDouble x)
4472 {
4473     const SimdDouble halfpi(M_PI / 2);
4474     const SimdDouble CA17(0.002823638962581753730774);
4475     const SimdDouble CA15(-0.01595690287649631500244);
4476     const SimdDouble CA13(0.04250498861074447631836);
4477     const SimdDouble CA11(-0.07489009201526641845703);
4478     const SimdDouble CA9(0.1063479334115982055664);
4479     const SimdDouble CA7(-0.1420273631811141967773);
4480     const SimdDouble CA5(0.1999269574880599975585);
4481     const SimdDouble CA3(-0.3333310186862945556640);
4482     SimdDouble       x2, x3, x4, pA, pB;
4483     SimdDBool        mask, mask2;
4484
4485     mask  = (x < setZero());
4486     x     = abs(x);
4487     mask2 = (SimdDouble(1.0) < x);
4488     x     = blend(x, maskzInvSingleAccuracy(x, mask2), mask2);
4489
4490     x2 = x * x;
4491     x3 = x2 * x;
4492     x4 = x2 * x2;
4493     pA = fma(CA17, x4, CA13);
4494     pB = fma(CA15, x4, CA11);
4495     pA = fma(pA, x4, CA9);
4496     pB = fma(pB, x4, CA7);
4497     pA = fma(pA, x4, CA5);
4498     pB = fma(pB, x4, CA3);
4499     pA = fma(pA, x2, pB);
4500     pA = fma(pA, x3, x);
4501
4502     pA = blend(pA, halfpi - pA, mask2);
4503     pA = blend(pA, -pA, mask);
4504
4505     return pA;
4506 }
4507
4508 /*! \brief SIMD atan2(y,x). Double precision SIMD data, single accuracy.
4509  *
4510  * \param y Y component of vector, any quartile
4511  * \param x X component of vector, any quartile
4512  * \result Atan(y,x), same argument/value range as standard math library.
4513  *
4514  * \note This routine should provide correct results for all finite
4515  * non-zero or positive-zero arguments. However, negative zero arguments will
4516  * be treated as positive zero, which means the return value will deviate from
4517  * the standard math library atan2(y,x) for those cases. That should not be
4518  * of any concern in Gromacs, and in particular it will not affect calculations
4519  * of angles from vectors.
4520  */
4521 static inline SimdDouble gmx_simdcall atan2SingleAccuracy(SimdDouble y, SimdDouble x)
4522 {
4523     const SimdDouble pi(M_PI);
4524     const SimdDouble halfpi(M_PI / 2.0);
4525     SimdDouble       xinv, p, aoffset;
4526     SimdDBool        mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
4527
4528     mask_xnz  = x != setZero();
4529     mask_ynz  = y != setZero();
4530     mask_xlt0 = (x < setZero());
4531     mask_ylt0 = (y < setZero());
4532
4533     aoffset = selectByNotMask(halfpi, mask_xnz);
4534     aoffset = selectByMask(aoffset, mask_ynz);
4535
4536     aoffset = blend(aoffset, pi, mask_xlt0);
4537     aoffset = blend(aoffset, -aoffset, mask_ylt0);
4538
4539     xinv = maskzInvSingleAccuracy(x, mask_xnz);
4540     p    = y * xinv;
4541     p    = atanSingleAccuracy(p);
4542     p    = p + aoffset;
4543
4544     return p;
4545 }
4546
4547 /*! \brief Analytical PME force correction, double SIMD data, single accuracy.
4548  *
4549  * \param z2 \f$(r \beta)^2\f$ - see below for details.
4550  * \result Correction factor to coulomb force - see below for details.
4551  *
4552  * This routine is meant to enable analytical evaluation of the
4553  * direct-space PME electrostatic force to avoid tables.
4554  *
4555  * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
4556  * are some problems evaluating that:
4557  *
4558  * First, the error function is difficult (read: expensive) to
4559  * approxmiate accurately for intermediate to large arguments, and
4560  * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
4561  * Second, we now try to avoid calculating potentials in Gromacs but
4562  * use forces directly.
4563  *
4564  * We can simply things slight by noting that the PME part is really
4565  * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
4566  * \f[
4567  * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
4568  * \f]
4569  * The first term we already have from the inverse square root, so
4570  * that we can leave out of this routine.
4571  *
4572  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
4573  * the argument \f$beta r\f$ will be in the range 0.15 to ~4. Use your
4574  * favorite plotting program to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is
4575  * in this range!
4576  *
4577  * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
4578  * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
4579  * then only use even powers. This is another minor optimization, since
4580  * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
4581  * the vector between the two atoms to get the vectorial force. The
4582  * fastest flops are the ones we can avoid calculating!
4583  *
4584  * So, here's how it should be used:
4585  *
4586  * 1. Calculate \f$r^2\f$.
4587  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
4588  * 3. Evaluate this routine with \f$z^2\f$ as the argument.
4589  * 4. The return value is the expression:
4590  *
4591  * \f[
4592  *    \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
4593  * \f]
4594  *
4595  * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
4596  *
4597  *  \f[
4598  *    \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
4599  *  \f]
4600  *
4601  *    or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
4602  *
4603  *  \f[
4604  *    \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
4605  *  \f]
4606  *
4607  *    With a bit of math exercise you should be able to confirm that
4608  *    this is exactly
4609  *
4610  *  \f[
4611  *   \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
4612  *  \f]
4613  *
4614  * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
4615  *    and you have your force (divided by \f$r\f$). A final multiplication
4616  *    with the vector connecting the two particles and you have your
4617  *    vectorial force to add to the particles.
4618  *
4619  * This approximation achieves an accuracy slightly lower than 1e-6; when
4620  * added to \f$1/r\f$ the error will be insignificant.
4621  *
4622  */
4623 static inline SimdDouble gmx_simdcall pmeForceCorrectionSingleAccuracy(SimdDouble z2)
4624 {
4625     const SimdDouble FN6(-1.7357322914161492954e-8);
4626     const SimdDouble FN5(1.4703624142580877519e-6);
4627     const SimdDouble FN4(-0.000053401640219807709149);
4628     const SimdDouble FN3(0.0010054721316683106153);
4629     const SimdDouble FN2(-0.019278317264888380590);
4630     const SimdDouble FN1(0.069670166153766424023);
4631     const SimdDouble FN0(-0.75225204789749321333);
4632
4633     const SimdDouble FD4(0.0011193462567257629232);
4634     const SimdDouble FD3(0.014866955030185295499);
4635     const SimdDouble FD2(0.11583842382862377919);
4636     const SimdDouble FD1(0.50736591960530292870);
4637     const SimdDouble FD0(1.0);
4638
4639     SimdDouble z4;
4640     SimdDouble polyFN0, polyFN1, polyFD0, polyFD1;
4641
4642     z4 = z2 * z2;
4643
4644     polyFD0 = fma(FD4, z4, FD2);
4645     polyFD1 = fma(FD3, z4, FD1);
4646     polyFD0 = fma(polyFD0, z4, FD0);
4647     polyFD0 = fma(polyFD1, z2, polyFD0);
4648
4649     polyFD0 = invSingleAccuracy(polyFD0);
4650
4651     polyFN0 = fma(FN6, z4, FN4);
4652     polyFN1 = fma(FN5, z4, FN3);
4653     polyFN0 = fma(polyFN0, z4, FN2);
4654     polyFN1 = fma(polyFN1, z4, FN1);
4655     polyFN0 = fma(polyFN0, z4, FN0);
4656     polyFN0 = fma(polyFN1, z2, polyFN0);
4657
4658     return polyFN0 * polyFD0;
4659 }
4660
4661
4662 /*! \brief Analytical PME potential correction, double SIMD data, single accuracy.
4663  *
4664  * \param z2 \f$(r \beta)^2\f$ - see below for details.
4665  * \result Correction factor to coulomb potential - see below for details.
4666  *
4667  * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
4668  * as the input argument.
4669  *
4670  * Here's how it should be used:
4671  *
4672  * 1. Calculate \f$r^2\f$.
4673  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
4674  * 3. Evaluate this routine with z^2 as the argument.
4675  * 4. The return value is the expression:
4676  *
4677  *  \f[
4678  *   \frac{\mbox{erf}(z)}{z}
4679  *  \f]
4680  *
4681  * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
4682  *
4683  *  \f[
4684  *    \frac{\mbox{erf}(r \beta)}{r}
4685  *  \f]
4686  *
4687  * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
4688  *    and you have your potential.
4689  *
4690  * This approximation achieves an accuracy slightly lower than 1e-6; when
4691  * added to \f$1/r\f$ the error will be insignificant.
4692  */
4693 static inline SimdDouble gmx_simdcall pmePotentialCorrectionSingleAccuracy(SimdDouble z2)
4694 {
4695     const SimdDouble VN6(1.9296833005951166339e-8);
4696     const SimdDouble VN5(-1.4213390571557850962e-6);
4697     const SimdDouble VN4(0.000041603292906656984871);
4698     const SimdDouble VN3(-0.00013134036773265025626);
4699     const SimdDouble VN2(0.038657983986041781264);
4700     const SimdDouble VN1(0.11285044772717598220);
4701     const SimdDouble VN0(1.1283802385263030286);
4702
4703     const SimdDouble VD3(0.0066752224023576045451);
4704     const SimdDouble VD2(0.078647795836373922256);
4705     const SimdDouble VD1(0.43336185284710920150);
4706     const SimdDouble VD0(1.0);
4707
4708     SimdDouble z4;
4709     SimdDouble polyVN0, polyVN1, polyVD0, polyVD1;
4710
4711     z4 = z2 * z2;
4712
4713     polyVD1 = fma(VD3, z4, VD1);
4714     polyVD0 = fma(VD2, z4, VD0);
4715     polyVD0 = fma(polyVD1, z2, polyVD0);
4716
4717     polyVD0 = invSingleAccuracy(polyVD0);
4718
4719     polyVN0 = fma(VN6, z4, VN4);
4720     polyVN1 = fma(VN5, z4, VN3);
4721     polyVN0 = fma(polyVN0, z4, VN2);
4722     polyVN1 = fma(polyVN1, z4, VN1);
4723     polyVN0 = fma(polyVN0, z4, VN0);
4724     polyVN0 = fma(polyVN1, z2, polyVN0);
4725
4726     return polyVN0 * polyVD0;
4727 }
4728
4729 #    endif
4730
4731
4732 /*! \name SIMD4 math functions
4733  *
4734  * \note Only a subset of the math functions are implemented for SIMD4.
4735  *  \{
4736  */
4737
4738
4739 #    if GMX_SIMD4_HAVE_FLOAT
4740
4741 /*************************************************************************
4742  * SINGLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4743  *************************************************************************/
4744
4745 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 float.
4746  *
4747  * This is a low-level routine that should only be used by SIMD math routine
4748  * that evaluates the inverse square root.
4749  *
4750  *  \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
4751  *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
4752  *  \return   An improved approximation with roughly twice as many bits of accuracy.
4753  */
4754 static inline Simd4Float gmx_simdcall rsqrtIter(Simd4Float lu, Simd4Float x)
4755 {
4756     Simd4Float tmp1 = x * lu;
4757     Simd4Float tmp2 = Simd4Float(-0.5F) * lu;
4758     tmp1            = fma(tmp1, lu, Simd4Float(-3.0F));
4759     return tmp1 * tmp2;
4760 }
4761
4762 /*! \brief Calculate 1/sqrt(x) for SIMD4 float.
4763  *
4764  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4765  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4766  *           For the single precision implementation this is obviously always
4767  *           true for positive values, but for double precision it adds an
4768  *           extra restriction since the first lookup step might have to be
4769  *           performed in single precision on some architectures. Note that the
4770  *           responsibility for checking falls on you - this routine does not
4771  *           check arguments.
4772  *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
4773  */
4774 static inline Simd4Float gmx_simdcall invsqrt(Simd4Float x)
4775 {
4776     Simd4Float lu = rsqrt(x);
4777 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4778     lu = rsqrtIter(lu, x);
4779 #        endif
4780 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4781     lu = rsqrtIter(lu, x);
4782 #        endif
4783 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4784     lu = rsqrtIter(lu, x);
4785 #        endif
4786     return lu;
4787 }
4788
4789
4790 #    endif // GMX_SIMD4_HAVE_FLOAT
4791
4792
4793 #    if GMX_SIMD4_HAVE_DOUBLE
4794 /*************************************************************************
4795  * DOUBLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4796  *************************************************************************/
4797
4798 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 double.
4799  *
4800  * This is a low-level routine that should only be used by SIMD math routine
4801  * that evaluates the inverse square root.
4802  *
4803  *  \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
4804  *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
4805  *  \return   An improved approximation with roughly twice as many bits of accuracy.
4806  */
4807 static inline Simd4Double gmx_simdcall rsqrtIter(Simd4Double lu, Simd4Double x)
4808 {
4809     Simd4Double tmp1 = x * lu;
4810     Simd4Double tmp2 = Simd4Double(-0.5F) * lu;
4811     tmp1             = fma(tmp1, lu, Simd4Double(-3.0F));
4812     return tmp1 * tmp2;
4813 }
4814
4815 /*! \brief Calculate 1/sqrt(x) for SIMD4 double.
4816  *
4817  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4818  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4819  *           For the single precision implementation this is obviously always
4820  *           true for positive values, but for double precision it adds an
4821  *           extra restriction since the first lookup step might have to be
4822  *           performed in single precision on some architectures. Note that the
4823  *           responsibility for checking falls on you - this routine does not
4824  *           check arguments.
4825  *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
4826  */
4827 static inline Simd4Double gmx_simdcall invsqrt(Simd4Double x)
4828 {
4829     Simd4Double lu = rsqrt(x);
4830 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4831     lu = rsqrtIter(lu, x);
4832 #        endif
4833 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4834     lu = rsqrtIter(lu, x);
4835 #        endif
4836 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4837     lu = rsqrtIter(lu, x);
4838 #        endif
4839 #        if (GMX_SIMD_RSQRT_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4840     lu = rsqrtIter(lu, x);
4841 #        endif
4842     return lu;
4843 }
4844
4845
4846 /**********************************************************************
4847  * SIMD4 MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
4848  **********************************************************************/
4849
4850 /*! \brief Calculate 1/sqrt(x) for SIMD4 double, but in single accuracy.
4851  *
4852  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4853  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4854  *           For the single precision implementation this is obviously always
4855  *           true for positive values, but for double precision it adds an
4856  *           extra restriction since the first lookup step might have to be
4857  *           performed in single precision on some architectures. Note that the
4858  *           responsibility for checking falls on you - this routine does not
4859  *           check arguments.
4860  *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
4861  */
4862 static inline Simd4Double gmx_simdcall invsqrtSingleAccuracy(Simd4Double x)
4863 {
4864     Simd4Double lu = rsqrt(x);
4865 #        if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4866     lu = rsqrtIter(lu, x);
4867 #        endif
4868 #        if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4869     lu = rsqrtIter(lu, x);
4870 #        endif
4871 #        if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4872     lu = rsqrtIter(lu, x);
4873 #        endif
4874     return lu;
4875 }
4876
4877
4878 #    endif // GMX_SIMD4_HAVE_DOUBLE
4879
4880 /*! \} */
4881
4882 #    if GMX_SIMD_HAVE_FLOAT
4883 /*! \brief Calculate 1/sqrt(x) for SIMD float, only targeting single accuracy.
4884  *
4885  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4886  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4887  *           For the single precision implementation this is obviously always
4888  *           true for positive values, but for double precision it adds an
4889  *           extra restriction since the first lookup step might have to be
4890  *           performed in single precision on some architectures. Note that the
4891  *           responsibility for checking falls on you - this routine does not
4892  *           check arguments.
4893  *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
4894  */
4895 static inline SimdFloat gmx_simdcall invsqrtSingleAccuracy(SimdFloat x)
4896 {
4897     return invsqrt(x);
4898 }
4899
4900 /*! \brief Calculate 1/sqrt(x) for masked SIMD floats, only targeting single accuracy.
4901  *
4902  *  This routine only evaluates 1/sqrt(x) for elements for which mask is true.
4903  *  Illegal values in the masked-out elements will not lead to
4904  *  floating-point exceptions.
4905  *
4906  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4907  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4908  *           For the single precision implementation this is obviously always
4909  *           true for positive values, but for double precision it adds an
4910  *           extra restriction since the first lookup step might have to be
4911  *           performed in single precision on some architectures. Note that the
4912  *           responsibility for checking falls on you - this routine does not
4913  *           check arguments.
4914  *  \param m Mask
4915  *  \return  1/sqrt(x). Result is undefined if your argument was invalid or
4916  *           entry was not masked, and 0.0 for masked-out entries.
4917  */
4918 static inline SimdFloat maskzInvsqrtSingleAccuracy(SimdFloat x, SimdFBool m)
4919 {
4920     return maskzInvsqrt(x, m);
4921 }
4922
4923 /*! \brief Calculate 1/sqrt(x) for two SIMD floats, only targeting single accuracy.
4924  *
4925  * \param x0  First set of arguments, x0 must be in single range (see below).
4926  * \param x1  Second set of arguments, x1 must be in single range (see below).
4927  * \param[out] out0  Result 1/sqrt(x0)
4928  * \param[out] out1  Result 1/sqrt(x1)
4929  *
4930  *  In particular for double precision we can sometimes calculate square root
4931  *  pairs slightly faster by using single precision until the very last step.
4932  *
4933  * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
4934  *       GMX_FLOAT_MAX, i.e. within the range of single precision.
4935  *       For the single precision implementation this is obviously always
4936  *       true for positive values, but for double precision it adds an
4937  *       extra restriction since the first lookup step might have to be
4938  *       performed in single precision on some architectures. Note that the
4939  *       responsibility for checking falls on you - this routine does not
4940  *       check arguments.
4941  */
4942 static inline void gmx_simdcall invsqrtPairSingleAccuracy(SimdFloat x0, SimdFloat x1, SimdFloat* out0, SimdFloat* out1)
4943 {
4944     return invsqrtPair(x0, x1, out0, out1);
4945 }
4946
4947 /*! \brief Calculate 1/x for SIMD float, only targeting single accuracy.
4948  *
4949  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
4950  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4951  *           For the single precision implementation this is obviously always
4952  *           true for positive values, but for double precision it adds an
4953  *           extra restriction since the first lookup step might have to be
4954  *           performed in single precision on some architectures. Note that the
4955  *           responsibility for checking falls on you - this routine does not
4956  *           check arguments.
4957  *  \return  1/x. Result is undefined if your argument was invalid.
4958  */
4959 static inline SimdFloat gmx_simdcall invSingleAccuracy(SimdFloat x)
4960 {
4961     return inv(x);
4962 }
4963
4964
4965 /*! \brief Calculate 1/x for masked SIMD floats, only targeting single accuracy.
4966  *
4967  *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
4968  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
4969  *           For the single precision implementation this is obviously always
4970  *           true for positive values, but for double precision it adds an
4971  *           extra restriction since the first lookup step might have to be
4972  *           performed in single precision on some architectures. Note that the
4973  *           responsibility for checking falls on you - this routine does not
4974  *           check arguments.
4975  *  \param m Mask
4976  *  \return  1/x for elements where m is true, or 0.0 for masked-out entries.
4977  */
4978 static inline SimdFloat maskzInvSingleAccuracy(SimdFloat x, SimdFBool m)
4979 {
4980     return maskzInv(x, m);
4981 }
4982
4983 /*! \brief Calculate sqrt(x) for SIMD float, always targeting single accuracy.
4984  *
4985  * \copydetails sqrt(SimdFloat)
4986  */
4987 template<MathOptimization opt = MathOptimization::Safe>
4988 static inline SimdFloat gmx_simdcall sqrtSingleAccuracy(SimdFloat x)
4989 {
4990     return sqrt<opt>(x);
4991 }
4992
4993 /*! \brief Calculate cbrt(x) for SIMD float, always targeting single accuracy.
4994  *
4995  * \copydetails cbrt(SimdFloat)
4996  */
4997 static inline SimdFloat gmx_simdcall cbrtSingleAccuracy(SimdFloat x)
4998 {
4999     return cbrt(x);
5000 }
5001
5002 /*! \brief Calculate 1/cbrt(x) for SIMD float, always targeting single accuracy.
5003  *
5004  * \copydetails cbrt(SimdFloat)
5005  */
5006 static inline SimdFloat gmx_simdcall invcbrtSingleAccuracy(SimdFloat x)
5007 {
5008     return invcbrt(x);
5009 }
5010
5011 /*! \brief SIMD float log2(x), only targeting single accuracy. This is the base-2 logarithm.
5012  *
5013  * \param x Argument, should be >0.
5014  * \result The base-2 logarithm of x. Undefined if argument is invalid.
5015  */
5016 static inline SimdFloat gmx_simdcall log2SingleAccuracy(SimdFloat x)
5017 {
5018     return log2(x);
5019 }
5020
5021 /*! \brief SIMD float log(x), only targeting single accuracy. This is the natural logarithm.
5022  *
5023  * \param x Argument, should be >0.
5024  * \result The natural logarithm of x. Undefined if argument is invalid.
5025  */
5026 static inline SimdFloat gmx_simdcall logSingleAccuracy(SimdFloat x)
5027 {
5028     return log(x);
5029 }
5030
5031 /*! \brief SIMD float 2^x, only targeting single accuracy.
5032  *
5033  * \copydetails exp2(SimdFloat)
5034  */
5035 template<MathOptimization opt = MathOptimization::Safe>
5036 static inline SimdFloat gmx_simdcall exp2SingleAccuracy(SimdFloat x)
5037 {
5038     return exp2<opt>(x);
5039 }
5040
5041 /*! \brief SIMD float e^x, only targeting single accuracy.
5042  *
5043  * \copydetails exp(SimdFloat)
5044  */
5045 template<MathOptimization opt = MathOptimization::Safe>
5046 static inline SimdFloat gmx_simdcall expSingleAccuracy(SimdFloat x)
5047 {
5048     return exp<opt>(x);
5049 }
5050
5051 /*! \brief SIMD pow(x,y), only targeting single accuracy.
5052  *
5053  * \copydetails pow(SimdFloat,SimdFloat)
5054  */
5055 template<MathOptimization opt = MathOptimization::Safe>
5056 static inline SimdFloat gmx_simdcall powSingleAccuracy(SimdFloat x, SimdFloat y)
5057 {
5058     return pow<opt>(x, y);
5059 }
5060
5061 /*! \brief SIMD float erf(x), only targeting single accuracy.
5062  *
5063  * \param x The value to calculate erf(x) for.
5064  * \result erf(x)
5065  *
5066  * This routine achieves very close to single precision, but we do not care about
5067  * the last bit or the subnormal result range.
5068  */
5069 static inline SimdFloat gmx_simdcall erfSingleAccuracy(SimdFloat x)
5070 {
5071     return erf(x);
5072 }
5073
5074 /*! \brief SIMD float erfc(x), only targeting single accuracy.
5075  *
5076  * \param x The value to calculate erfc(x) for.
5077  * \result erfc(x)
5078  *
5079  * This routine achieves singleprecision (bar the last bit) over most of the
5080  * input range, but for large arguments where the result is getting close
5081  * to the minimum representable numbers we accept slightly larger errors
5082  * (think results that are in the ballpark of 10^-30) since that is not
5083  * relevant for MD.
5084  */
5085 static inline SimdFloat gmx_simdcall erfcSingleAccuracy(SimdFloat x)
5086 {
5087     return erfc(x);
5088 }
5089
5090 /*! \brief SIMD float sin \& cos, only targeting single accuracy.
5091  *
5092  * \param x The argument to evaluate sin/cos for
5093  * \param[out] sinval Sin(x)
5094  * \param[out] cosval Cos(x)
5095  */
5096 static inline void gmx_simdcall sinCosSingleAccuracy(SimdFloat x, SimdFloat* sinval, SimdFloat* cosval)
5097 {
5098     sincos(x, sinval, cosval);
5099 }
5100
5101 /*! \brief SIMD float sin(x), only targeting single accuracy.
5102  *
5103  * \param x The argument to evaluate sin for
5104  * \result Sin(x)
5105  *
5106  * \attention Do NOT call both sin & cos if you need both results, since each of them
5107  * will then call \ref sincos and waste a factor 2 in performance.
5108  */
5109 static inline SimdFloat gmx_simdcall sinSingleAccuracy(SimdFloat x)
5110 {
5111     return sin(x);
5112 }
5113
5114 /*! \brief SIMD float cos(x), only targeting single accuracy.
5115  *
5116  * \param x The argument to evaluate cos for
5117  * \result Cos(x)
5118  *
5119  * \attention Do NOT call both sin & cos if you need both results, since each of them
5120  * will then call \ref sincos and waste a factor 2 in performance.
5121  */
5122 static inline SimdFloat gmx_simdcall cosSingleAccuracy(SimdFloat x)
5123 {
5124     return cos(x);
5125 }
5126
5127 /*! \brief SIMD float tan(x), only targeting single accuracy.
5128  *
5129  * \param x The argument to evaluate tan for
5130  * \result Tan(x)
5131  */
5132 static inline SimdFloat gmx_simdcall tanSingleAccuracy(SimdFloat x)
5133 {
5134     return tan(x);
5135 }
5136
5137 /*! \brief SIMD float asin(x), only targeting single accuracy.
5138  *
5139  * \param x The argument to evaluate asin for
5140  * \result Asin(x)
5141  */
5142 static inline SimdFloat gmx_simdcall asinSingleAccuracy(SimdFloat x)
5143 {
5144     return asin(x);
5145 }
5146
5147 /*! \brief SIMD float acos(x), only targeting single accuracy.
5148  *
5149  * \param x The argument to evaluate acos for
5150  * \result Acos(x)
5151  */
5152 static inline SimdFloat gmx_simdcall acosSingleAccuracy(SimdFloat x)
5153 {
5154     return acos(x);
5155 }
5156
5157 /*! \brief SIMD float atan(x), only targeting single accuracy.
5158  *
5159  * \param x The argument to evaluate atan for
5160  * \result Atan(x), same argument/value range as standard math library.
5161  */
5162 static inline SimdFloat gmx_simdcall atanSingleAccuracy(SimdFloat x)
5163 {
5164     return atan(x);
5165 }
5166
5167 /*! \brief SIMD float atan2(y,x), only targeting single accuracy.
5168  *
5169  * \param y Y component of vector, any quartile
5170  * \param x X component of vector, any quartile
5171  * \result Atan(y,x), same argument/value range as standard math library.
5172  *
5173  * \note This routine should provide correct results for all finite
5174  * non-zero or positive-zero arguments. However, negative zero arguments will
5175  * be treated as positive zero, which means the return value will deviate from
5176  * the standard math library atan2(y,x) for those cases. That should not be
5177  * of any concern in Gromacs, and in particular it will not affect calculations
5178  * of angles from vectors.
5179  */
5180 static inline SimdFloat gmx_simdcall atan2SingleAccuracy(SimdFloat y, SimdFloat x)
5181 {
5182     return atan2(y, x);
5183 }
5184
5185 /*! \brief SIMD Analytic PME force correction, only targeting single accuracy.
5186  *
5187  * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
5188  * \result Correction factor to coulomb force.
5189  */
5190 static inline SimdFloat gmx_simdcall pmeForceCorrectionSingleAccuracy(SimdFloat z2)
5191 {
5192     return pmeForceCorrection(z2);
5193 }
5194
5195 /*! \brief SIMD Analytic PME potential correction, only targeting single accuracy.
5196  *
5197  * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
5198  * \result Correction factor to coulomb force.
5199  */
5200 static inline SimdFloat gmx_simdcall pmePotentialCorrectionSingleAccuracy(SimdFloat z2)
5201 {
5202     return pmePotentialCorrection(z2);
5203 }
5204 #    endif // GMX_SIMD_HAVE_FLOAT
5205
5206 #    if GMX_SIMD4_HAVE_FLOAT
5207 /*! \brief Calculate 1/sqrt(x) for SIMD4 float, only targeting single accuracy.
5208  *
5209  *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
5210  *           GMX_FLOAT_MAX, i.e. within the range of single precision.
5211  *           For the single precision implementation this is obviously always
5212  *           true for positive values, but for double precision it adds an
5213  *           extra restriction since the first lookup step might have to be
5214  *           performed in single precision on some architectures. Note that the
5215  *           responsibility for checking falls on you - this routine does not
5216  *           check arguments.
5217  *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
5218  */
5219 static inline Simd4Float gmx_simdcall invsqrtSingleAccuracy(Simd4Float x)
5220 {
5221     return invsqrt(x);
5222 }
5223 #    endif // GMX_SIMD4_HAVE_FLOAT
5224
5225 /*! \}   end of addtogroup module_simd */
5226 /*! \endcond  end of condition libabl */
5227
5228 #endif // GMX_SIMD
5229
5230 } // namespace gmx
5231
5232 #endif // GMX_SIMD_SIMD_MATH_H