src/gromacs/simd/simd_math.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef GMX_SIMD_SIMD_MATH_H_
  36 #define GMX_SIMD_SIMD_MATH_H_
  37
  38 /*! \libinternal \file
  39  *
  40  * \brief Math functions for SIMD datatypes.
  41  *
  42  * \attention This file is generic for all SIMD architectures, so you cannot
  43  * assume that any of the optional SIMD features (as defined in simd.h) are
  44  * present. In particular, this means you cannot assume support for integers,
  45  * logical operations (neither on floating-point nor integer values), shifts,
  46  * and the architecture might only have SIMD for either float or double.
  47  * Second, to keep this file clean and general, any additions to this file
  48  * must work for all possible SIMD architectures in both single and double
  49  * precision (if they support it), and you cannot make any assumptions about
  50  * SIMD width.
  51  *
  52  * \author Erik Lindahl <erik.lindahl@scilifelab.se>
  53  *
  54  * \inlibraryapi
  55  * \ingroup module_simd
  56  */
  57
  58 #include <math.h>
  59
  60 #include "gromacs/math/utilities.h"
  61 #include "gromacs/simd/simd.h"
  62
  63 #include "config.h"
  64
  65 /*! \cond libapi */
  66 /*! \addtogroup module_simd */
  67 /*! \{ */
  68
  69 /*! \name Implementation accuracy settings
  70  *  \{
  71  */
  72
   /*! \brief Target accuracy, in bits, for single precision SIMD math.
    *
    * We accept lsb errors for 1/sqrt(x) and 1/x, so the float target is 22 bits.
    * gmx_simd_invsqrt_f() and gmx_simd_inv_f() compare this value against
    * GMX_SIMD_RSQRT_BITS and GMX_SIMD_RCP_BITS at preprocessing time to decide
    * how many Newton-Raphson iterations to apply to the lookup result.
    */
   #define GMX_SIMD_MATH_TARGET_SINGLE_BITS 22

   /*! \brief Target accuracy, in bits, for double precision SIMD math.
    *
    * We accept "double" that has 2x single precision - 44 bits.
    * This way two Newton-Raphson iterations will suffice in double precision.
    */
   #define GMX_SIMD_MATH_TARGET_DOUBLE_BITS 44
  81
  82 /*! \} */
  83
  84 #ifdef GMX_SIMD_HAVE_FLOAT
  85
  86 /*! \name Single precision SIMD math functions
  87  *
  88  *  \note In most cases you should use the real-precision functions instead.
  89  *  \{
  90  */
  91
  92 /****************************************
  93  * SINGLE PRECISION SIMD MATH FUNCTIONS *
  94  ****************************************/
  95
  96 /*! \brief SIMD float utility to sum a+b+c+d.
  97  *
  98  * You should normally call the real-precision routine \ref gmx_simd_sum4_r.
  99  *
 100  * \param a term 1 (multiple values)
 101  * \param b term 2 (multiple values)
 102  * \param c term 3 (multiple values)
 103  * \param d term 4 (multiple values)
 104  * \return sum of terms 1-4 (multiple values)
 105  */
 106 static gmx_inline gmx_simd_float_t gmx_simdcall
 107 gmx_simd_sum4_f(gmx_simd_float_t a, gmx_simd_float_t b,
 108                 gmx_simd_float_t c, gmx_simd_float_t d)
 109 {
 110     return gmx_simd_add_f(gmx_simd_add_f(a, b), gmx_simd_add_f(c, d));
 111 }
 112
 113 /*! \brief Return -a if b is negative, SIMD float.
 114  *
 115  * You should normally call the real-precision routine \ref gmx_simd_xor_sign_r.
 116  *
 117  * \param a Values to set sign for
 118  * \param b Values used to set sign
 119  * \return if b is negative, the sign of a will be changed.
 120  *
 121  * This is equivalent to doing an xor operation on a with the sign bit of b,
 122  * with the exception that negative zero is not considered to be negative
 123  * on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
 124  */
 125 static gmx_inline gmx_simd_float_t gmx_simdcall
 126 gmx_simd_xor_sign_f(gmx_simd_float_t a, gmx_simd_float_t b)
 127 {
 128 #ifdef GMX_SIMD_HAVE_LOGICAL
 129     return gmx_simd_xor_f(a, gmx_simd_and_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), b));
 130 #else
 131     return gmx_simd_blendv_f(a, gmx_simd_fneg_f(a), gmx_simd_cmplt_f(b, gmx_simd_setzero_f()));
 132 #endif
 133 }
 134
 135 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD float.
 136  *
 137  * This is a low-level routine that should only be used by SIMD math routine
 138  * that evaluates the inverse square root.
 139  *
 140  *  \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
 141  *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
 142  *  \return   An improved approximation with roughly twice as many bits of accuracy.
 143  */
 144 static gmx_inline gmx_simd_float_t gmx_simdcall
 145 gmx_simd_rsqrt_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
 146 {
 147 #    ifdef GMX_SIMD_HAVE_FMA
 148     return gmx_simd_fmadd_f(gmx_simd_fnmadd_f(x, gmx_simd_mul_f(lu, lu), gmx_simd_set1_f(1.0f)), gmx_simd_mul_f(lu, gmx_simd_set1_f(0.5f)), lu);
 149 #    else
 150     return gmx_simd_mul_f(gmx_simd_set1_f(0.5f), gmx_simd_mul_f(gmx_simd_sub_f(gmx_simd_set1_f(3.0f), gmx_simd_mul_f(gmx_simd_mul_f(lu, lu), x)), lu));
 151 #    endif
 152 }
 153
 154 /*! \brief Calculate 1/sqrt(x) for SIMD float.
 155  *
 156  * You should normally call the real-precision routine \ref gmx_simd_invsqrt_r.
 157  *
 158  *  \param x Argument that must be >0. This routine does not check arguments.
 159  *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
 160  */
 161 static gmx_inline gmx_simd_float_t gmx_simdcall
 162 gmx_simd_invsqrt_f(gmx_simd_float_t x)
 163 {
 164     gmx_simd_float_t lu = gmx_simd_rsqrt_f(x);
 165 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
 166     lu = gmx_simd_rsqrt_iter_f(lu, x);
 167 #endif
 168 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
 169     lu = gmx_simd_rsqrt_iter_f(lu, x);
 170 #endif
 171 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
 172     lu = gmx_simd_rsqrt_iter_f(lu, x);
 173 #endif
 174     return lu;
 175 }
 176
 177 /*! \brief Calculate 1/sqrt(x) for two SIMD floats.
 178  *
 179  * You should normally call the real-precision routine \ref gmx_simd_invsqrt_pair_r.
 180  *
 181  * \param x0  First set of arguments, x0 must be positive - no argument checking.
 182  * \param x1  Second set of arguments, x1 must be positive - no argument checking.
 183  * \param[out] out0  Result 1/sqrt(x0)
 184  * \param[out] out1  Result 1/sqrt(x1)
 185  *
 186  *  In particular for double precision we can sometimes calculate square root
 187  *  pairs slightly faster by using single precision until the very last step.
 188  */
 189 static gmx_inline void gmx_simdcall
 190 gmx_simd_invsqrt_pair_f(gmx_simd_float_t x0,    gmx_simd_float_t x1,
 191                         gmx_simd_float_t *out0, gmx_simd_float_t *out1)
 192 {
 193     *out0 = gmx_simd_invsqrt_f(x0);
 194     *out1 = gmx_simd_invsqrt_f(x1);
 195 }
 196
 197 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD float.
 198  *
 199  * This is a low-level routine that should only be used by SIMD math routine
 200  * that evaluates the reciprocal.
 201  *
 202  *  \param lu Approximation of 1/x, typically obtained from lookup.
 203  *  \param x  The reference (starting) value x for which we want 1/x.
 204  *  \return   An improved approximation with roughly twice as many bits of accuracy.
 205  */
 206 static gmx_inline gmx_simd_float_t gmx_simdcall
 207 gmx_simd_rcp_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
 208 {
 209     return gmx_simd_mul_f(lu, gmx_simd_fnmadd_f(lu, x, gmx_simd_set1_f(2.0f)));
 210 }
 211
 212 /*! \brief Calculate 1/x for SIMD float.
 213  *
 214  * You should normally call the real-precision routine \ref gmx_simd_inv_r.
 215  *
 216  *  \param x Argument that must be nonzero. This routine does not check arguments.
 217  *  \return 1/x. Result is undefined if your argument was invalid.
 218  */
 219 static gmx_inline gmx_simd_float_t gmx_simdcall
 220 gmx_simd_inv_f(gmx_simd_float_t x)
 221 {
 222     gmx_simd_float_t lu = gmx_simd_rcp_f(x);
 223 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
 224     lu = gmx_simd_rcp_iter_f(lu, x);
 225 #endif
 226 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
 227     lu = gmx_simd_rcp_iter_f(lu, x);
 228 #endif
 229 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
 230     lu = gmx_simd_rcp_iter_f(lu, x);
 231 #endif
 232     return lu;
 233 }
 234
 235 /*! \brief Calculate sqrt(x) correctly for SIMD floats, including argument 0.0.
 236  *
 237  * You should normally call the real-precision routine \ref gmx_simd_sqrt_r.
 238  *
 239  *  \param x Argument that must be >=0.
 240  *  \return sqrt(x). If x=0, the result will correctly be set to 0.
 241  *          The result is undefined if the input value is negative.
 242  */
 243 static gmx_inline gmx_simd_float_t gmx_simdcall
 244 gmx_simd_sqrt_f(gmx_simd_float_t x)
 245 {
 246     gmx_simd_fbool_t  mask;
 247     gmx_simd_float_t  res;
 248
 249     mask = gmx_simd_cmpeq_f(x, gmx_simd_setzero_f());
 250     res  = gmx_simd_blendnotzero_f(gmx_simd_invsqrt_f(x), mask);
 251     return gmx_simd_mul_f(res, x);
 252 }
 253
 254 /*! \brief SIMD float log(x). This is the natural logarithm.
 255  *
 256  * You should normally call the real-precision routine \ref gmx_simd_log_r.
 257  *
 258  * \param x Argument, should be >0.
 259  * \result The natural logarithm of x. Undefined if argument is invalid.
 260  */
 261 #ifndef gmx_simd_log_f
 262 static gmx_inline gmx_simd_float_t gmx_simdcall
 263 gmx_simd_log_f(gmx_simd_float_t x)
 264 {
 265     const gmx_simd_float_t  half       = gmx_simd_set1_f(0.5f);
 266     const gmx_simd_float_t  one        = gmx_simd_set1_f(1.0f);
 267     const gmx_simd_float_t  sqrt2      = gmx_simd_set1_f(sqrt(2.0f));
 268     const gmx_simd_float_t  corr       = gmx_simd_set1_f(0.693147180559945286226764f);
 269     const gmx_simd_float_t  CL9        = gmx_simd_set1_f(0.2371599674224853515625f);
 270     const gmx_simd_float_t  CL7        = gmx_simd_set1_f(0.285279005765914916992188f);
 271     const gmx_simd_float_t  CL5        = gmx_simd_set1_f(0.400005519390106201171875f);
 272     const gmx_simd_float_t  CL3        = gmx_simd_set1_f(0.666666567325592041015625f);
 273     const gmx_simd_float_t  CL1        = gmx_simd_set1_f(2.0f);
 274     gmx_simd_float_t        fexp, x2, p;
 275     gmx_simd_fbool_t        mask;
 276
 277     fexp  = gmx_simd_get_exponent_f(x);
 278     x     = gmx_simd_get_mantissa_f(x);
 279
 280     mask  = gmx_simd_cmplt_f(sqrt2, x);
 281     /* Adjust to non-IEEE format for x>sqrt(2): exponent += 1, mantissa *= 0.5 */
 282     fexp  = gmx_simd_add_f(fexp, gmx_simd_blendzero_f(one, mask));
 283     x     = gmx_simd_mul_f(x, gmx_simd_blendv_f(one, half, mask));
 284
 285     x     = gmx_simd_mul_f( gmx_simd_sub_f(x, one), gmx_simd_inv_f( gmx_simd_add_f(x, one) ) );
 286     x2    = gmx_simd_mul_f(x, x);
 287
 288     p     = gmx_simd_fmadd_f(CL9, x2, CL7);
 289     p     = gmx_simd_fmadd_f(p, x2, CL5);
 290     p     = gmx_simd_fmadd_f(p, x2, CL3);
 291     p     = gmx_simd_fmadd_f(p, x2, CL1);
 292     p     = gmx_simd_fmadd_f(p, x, gmx_simd_mul_f(corr, fexp));
 293
 294     return p;
 295 }
 296 #endif
 297
 298 #ifndef gmx_simd_exp2_f
 299 /*! \brief SIMD float 2^x.
 300  *
 301  * You should normally call the real-precision routine \ref gmx_simd_exp2_r.
 302  *
 303  * \param x Argument.
 304  * \result 2^x. Undefined if input argument caused overflow.
 305  */
 306 static gmx_inline gmx_simd_float_t gmx_simdcall
 307 gmx_simd_exp2_f(gmx_simd_float_t x)
 308 {
 309     /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
 310     const gmx_simd_float_t  arglimit = gmx_simd_set1_f(126.0f);
 311     const gmx_simd_float_t  CC6      = gmx_simd_set1_f(0.0001534581200287996416911311);
 312     const gmx_simd_float_t  CC5      = gmx_simd_set1_f(0.001339993121934088894618990);
 313     const gmx_simd_float_t  CC4      = gmx_simd_set1_f(0.009618488957115180159497841);
 314     const gmx_simd_float_t  CC3      = gmx_simd_set1_f(0.05550328776964726865751735);
 315     const gmx_simd_float_t  CC2      = gmx_simd_set1_f(0.2402264689063408646490722);
 316     const gmx_simd_float_t  CC1      = gmx_simd_set1_f(0.6931472057372680777553816);
 317     const gmx_simd_float_t  one      = gmx_simd_set1_f(1.0f);
 318
 319     gmx_simd_float_t        fexppart;
 320     gmx_simd_float_t        intpart;
 321     gmx_simd_float_t        p;
 322     gmx_simd_fbool_t        valuemask;
 323
 324     fexppart  = gmx_simd_set_exponent_f(x);
 325     intpart   = gmx_simd_round_f(x);
 326     valuemask = gmx_simd_cmple_f(gmx_simd_fabs_f(x), arglimit);
 327     fexppart  = gmx_simd_blendzero_f(fexppart, valuemask);
 328     x         = gmx_simd_sub_f(x, intpart);
 329
 330     p         = gmx_simd_fmadd_f(CC6, x, CC5);
 331     p         = gmx_simd_fmadd_f(p, x, CC4);
 332     p         = gmx_simd_fmadd_f(p, x, CC3);
 333     p         = gmx_simd_fmadd_f(p, x, CC2);
 334     p         = gmx_simd_fmadd_f(p, x, CC1);
 335     p         = gmx_simd_fmadd_f(p, x, one);
 336     x         = gmx_simd_mul_f(p, fexppart);
 337     return x;
 338 }
 339 #endif
 340
 341 #ifndef gmx_simd_exp_f
 342 /*! \brief SIMD float exp(x).
 343  *
 344  * You should normally call the real-precision routine \ref gmx_simd_exp_r.
 345  *
 346  * In addition to scaling the argument for 2^x this routine correctly does
 347  * extended precision arithmetics to improve accuracy.
 348  *
 349  * \param x Argument.
 350  * \result exp(x). Undefined if input argument caused overflow,
 351  * which can happen if abs(x) \> 7e13.
 352  */
 353 static gmx_inline gmx_simd_float_t gmx_simdcall
 354 gmx_simd_exp_f(gmx_simd_float_t x)
 355 {
 356     const gmx_simd_float_t  argscale     = gmx_simd_set1_f(1.44269504088896341f);
 357     /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
 358     const gmx_simd_float_t  arglimit     = gmx_simd_set1_f(126.0f);
 359     const gmx_simd_float_t  invargscale0 = gmx_simd_set1_f(0.693145751953125f);
 360     const gmx_simd_float_t  invargscale1 = gmx_simd_set1_f(1.428606765330187045e-06f);
 361     const gmx_simd_float_t  CC4          = gmx_simd_set1_f(0.00136324646882712841033936f);
 362     const gmx_simd_float_t  CC3          = gmx_simd_set1_f(0.00836596917361021041870117f);
 363     const gmx_simd_float_t  CC2          = gmx_simd_set1_f(0.0416710823774337768554688f);
 364     const gmx_simd_float_t  CC1          = gmx_simd_set1_f(0.166665524244308471679688f);
 365     const gmx_simd_float_t  CC0          = gmx_simd_set1_f(0.499999850988388061523438f);
 366     const gmx_simd_float_t  one          = gmx_simd_set1_f(1.0f);
 367     gmx_simd_float_t        fexppart;
 368     gmx_simd_float_t        intpart;
 369     gmx_simd_float_t        y, p;
 370     gmx_simd_fbool_t        valuemask;
 371
 372     y         = gmx_simd_mul_f(x, argscale);
 373     fexppart  = gmx_simd_set_exponent_f(y);  /* rounds to nearest int internally */
 374     intpart   = gmx_simd_round_f(y);         /* use same rounding algorithm here */
 375     valuemask = gmx_simd_cmple_f(gmx_simd_fabs_f(y), arglimit);
 376     fexppart  = gmx_simd_blendzero_f(fexppart, valuemask);
 377
 378     /* Extended precision arithmetics */
 379     x         = gmx_simd_fnmadd_f(invargscale0, intpart, x);
 380     x         = gmx_simd_fnmadd_f(invargscale1, intpart, x);
 381
 382     p         = gmx_simd_fmadd_f(CC4, x, CC3);
 383     p         = gmx_simd_fmadd_f(p, x, CC2);
 384     p         = gmx_simd_fmadd_f(p, x, CC1);
 385     p         = gmx_simd_fmadd_f(p, x, CC0);
 386     p         = gmx_simd_fmadd_f(gmx_simd_mul_f(x, x), p, x);
 387     p         = gmx_simd_add_f(p, one);
 388     x         = gmx_simd_mul_f(p, fexppart);
 389     return x;
 390 }
 391 #endif
 392
 393 /*! \brief SIMD float erf(x).
 394  *
 395  * You should normally call the real-precision routine \ref gmx_simd_erf_r.
 396  *
 397  * \param x The value to calculate erf(x) for.
 398  * \result erf(x)
 399  *
 400  * This routine achieves very close to full precision, but we do not care about
 401  * the last bit or the subnormal result range.
 402  */
 403 static gmx_inline gmx_simd_float_t gmx_simdcall
 404 gmx_simd_erf_f(gmx_simd_float_t x)
 405 {
 406     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 407     const gmx_simd_float_t  CA6      = gmx_simd_set1_f(7.853861353153693e-5f);
 408     const gmx_simd_float_t  CA5      = gmx_simd_set1_f(-8.010193625184903e-4f);
 409     const gmx_simd_float_t  CA4      = gmx_simd_set1_f(5.188327685732524e-3f);
 410     const gmx_simd_float_t  CA3      = gmx_simd_set1_f(-2.685381193529856e-2f);
 411     const gmx_simd_float_t  CA2      = gmx_simd_set1_f(1.128358514861418e-1f);
 412     const gmx_simd_float_t  CA1      = gmx_simd_set1_f(-3.761262582423300e-1f);
 413     const gmx_simd_float_t  CA0      = gmx_simd_set1_f(1.128379165726710f);
 414     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 415     const gmx_simd_float_t  CB9      = gmx_simd_set1_f(-0.0018629930017603923f);
 416     const gmx_simd_float_t  CB8      = gmx_simd_set1_f(0.003909821287598495f);
 417     const gmx_simd_float_t  CB7      = gmx_simd_set1_f(-0.0052094582210355615f);
 418     const gmx_simd_float_t  CB6      = gmx_simd_set1_f(0.005685614362160572f);
 419     const gmx_simd_float_t  CB5      = gmx_simd_set1_f(-0.0025367682853477272f);
 420     const gmx_simd_float_t  CB4      = gmx_simd_set1_f(-0.010199799682318782f);
 421     const gmx_simd_float_t  CB3      = gmx_simd_set1_f(0.04369575504816542f);
 422     const gmx_simd_float_t  CB2      = gmx_simd_set1_f(-0.11884063474674492f);
 423     const gmx_simd_float_t  CB1      = gmx_simd_set1_f(0.2732120154030589f);
 424     const gmx_simd_float_t  CB0      = gmx_simd_set1_f(0.42758357702025784f);
 425     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 426     const gmx_simd_float_t  CC10     = gmx_simd_set1_f(-0.0445555913112064f);
 427     const gmx_simd_float_t  CC9      = gmx_simd_set1_f(0.21376355144663348f);
 428     const gmx_simd_float_t  CC8      = gmx_simd_set1_f(-0.3473187200259257f);
 429     const gmx_simd_float_t  CC7      = gmx_simd_set1_f(0.016690861551248114f);
 430     const gmx_simd_float_t  CC6      = gmx_simd_set1_f(0.7560973182491192f);
 431     const gmx_simd_float_t  CC5      = gmx_simd_set1_f(-1.2137903600145787f);
 432     const gmx_simd_float_t  CC4      = gmx_simd_set1_f(0.8411872321232948f);
 433     const gmx_simd_float_t  CC3      = gmx_simd_set1_f(-0.08670413896296343f);
 434     const gmx_simd_float_t  CC2      = gmx_simd_set1_f(-0.27124782687240334f);
 435     const gmx_simd_float_t  CC1      = gmx_simd_set1_f(-0.0007502488047806069f);
 436     const gmx_simd_float_t  CC0      = gmx_simd_set1_f(0.5642114853803148f);
 437     const gmx_simd_float_t  one      = gmx_simd_set1_f(1.0f);
 438     const gmx_simd_float_t  two      = gmx_simd_set1_f(2.0f);
 439
 440     gmx_simd_float_t        x2, x4, y;
 441     gmx_simd_float_t        t, t2, w, w2;
 442     gmx_simd_float_t        pA0, pA1, pB0, pB1, pC0, pC1;
 443     gmx_simd_float_t        expmx2;
 444     gmx_simd_float_t        res_erf, res_erfc, res;
 445     gmx_simd_fbool_t        mask;
 446
 447     /* Calculate erf() */
 448     x2   = gmx_simd_mul_f(x, x);
 449     x4   = gmx_simd_mul_f(x2, x2);
 450
 451     pA0  = gmx_simd_fmadd_f(CA6, x4, CA4);
 452     pA1  = gmx_simd_fmadd_f(CA5, x4, CA3);
 453     pA0  = gmx_simd_fmadd_f(pA0, x4, CA2);
 454     pA1  = gmx_simd_fmadd_f(pA1, x4, CA1);
 455     pA0  = gmx_simd_mul_f(pA0, x4);
 456     pA0  = gmx_simd_fmadd_f(pA1, x2, pA0);
 457     /* Constant term must come last for precision reasons */
 458     pA0  = gmx_simd_add_f(pA0, CA0);
 459
 460     res_erf = gmx_simd_mul_f(x, pA0);
 461
 462     /* Calculate erfc */
 463     y       = gmx_simd_fabs_f(x);
 464     t       = gmx_simd_inv_f(y);
 465     w       = gmx_simd_sub_f(t, one);
 466     t2      = gmx_simd_mul_f(t, t);
 467     w2      = gmx_simd_mul_f(w, w);
 468
 469     /* No need for a floating-point sieve here (as in erfc), since erf()
 470      * will never return values that are extremely small for large args.
 471      */
 472     expmx2  = gmx_simd_exp_f( gmx_simd_fneg_f( gmx_simd_mul_f(y, y)));
 473
 474     pB1  = gmx_simd_fmadd_f(CB9, w2, CB7);
 475     pB0  = gmx_simd_fmadd_f(CB8, w2, CB6);
 476     pB1  = gmx_simd_fmadd_f(pB1, w2, CB5);
 477     pB0  = gmx_simd_fmadd_f(pB0, w2, CB4);
 478     pB1  = gmx_simd_fmadd_f(pB1, w2, CB3);
 479     pB0  = gmx_simd_fmadd_f(pB0, w2, CB2);
 480     pB1  = gmx_simd_fmadd_f(pB1, w2, CB1);
 481     pB0  = gmx_simd_fmadd_f(pB0, w2, CB0);
 482     pB0  = gmx_simd_fmadd_f(pB1, w, pB0);
 483
 484     pC0  = gmx_simd_fmadd_f(CC10, t2, CC8);
 485     pC1  = gmx_simd_fmadd_f(CC9, t2, CC7);
 486     pC0  = gmx_simd_fmadd_f(pC0, t2, CC6);
 487     pC1  = gmx_simd_fmadd_f(pC1, t2, CC5);
 488     pC0  = gmx_simd_fmadd_f(pC0, t2, CC4);
 489     pC1  = gmx_simd_fmadd_f(pC1, t2, CC3);
 490     pC0  = gmx_simd_fmadd_f(pC0, t2, CC2);
 491     pC1  = gmx_simd_fmadd_f(pC1, t2, CC1);
 492
 493     pC0  = gmx_simd_fmadd_f(pC0, t2, CC0);
 494     pC0  = gmx_simd_fmadd_f(pC1, t, pC0);
 495     pC0  = gmx_simd_mul_f(pC0, t);
 496
 497     /* SELECT pB0 or pC0 for erfc() */
 498     mask     = gmx_simd_cmplt_f(two, y);
 499     res_erfc = gmx_simd_blendv_f(pB0, pC0, mask);
 500     res_erfc = gmx_simd_mul_f(res_erfc, expmx2);
 501
 502     /* erfc(x<0) = 2-erfc(|x|) */
 503     mask     = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
 504     res_erfc = gmx_simd_blendv_f(res_erfc, gmx_simd_sub_f(two, res_erfc), mask);
 505
 506     /* Select erf() or erfc() */
 507     mask = gmx_simd_cmplt_f(y, gmx_simd_set1_f(0.75f));
 508     res  = gmx_simd_blendv_f(gmx_simd_sub_f(one, res_erfc), res_erf, mask);
 509
 510     return res;
 511 }
 512
 513 /*! \brief SIMD float erfc(x).
 514  *
 515  * You should normally call the real-precision routine \ref gmx_simd_erfc_r.
 516  *
 517  * \param x The value to calculate erfc(x) for.
 518  * \result erfc(x)
 519  *
 520  * This routine achieves full precision (bar the last bit) over most of the
 521  * input range, but for large arguments where the result is getting close
 522  * to the minimum representable numbers we accept slightly larger errors
 523  * (think results that are in the ballpark of 10^-30 for single precision,
 524  * or 10^-200 for double) since that is not relevant for MD.
 525  */
 526 static gmx_inline gmx_simd_float_t gmx_simdcall
 527 gmx_simd_erfc_f(gmx_simd_float_t x)
 528 {
 529     /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
 530     const gmx_simd_float_t  CA6      = gmx_simd_set1_f(7.853861353153693e-5f);
 531     const gmx_simd_float_t  CA5      = gmx_simd_set1_f(-8.010193625184903e-4f);
 532     const gmx_simd_float_t  CA4      = gmx_simd_set1_f(5.188327685732524e-3f);
 533     const gmx_simd_float_t  CA3      = gmx_simd_set1_f(-2.685381193529856e-2f);
 534     const gmx_simd_float_t  CA2      = gmx_simd_set1_f(1.128358514861418e-1f);
 535     const gmx_simd_float_t  CA1      = gmx_simd_set1_f(-3.761262582423300e-1f);
 536     const gmx_simd_float_t  CA0      = gmx_simd_set1_f(1.128379165726710f);
 537     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
 538     const gmx_simd_float_t  CB9      = gmx_simd_set1_f(-0.0018629930017603923f);
 539     const gmx_simd_float_t  CB8      = gmx_simd_set1_f(0.003909821287598495f);
 540     const gmx_simd_float_t  CB7      = gmx_simd_set1_f(-0.0052094582210355615f);
 541     const gmx_simd_float_t  CB6      = gmx_simd_set1_f(0.005685614362160572f);
 542     const gmx_simd_float_t  CB5      = gmx_simd_set1_f(-0.0025367682853477272f);
 543     const gmx_simd_float_t  CB4      = gmx_simd_set1_f(-0.010199799682318782f);
 544     const gmx_simd_float_t  CB3      = gmx_simd_set1_f(0.04369575504816542f);
 545     const gmx_simd_float_t  CB2      = gmx_simd_set1_f(-0.11884063474674492f);
 546     const gmx_simd_float_t  CB1      = gmx_simd_set1_f(0.2732120154030589f);
 547     const gmx_simd_float_t  CB0      = gmx_simd_set1_f(0.42758357702025784f);
 548     /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
 549     const gmx_simd_float_t  CC10     = gmx_simd_set1_f(-0.0445555913112064f);
 550     const gmx_simd_float_t  CC9      = gmx_simd_set1_f(0.21376355144663348f);
 551     const gmx_simd_float_t  CC8      = gmx_simd_set1_f(-0.3473187200259257f);
 552     const gmx_simd_float_t  CC7      = gmx_simd_set1_f(0.016690861551248114f);
 553     const gmx_simd_float_t  CC6      = gmx_simd_set1_f(0.7560973182491192f);
 554     const gmx_simd_float_t  CC5      = gmx_simd_set1_f(-1.2137903600145787f);
 555     const gmx_simd_float_t  CC4      = gmx_simd_set1_f(0.8411872321232948f);
 556     const gmx_simd_float_t  CC3      = gmx_simd_set1_f(-0.08670413896296343f);
 557     const gmx_simd_float_t  CC2      = gmx_simd_set1_f(-0.27124782687240334f);
 558     const gmx_simd_float_t  CC1      = gmx_simd_set1_f(-0.0007502488047806069f);
 559     const gmx_simd_float_t  CC0      = gmx_simd_set1_f(0.5642114853803148f);
 560     /* Coefficients for expansion of exp(x) in [0,0.1] */
 561     /* CD0 and CD1 are both 1.0, so no need to declare them separately */
 562     const gmx_simd_float_t  CD2      = gmx_simd_set1_f(0.5000066608081202f);
 563     const gmx_simd_float_t  CD3      = gmx_simd_set1_f(0.1664795422874624f);
 564     const gmx_simd_float_t  CD4      = gmx_simd_set1_f(0.04379839977652482f);
 565     const gmx_simd_float_t  one      = gmx_simd_set1_f(1.0f);
 566     const gmx_simd_float_t  two      = gmx_simd_set1_f(2.0f);
 567
 568     /* We need to use a small trick here, since we cannot assume all SIMD
 569      * architectures support integers, and the flag we want (0xfffff000) would
 570      * evaluate to NaN (i.e., it cannot be expressed as a floating-point num).
 571      * Instead, we represent the flags 0xf0f0f000 and 0x0f0f0000 as valid
 572      * fp numbers, and perform a logical or. Since the expression is constant,
 573      * we can at least hope it is evaluated at compile-time.
 574      */
 575 #ifdef GMX_SIMD_HAVE_LOGICAL
 576     const gmx_simd_float_t  sieve    = gmx_simd_or_f(gmx_simd_set1_f(-5.965323564e+29f), gmx_simd_set1_f(7.05044434e-30f));
 577 #else
 578     const int               isieve   = 0xFFFFF000;
 579     float                   mem[GMX_SIMD_REAL_WIDTH*2];
 580     float *                 pmem = gmx_simd_align_f(mem);
 581     union {
 582         float f; int i;
 583     } conv;
 584     int                     i;
 585 #endif
 586
 587     gmx_simd_float_t        x2, x4, y;
 588     gmx_simd_float_t        q, z, t, t2, w, w2;
 589     gmx_simd_float_t        pA0, pA1, pB0, pB1, pC0, pC1;
 590     gmx_simd_float_t        expmx2, corr;
 591     gmx_simd_float_t        res_erf, res_erfc, res;
 592     gmx_simd_fbool_t        mask;
 593
 594     /* Calculate erf() */
 595     x2     = gmx_simd_mul_f(x, x);
 596     x4     = gmx_simd_mul_f(x2, x2);
 597
 598     pA0  = gmx_simd_fmadd_f(CA6, x4, CA4);
 599     pA1  = gmx_simd_fmadd_f(CA5, x4, CA3);
 600     pA0  = gmx_simd_fmadd_f(pA0, x4, CA2);
 601     pA1  = gmx_simd_fmadd_f(pA1, x4, CA1);
 602     pA1  = gmx_simd_mul_f(pA1, x2);
 603     pA0  = gmx_simd_fmadd_f(pA0, x4, pA1);
 604     /* Constant term must come last for precision reasons */
 605     pA0  = gmx_simd_add_f(pA0, CA0);
 606
 607     res_erf = gmx_simd_mul_f(x, pA0);
 608
 609     /* Calculate erfc */
 610     y       = gmx_simd_fabs_f(x);
 611     t       = gmx_simd_inv_f(y);
 612     w       = gmx_simd_sub_f(t, one);
 613     t2      = gmx_simd_mul_f(t, t);
 614     w2      = gmx_simd_mul_f(w, w);
 615     /*
 616      * We cannot simply calculate exp(-y2) directly in single precision, since
 617      * that will lose a couple of bits of precision due to the multiplication.
 618      * Instead, we introduce y=z+w, where the last 12 bits of precision are in w.
 619      * Then we get exp(-y2) = exp(-z2)*exp((z-y)*(z+y)).
 620      *
 621      * The only drawback with this is that it requires TWO separate exponential
 622      * evaluations, which would be horrible performance-wise. However, the argument
 623      * for the second exp() call is always small, so there we simply use a
 624      * low-order minimax expansion on [0,0.1].
 625      *
 626      * However, this neat idea requires support for logical ops (and) on
 627      * FP numbers, which some vendors decided isn't necessary in their SIMD
 628      * instruction sets (Hi, IBM VSX!). In principle we could use some tricks
 629      * in double, but we still need memory as a backup when that is not available,
 630      * and this case is rare enough that we go directly there...
 631      */
 632 #ifdef GMX_SIMD_HAVE_LOGICAL
 633     z       = gmx_simd_and_f(y, sieve);
 634 #else
 635     gmx_simd_store_f(pmem, y);
 636     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
 637     {
 638         conv.f  = pmem[i];
 639         conv.i  = conv.i & isieve;
 640         pmem[i] = conv.f;
 641     }
 642     z = gmx_simd_load_f(pmem);
 643 #endif
 644     q       = gmx_simd_mul_f( gmx_simd_sub_f(z, y), gmx_simd_add_f(z, y) );
 645     corr    = gmx_simd_fmadd_f(CD4, q, CD3);
 646     corr    = gmx_simd_fmadd_f(corr, q, CD2);
 647     corr    = gmx_simd_fmadd_f(corr, q, one);
 648     corr    = gmx_simd_fmadd_f(corr, q, one);
 649
 650     expmx2  = gmx_simd_exp_f( gmx_simd_fneg_f( gmx_simd_mul_f(z, z) ) );
 651     expmx2  = gmx_simd_mul_f(expmx2, corr);
 652
 653     pB1  = gmx_simd_fmadd_f(CB9, w2, CB7);
 654     pB0  = gmx_simd_fmadd_f(CB8, w2, CB6);
 655     pB1  = gmx_simd_fmadd_f(pB1, w2, CB5);
 656     pB0  = gmx_simd_fmadd_f(pB0, w2, CB4);
 657     pB1  = gmx_simd_fmadd_f(pB1, w2, CB3);
 658     pB0  = gmx_simd_fmadd_f(pB0, w2, CB2);
 659     pB1  = gmx_simd_fmadd_f(pB1, w2, CB1);
 660     pB0  = gmx_simd_fmadd_f(pB0, w2, CB0);
 661     pB0  = gmx_simd_fmadd_f(pB1, w, pB0);
 662
 663     pC0  = gmx_simd_fmadd_f(CC10, t2, CC8);
 664     pC1  = gmx_simd_fmadd_f(CC9, t2, CC7);
 665     pC0  = gmx_simd_fmadd_f(pC0, t2, CC6);
 666     pC1  = gmx_simd_fmadd_f(pC1, t2, CC5);
 667     pC0  = gmx_simd_fmadd_f(pC0, t2, CC4);
 668     pC1  = gmx_simd_fmadd_f(pC1, t2, CC3);
 669     pC0  = gmx_simd_fmadd_f(pC0, t2, CC2);
 670     pC1  = gmx_simd_fmadd_f(pC1, t2, CC1);
 671
 672     pC0  = gmx_simd_fmadd_f(pC0, t2, CC0);
 673     pC0  = gmx_simd_fmadd_f(pC1, t, pC0);
 674     pC0  = gmx_simd_mul_f(pC0, t);
 675
 676     /* SELECT pB0 or pC0 for erfc() */
 677     mask     = gmx_simd_cmplt_f(two, y);
 678     res_erfc = gmx_simd_blendv_f(pB0, pC0, mask);
 679     res_erfc = gmx_simd_mul_f(res_erfc, expmx2);
 680
 681     /* erfc(x<0) = 2-erfc(|x|) */
 682     mask     = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
 683     res_erfc = gmx_simd_blendv_f(res_erfc, gmx_simd_sub_f(two, res_erfc), mask);
 684
 685     /* Select erf() or erfc() */
 686     mask = gmx_simd_cmplt_f(y, gmx_simd_set1_f(0.75f));
 687     res  = gmx_simd_blendv_f(res_erfc, gmx_simd_sub_f(one, res_erf), mask);
 688
 689     return res;
 690 }
 691
 692 /*! \brief SIMD float sin \& cos.
 693  *
 694  * You should normally call the real-precision routine \ref gmx_simd_sincos_r.
 695  *
 696  * \param x The argument to evaluate sin/cos for
 697  * \param[out] sinval Sin(x)
 698  * \param[out] cosval Cos(x)
 699  *
 700  * This version achieves close to machine precision, but for very large
 701  * magnitudes of the argument we inherently begin to lose accuracy due to the
 702  * argument reduction, despite using extended precision arithmetics internally.
 703  */
 704 static gmx_inline void gmx_simdcall
 705 gmx_simd_sincos_f(gmx_simd_float_t x, gmx_simd_float_t *sinval, gmx_simd_float_t *cosval)
 706 {
 707     /* Constants to subtract Pi/4*x from y while minimizing precision loss */
 708     const gmx_simd_float_t  argred0         = gmx_simd_set1_f(1.5703125);
 709     const gmx_simd_float_t  argred1         = gmx_simd_set1_f(4.83751296997070312500e-04f);
 710     const gmx_simd_float_t  argred2         = gmx_simd_set1_f(7.54953362047672271729e-08f);
 711     const gmx_simd_float_t  argred3         = gmx_simd_set1_f(2.56334406825708960298e-12f);
 712     const gmx_simd_float_t  two_over_pi     = gmx_simd_set1_f(2.0f/M_PI);
 713     const gmx_simd_float_t  const_sin2      = gmx_simd_set1_f(-1.9515295891e-4f);
 714     const gmx_simd_float_t  const_sin1      = gmx_simd_set1_f( 8.3321608736e-3f);
 715     const gmx_simd_float_t  const_sin0      = gmx_simd_set1_f(-1.6666654611e-1f);
 716     const gmx_simd_float_t  const_cos2      = gmx_simd_set1_f( 2.443315711809948e-5f);
 717     const gmx_simd_float_t  const_cos1      = gmx_simd_set1_f(-1.388731625493765e-3f);
 718     const gmx_simd_float_t  const_cos0      = gmx_simd_set1_f( 4.166664568298827e-2f);
 719     const gmx_simd_float_t  half            = gmx_simd_set1_f(0.5f);
 720     const gmx_simd_float_t  one             = gmx_simd_set1_f(1.0f);
 721     gmx_simd_float_t        ssign, csign;
 722     gmx_simd_float_t        x2, y, z, psin, pcos, sss, ccc;
 723     gmx_simd_fbool_t        mask;
 724 #if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
 725     const gmx_simd_fint32_t ione            = gmx_simd_set1_fi(1);
 726     const gmx_simd_fint32_t itwo            = gmx_simd_set1_fi(2);
 727     gmx_simd_fint32_t       iy;
 728
 729     z       = gmx_simd_mul_f(x, two_over_pi);
 730     iy      = gmx_simd_cvt_f2i(z);
 731     y       = gmx_simd_round_f(z);
 732
 733     mask    = gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, ione), gmx_simd_setzero_fi()));
 734     ssign   = gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, itwo), itwo)));
 735     csign   = gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(gmx_simd_add_fi(iy, ione), itwo), itwo)));
 736 #else
 737     const gmx_simd_float_t  quarter         = gmx_simd_set1_f(0.25f);
 738     const gmx_simd_float_t  minusquarter    = gmx_simd_set1_f(-0.25f);
 739     gmx_simd_float_t        q;
 740     gmx_simd_fbool_t        m1, m2, m3;
 741
 742     /* The most obvious way to find the arguments quadrant in the unit circle
 743      * to calculate the sign is to use integer arithmetic, but that is not
 744      * present in all SIMD implementations. As an alternative, we have devised a
 745      * pure floating-point algorithm that uses truncation for argument reduction
 746      * so that we get a new value 0<=q<1 over the unit circle, and then
 747      * do floating-point comparisons with fractions. This is likely to be
 748      * slightly slower (~10%) due to the longer latencies of floating-point, so
 749      * we only use it when integer SIMD arithmetic is not present.
 750      */
 751     ssign   = x;
 752     x       = gmx_simd_fabs_f(x);
 753     /* It is critical that half-way cases are rounded down */
 754     z       = gmx_simd_fmadd_f(x, two_over_pi, half);
 755     y       = gmx_simd_trunc_f(z);
 756     q       = gmx_simd_mul_f(z, quarter);
 757     q       = gmx_simd_sub_f(q, gmx_simd_trunc_f(q));
 758     /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
 759      * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
 760      * This removes the 2*Pi periodicity without using any integer arithmetic.
 761      * First check if y had the value 2 or 3, set csign if true.
 762      */
 763     q       = gmx_simd_sub_f(q, half);
 764     /* If we have logical operations we can work directly on the signbit, which
 765      * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
 766      * Thus, if you are altering defines to debug alternative code paths, the
 767      * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
 768      * active or inactive - you will get errors if only one is used.
 769      */
 770 #    ifdef GMX_SIMD_HAVE_LOGICAL
 771     ssign   = gmx_simd_and_f(ssign, gmx_simd_set1_f(GMX_FLOAT_NEGZERO));
 772     csign   = gmx_simd_andnot_f(q, gmx_simd_set1_f(GMX_FLOAT_NEGZERO));
 773     ssign   = gmx_simd_xor_f(ssign, csign);
 774 #    else
 775     csign   = gmx_simd_xor_sign_f(gmx_simd_set1_f(-1.0f), q);
 776     // ALT: csign = gmx_simd_fneg_f(gmx_simd_copysign(gmx_simd_set1_f(1.0),q));
 777
 778     ssign   = gmx_simd_xor_sign_f(ssign, csign);    /* swap ssign if csign was set. */
 779 #    endif
 780     /* Check if y had value 1 or 3 (remember we subtracted 0.5 from q) */
 781     m1      = gmx_simd_cmplt_f(q, minusquarter);
 782     m2      = gmx_simd_cmple_f(gmx_simd_setzero_f(), q);
 783     m3      = gmx_simd_cmplt_f(q, quarter);
 784     m2      = gmx_simd_and_fb(m2, m3);
 785     mask    = gmx_simd_or_fb(m1, m2);
 786     /* where mask is FALSE, set sign. */
 787     csign   = gmx_simd_xor_sign_f(csign, gmx_simd_blendv_f(gmx_simd_set1_f(-1.0f), one, mask));
 788 #endif
 789     x       = gmx_simd_fnmadd_f(y, argred0, x);
 790     x       = gmx_simd_fnmadd_f(y, argred1, x);
 791     x       = gmx_simd_fnmadd_f(y, argred2, x);
 792     x       = gmx_simd_fnmadd_f(y, argred3, x);
 793     x2      = gmx_simd_mul_f(x, x);
 794
 795     psin    = gmx_simd_fmadd_f(const_sin2, x2, const_sin1);
 796     psin    = gmx_simd_fmadd_f(psin, x2, const_sin0);
 797     psin    = gmx_simd_fmadd_f(psin, gmx_simd_mul_f(x, x2), x);
 798     pcos    = gmx_simd_fmadd_f(const_cos2, x2, const_cos1);
 799     pcos    = gmx_simd_fmadd_f(pcos, x2, const_cos0);
 800     pcos    = gmx_simd_fmsub_f(pcos, x2, half);
 801     pcos    = gmx_simd_fmadd_f(pcos, x2, one);
 802
 803     sss     = gmx_simd_blendv_f(pcos, psin, mask);
 804     ccc     = gmx_simd_blendv_f(psin, pcos, mask);
 805     /* See comment for GMX_SIMD_HAVE_LOGICAL section above. */
 806 #ifdef GMX_SIMD_HAVE_LOGICAL
 807     *sinval = gmx_simd_xor_f(sss, ssign);
 808     *cosval = gmx_simd_xor_f(ccc, csign);
 809 #else
 810     *sinval = gmx_simd_xor_sign_f(sss, ssign);
 811     *cosval = gmx_simd_xor_sign_f(ccc, csign);
 812 #endif
 813 }
 814
 815 /*! \brief SIMD float sin(x).
 816  *
 817  * You should normally call the real-precision routine \ref gmx_simd_sin_r.
 818  *
 819  * \param x The argument to evaluate sin for
 820  * \result Sin(x)
 821  *
 822  * \attention Do NOT call both sin & cos if you need both results, since each of them
 823  * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
 824  */
 825 static gmx_inline gmx_simd_float_t gmx_simdcall
 826 gmx_simd_sin_f(gmx_simd_float_t x)
 827 {
 828     gmx_simd_float_t s, c;
 829     gmx_simd_sincos_f(x, &s, &c);
 830     return s;
 831 }
 832
 833 /*! \brief SIMD float cos(x).
 834  *
 835  * You should normally call the real-precision routine \ref gmx_simd_cos_r.
 836  *
 837  * \param x The argument to evaluate cos for
 838  * \result Cos(x)
 839  *
 840  * \attention Do NOT call both sin & cos if you need both results, since each of them
 841  * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
 842  */
 843 static gmx_inline gmx_simd_float_t gmx_simdcall
 844 gmx_simd_cos_f(gmx_simd_float_t x)
 845 {
 846     gmx_simd_float_t s, c;
 847     gmx_simd_sincos_f(x, &s, &c);
 848     return c;
 849 }
 850
 851 /*! \brief SIMD float tan(x).
 852  *
 853  * You should normally call the real-precision routine \ref gmx_simd_tan_r.
 854  *
 855  * \param x The argument to evaluate tan for
 856  * \result Tan(x)
 857  */
 858 static gmx_inline gmx_simd_float_t gmx_simdcall
 859 gmx_simd_tan_f(gmx_simd_float_t x)
 860 {
 861     const gmx_simd_float_t  argred0         = gmx_simd_set1_f(1.5703125);
 862     const gmx_simd_float_t  argred1         = gmx_simd_set1_f(4.83751296997070312500e-04f);
 863     const gmx_simd_float_t  argred2         = gmx_simd_set1_f(7.54953362047672271729e-08f);
 864     const gmx_simd_float_t  argred3         = gmx_simd_set1_f(2.56334406825708960298e-12f);
 865     const gmx_simd_float_t  two_over_pi     = gmx_simd_set1_f(2.0f/M_PI);
 866     const gmx_simd_float_t  CT6             = gmx_simd_set1_f(0.009498288995810566122993911);
 867     const gmx_simd_float_t  CT5             = gmx_simd_set1_f(0.002895755790837379295226923);
 868     const gmx_simd_float_t  CT4             = gmx_simd_set1_f(0.02460087336161924491836265);
 869     const gmx_simd_float_t  CT3             = gmx_simd_set1_f(0.05334912882656359828045988);
 870     const gmx_simd_float_t  CT2             = gmx_simd_set1_f(0.1333989091464957704418495);
 871     const gmx_simd_float_t  CT1             = gmx_simd_set1_f(0.3333307599244198227797507);
 872
 873     gmx_simd_float_t        x2, p, y, z;
 874     gmx_simd_fbool_t        mask;
 875
 876 #if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
 877     gmx_simd_fint32_t  iy;
 878     gmx_simd_fint32_t  ione = gmx_simd_set1_fi(1);
 879
 880     z       = gmx_simd_mul_f(x, two_over_pi);
 881     iy      = gmx_simd_cvt_f2i(z);
 882     y       = gmx_simd_round_f(z);
 883     mask    = gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, ione), ione));
 884
 885     x       = gmx_simd_fnmadd_f(y, argred0, x);
 886     x       = gmx_simd_fnmadd_f(y, argred1, x);
 887     x       = gmx_simd_fnmadd_f(y, argred2, x);
 888     x       = gmx_simd_fnmadd_f(y, argred3, x);
 889     x       = gmx_simd_xor_f(gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), mask), x);
 890 #else
 891     const gmx_simd_float_t  quarter         = gmx_simd_set1_f(0.25f);
 892     const gmx_simd_float_t  half            = gmx_simd_set1_f(0.5f);
 893     const gmx_simd_float_t  threequarter    = gmx_simd_set1_f(0.75f);
 894     gmx_simd_float_t        w, q;
 895     gmx_simd_fbool_t        m1, m2, m3;
 896
 897     w       = gmx_simd_fabs_f(x);
 898     z       = gmx_simd_fmadd_f(w, two_over_pi, half);
 899     y       = gmx_simd_trunc_f(z);
 900     q       = gmx_simd_mul_f(z, quarter);
 901     q       = gmx_simd_sub_f(q, gmx_simd_trunc_f(q));
 902     m1      = gmx_simd_cmple_f(quarter, q);
 903     m2      = gmx_simd_cmplt_f(q, half);
 904     m3      = gmx_simd_cmple_f(threequarter, q);
 905     m1      = gmx_simd_and_fb(m1, m2);
 906     mask    = gmx_simd_or_fb(m1, m3);
 907     w       = gmx_simd_fnmadd_f(y, argred0, w);
 908     w       = gmx_simd_fnmadd_f(y, argred1, w);
 909     w       = gmx_simd_fnmadd_f(y, argred2, w);
 910     w       = gmx_simd_fnmadd_f(y, argred3, w);
 911
 912     w       = gmx_simd_blendv_f(w, gmx_simd_fneg_f(w), mask);
 913     x       = gmx_simd_xor_sign_f(w, x);
 914 #endif
 915     x2      = gmx_simd_mul_f(x, x);
 916     p       = gmx_simd_fmadd_f(CT6, x2, CT5);
 917     p       = gmx_simd_fmadd_f(p, x2, CT4);
 918     p       = gmx_simd_fmadd_f(p, x2, CT3);
 919     p       = gmx_simd_fmadd_f(p, x2, CT2);
 920     p       = gmx_simd_fmadd_f(p, x2, CT1);
 921     p       = gmx_simd_fmadd_f(x2, gmx_simd_mul_f(p, x), x);
 922
 923     p       = gmx_simd_blendv_f( p, gmx_simd_inv_f(p), mask);
 924     return p;
 925 }
 926
 927 /*! \brief SIMD float asin(x).
 928  *
 929  * You should normally call the real-precision routine \ref gmx_simd_asin_r.
 930  *
 931  * \param x The argument to evaluate asin for
 932  * \result Asin(x)
 933  */
 934 static gmx_inline gmx_simd_float_t gmx_simdcall
 935 gmx_simd_asin_f(gmx_simd_float_t x)
 936 {
 937     const gmx_simd_float_t limitlow   = gmx_simd_set1_f(1e-4f);
 938     const gmx_simd_float_t half       = gmx_simd_set1_f(0.5f);
 939     const gmx_simd_float_t one        = gmx_simd_set1_f(1.0f);
 940     const gmx_simd_float_t halfpi     = gmx_simd_set1_f((float)M_PI/2.0f);
 941     const gmx_simd_float_t CC5        = gmx_simd_set1_f(4.2163199048E-2f);
 942     const gmx_simd_float_t CC4        = gmx_simd_set1_f(2.4181311049E-2f);
 943     const gmx_simd_float_t CC3        = gmx_simd_set1_f(4.5470025998E-2f);
 944     const gmx_simd_float_t CC2        = gmx_simd_set1_f(7.4953002686E-2f);
 945     const gmx_simd_float_t CC1        = gmx_simd_set1_f(1.6666752422E-1f);
 946     gmx_simd_float_t       xabs;
 947     gmx_simd_float_t       z, z1, z2, q, q1, q2;
 948     gmx_simd_float_t       pA, pB;
 949     gmx_simd_fbool_t       mask;
 950
 951     xabs  = gmx_simd_fabs_f(x);
 952     mask  = gmx_simd_cmplt_f(half, xabs);
 953     z1    = gmx_simd_mul_f(half, gmx_simd_sub_f(one, xabs));
 954     q1    = gmx_simd_mul_f(z1, gmx_simd_invsqrt_f(z1));
 955     q1    = gmx_simd_blendnotzero_f(q1, gmx_simd_cmpeq_f(xabs, one));
 956     q2    = xabs;
 957     z2    = gmx_simd_mul_f(q2, q2);
 958     z     = gmx_simd_blendv_f(z2, z1, mask);
 959     q     = gmx_simd_blendv_f(q2, q1, mask);
 960
 961     z2    = gmx_simd_mul_f(z, z);
 962     pA    = gmx_simd_fmadd_f(CC5, z2, CC3);
 963     pB    = gmx_simd_fmadd_f(CC4, z2, CC2);
 964     pA    = gmx_simd_fmadd_f(pA, z2, CC1);
 965     pA    = gmx_simd_mul_f(pA, z);
 966     z     = gmx_simd_fmadd_f(pB, z2, pA);
 967     z     = gmx_simd_fmadd_f(z, q, q);
 968     q2    = gmx_simd_sub_f(halfpi, z);
 969     q2    = gmx_simd_sub_f(q2, z);
 970     z     = gmx_simd_blendv_f(z, q2, mask);
 971
 972     mask  = gmx_simd_cmplt_f(limitlow, xabs);
 973     z     = gmx_simd_blendv_f( xabs, z, mask );
 974     z     = gmx_simd_xor_sign_f(z, x);
 975
 976     return z;
 977 }
 978
 979 /*! \brief SIMD float acos(x).
 980  *
 981  * You should normally call the real-precision routine \ref gmx_simd_acos_r.
 982  *
 983  * \param x The argument to evaluate acos for
 984  * \result Acos(x)
 985  */
 986 static gmx_inline gmx_simd_float_t gmx_simdcall
 987 gmx_simd_acos_f(gmx_simd_float_t x)
 988 {
 989     const gmx_simd_float_t one       = gmx_simd_set1_f(1.0f);
 990     const gmx_simd_float_t half      = gmx_simd_set1_f(0.5f);
 991     const gmx_simd_float_t pi        = gmx_simd_set1_f((float)M_PI);
 992     const gmx_simd_float_t halfpi    = gmx_simd_set1_f((float)M_PI/2.0f);
 993     gmx_simd_float_t       xabs;
 994     gmx_simd_float_t       z, z1, z2, z3;
 995     gmx_simd_fbool_t       mask1, mask2;
 996
 997     xabs  = gmx_simd_fabs_f(x);
 998     mask1 = gmx_simd_cmplt_f(half, xabs);
 999     mask2 = gmx_simd_cmplt_f(gmx_simd_setzero_f(), x);
1000
1001     z     = gmx_simd_mul_f(half, gmx_simd_sub_f(one, xabs));
1002     z     = gmx_simd_mul_f(z, gmx_simd_invsqrt_f(z));
1003     z     = gmx_simd_blendnotzero_f(z, gmx_simd_cmpeq_f(xabs, one));
1004     z     = gmx_simd_blendv_f(x, z, mask1);
1005     z     = gmx_simd_asin_f(z);
1006
1007     z2    = gmx_simd_add_f(z, z);
1008     z1    = gmx_simd_sub_f(pi, z2);
1009     z3    = gmx_simd_sub_f(halfpi, z);
1010     z     = gmx_simd_blendv_f(z1, z2, mask2);
1011     z     = gmx_simd_blendv_f(z3, z, mask1);
1012
1013     return z;
1014 }
1015
1016 /*! \brief SIMD float asin(x).
1017  *
1018  * You should normally call the real-precision routine \ref gmx_simd_atan_r.
1019  *
1020  * \param x The argument to evaluate atan for
1021  * \result Atan(x), same argument/value range as standard math library.
1022  */
1023 static gmx_inline gmx_simd_float_t gmx_simdcall
1024 gmx_simd_atan_f(gmx_simd_float_t x)
1025 {
1026     const gmx_simd_float_t halfpi    = gmx_simd_set1_f(M_PI/2);
1027     const gmx_simd_float_t CA17      = gmx_simd_set1_f(0.002823638962581753730774f);
1028     const gmx_simd_float_t CA15      = gmx_simd_set1_f(-0.01595690287649631500244f);
1029     const gmx_simd_float_t CA13      = gmx_simd_set1_f(0.04250498861074447631836f);
1030     const gmx_simd_float_t CA11      = gmx_simd_set1_f(-0.07489009201526641845703f);
1031     const gmx_simd_float_t CA9       = gmx_simd_set1_f(0.1063479334115982055664f);
1032     const gmx_simd_float_t CA7       = gmx_simd_set1_f(-0.1420273631811141967773f);
1033     const gmx_simd_float_t CA5       = gmx_simd_set1_f(0.1999269574880599975585f);
1034     const gmx_simd_float_t CA3       = gmx_simd_set1_f(-0.3333310186862945556640f);
1035     gmx_simd_float_t       x2, x3, x4, pA, pB;
1036     gmx_simd_fbool_t       mask, mask2;
1037
1038     mask  = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
1039     x     = gmx_simd_fabs_f(x);
1040     mask2 = gmx_simd_cmplt_f(gmx_simd_set1_f(1.0f), x);
1041     x     = gmx_simd_blendv_f(x, gmx_simd_inv_f(x), mask2);
1042
1043     x2    = gmx_simd_mul_f(x, x);
1044     x3    = gmx_simd_mul_f(x2, x);
1045     x4    = gmx_simd_mul_f(x2, x2);
1046     pA    = gmx_simd_fmadd_f(CA17, x4, CA13);
1047     pB    = gmx_simd_fmadd_f(CA15, x4, CA11);
1048     pA    = gmx_simd_fmadd_f(pA, x4, CA9);
1049     pB    = gmx_simd_fmadd_f(pB, x4, CA7);
1050     pA    = gmx_simd_fmadd_f(pA, x4, CA5);
1051     pB    = gmx_simd_fmadd_f(pB, x4, CA3);
1052     pA    = gmx_simd_fmadd_f(pA, x2, pB);
1053     pA    = gmx_simd_fmadd_f(pA, x3, x);
1054
1055     pA    = gmx_simd_blendv_f(pA, gmx_simd_sub_f(halfpi, pA), mask2);
1056     pA    = gmx_simd_blendv_f(pA, gmx_simd_fneg_f(pA), mask);
1057
1058     return pA;
1059 }
1060
1061 /*! \brief SIMD float atan2(y,x).
1062  *
1063  * You should normally call the real-precision routine \ref gmx_simd_atan2_r.
1064  *
1065  * \param y Y component of vector, any quartile
1066  * \param x X component of vector, any quartile
1067  * \result Atan(y,x), same argument/value range as standard math library.
1068  *
1069  * \note This routine should provide correct results for all finite
1070  * non-zero or positive-zero arguments. However, negative zero arguments will
1071  * be treated as positive zero, which means the return value will deviate from
1072  * the standard math library atan2(y,x) for those cases. That should not be
1073  * of any concern in Gromacs, and in particular it will not affect calculations
1074  * of angles from vectors.
1075  */
1076 static gmx_inline gmx_simd_float_t gmx_simdcall
1077 gmx_simd_atan2_f(gmx_simd_float_t y, gmx_simd_float_t x)
1078 {
1079     const gmx_simd_float_t pi          = gmx_simd_set1_f(M_PI);
1080     const gmx_simd_float_t halfpi      = gmx_simd_set1_f(M_PI/2.0);
1081     gmx_simd_float_t       xinv, p, aoffset;
1082     gmx_simd_fbool_t       mask_x0, mask_y0, mask_xlt0, mask_ylt0;
1083
1084     mask_x0   = gmx_simd_cmpeq_f(x, gmx_simd_setzero_f());
1085     mask_y0   = gmx_simd_cmpeq_f(y, gmx_simd_setzero_f());
1086     mask_xlt0 = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
1087     mask_ylt0 = gmx_simd_cmplt_f(y, gmx_simd_setzero_f());
1088
1089     aoffset   = gmx_simd_blendzero_f(halfpi, mask_x0);
1090     aoffset   = gmx_simd_blendnotzero_f(aoffset, mask_y0);
1091
1092     aoffset   = gmx_simd_blendv_f(aoffset, pi, mask_xlt0);
1093     aoffset   = gmx_simd_blendv_f(aoffset, gmx_simd_fneg_f(aoffset), mask_ylt0);
1094
1095     xinv      = gmx_simd_blendnotzero_f(gmx_simd_inv_f(x), mask_x0);
1096     p         = gmx_simd_mul_f(y, xinv);
1097     p         = gmx_simd_atan_f(p);
1098     p         = gmx_simd_add_f(p, aoffset);
1099
1100     return p;
1101 }
1102
1103 /*! \brief Calculate the force correction due to PME analytically in SIMD float.
1104  *
1105  * You should normally call the real-precision routine \ref gmx_simd_pmecorrF_r.
1106  *
1107  * \param z2 \f$(r \beta)^2\f$ - see below for details.
1108  * \result Correction factor to coulomb force - see below for details.
1109  *
1110  * This routine is meant to enable analytical evaluation of the
1111  * direct-space PME electrostatic force to avoid tables.
1112  *
1113  * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
1114  * are some problems evaluating that:
1115  *
1116  * First, the error function is difficult (read: expensive) to
1117  * approxmiate accurately for intermediate to large arguments, and
1118  * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
1119  * Second, we now try to avoid calculating potentials in Gromacs but
1120  * use forces directly.
1121  *
1122  * We can simply things slight by noting that the PME part is really
1123  * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
1124  * \f[
1125  * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
1126  * \f]
1127  * The first term we already have from the inverse square root, so
1128  * that we can leave out of this routine.
1129  *
1130  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1131  * the argument \f$beta r\f$ will be in the range 0.15 to ~4, which is
1132  * the range used for the minimax fit. Use your favorite plotting program
1133  * to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is in this range!
1134  *
1135  * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
1136  * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
1137  * then only use even powers. This is another minor optimization, since
1138  * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
1139  * the vector between the two atoms to get the vectorial force. The
1140  * fastest flops are the ones we can avoid calculating!
1141  *
1142  * So, here's how it should be used:
1143  *
1144  * 1. Calculate \f$r^2\f$.
1145  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
1146  * 3. Evaluate this routine with \f$z^2\f$ as the argument.
1147  * 4. The return value is the expression:
1148  *
1149  * \f[
1150  *    \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
1151  * \f]
1152  *
1153  * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
1154  *
1155  *  \f[
1156  *    \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
1157  *  \f]
1158  *
1159  *    or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
1160  *
1161  *  \f[
1162  *    \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
1163  *  \f]
1164  *
1165  *    With a bit of math exercise you should be able to confirm that
1166  *    this is exactly
1167  *
1168  *  \f[
1169  *   \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
1170  *  \f]
1171  *
1172  * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
1173  *    and you have your force (divided by \f$r\f$). A final multiplication
1174  *    with the vector connecting the two particles and you have your
1175  *    vectorial force to add to the particles.
1176  *
1177  * This approximation achieves an error slightly lower than 1e-6
1178  * in single precision and 1e-11 in double precision
1179  * for arguments smaller than 16 (\f$\beta r \leq 4 \f$);
1180  * when added to \f$1/r\f$ the error will be insignificant.
1181  * For \f$\beta r \geq 7206\f$ the return value can be inf or NaN.
1182  *
1183  */
1184 static gmx_inline gmx_simd_float_t gmx_simdcall
1185 gmx_simd_pmecorrF_f(gmx_simd_float_t z2)
1186 {
1187     const gmx_simd_float_t  FN6      = gmx_simd_set1_f(-1.7357322914161492954e-8f);
1188     const gmx_simd_float_t  FN5      = gmx_simd_set1_f(1.4703624142580877519e-6f);
1189     const gmx_simd_float_t  FN4      = gmx_simd_set1_f(-0.000053401640219807709149f);
1190     const gmx_simd_float_t  FN3      = gmx_simd_set1_f(0.0010054721316683106153f);
1191     const gmx_simd_float_t  FN2      = gmx_simd_set1_f(-0.019278317264888380590f);
1192     const gmx_simd_float_t  FN1      = gmx_simd_set1_f(0.069670166153766424023f);
1193     const gmx_simd_float_t  FN0      = gmx_simd_set1_f(-0.75225204789749321333f);
1194
1195     const gmx_simd_float_t  FD4      = gmx_simd_set1_f(0.0011193462567257629232f);
1196     const gmx_simd_float_t  FD3      = gmx_simd_set1_f(0.014866955030185295499f);
1197     const gmx_simd_float_t  FD2      = gmx_simd_set1_f(0.11583842382862377919f);
1198     const gmx_simd_float_t  FD1      = gmx_simd_set1_f(0.50736591960530292870f);
1199     const gmx_simd_float_t  FD0      = gmx_simd_set1_f(1.0f);
1200
1201     gmx_simd_float_t        z4;
1202     gmx_simd_float_t        polyFN0, polyFN1, polyFD0, polyFD1;
1203
1204     z4             = gmx_simd_mul_f(z2, z2);
1205
1206     polyFD0        = gmx_simd_fmadd_f(FD4, z4, FD2);
1207     polyFD1        = gmx_simd_fmadd_f(FD3, z4, FD1);
1208     polyFD0        = gmx_simd_fmadd_f(polyFD0, z4, FD0);
1209     polyFD0        = gmx_simd_fmadd_f(polyFD1, z2, polyFD0);
1210
1211     polyFD0        = gmx_simd_inv_f(polyFD0);
1212
1213     polyFN0        = gmx_simd_fmadd_f(FN6, z4, FN4);
1214     polyFN1        = gmx_simd_fmadd_f(FN5, z4, FN3);
1215     polyFN0        = gmx_simd_fmadd_f(polyFN0, z4, FN2);
1216     polyFN1        = gmx_simd_fmadd_f(polyFN1, z4, FN1);
1217     polyFN0        = gmx_simd_fmadd_f(polyFN0, z4, FN0);
1218     polyFN0        = gmx_simd_fmadd_f(polyFN1, z2, polyFN0);
1219
1220     return gmx_simd_mul_f(polyFN0, polyFD0);
1221 }
1222
1223
1224
1225 /*! \brief Calculate the potential correction due to PME analytically in SIMD float.
1226  *
1227  * You should normally call the real-precision routine \ref gmx_simd_pmecorrV_r.
1228  *
1229  * \param z2 \f$(r \beta)^2\f$ - see below for details.
1230  * \result Correction factor to coulomb potential - see below for details.
1231  *
1232  * See \ref gmx_simd_pmecorrF_f for details about the approximation.
1233  *
1234  * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
1235  * as the input argument.
1236  *
1237  * Here's how it should be used:
1238  *
1239  * 1. Calculate \f$r^2\f$.
1240  * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
1241  * 3. Evaluate this routine with z^2 as the argument.
1242  * 4. The return value is the expression:
1243  *
1244  *  \f[
1245  *   \frac{\mbox{erf}(z)}{z}
1246  *  \f]
1247  *
1248  * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
1249  *
1250  *  \f[
1251  *    \frac{\mbox{erf}(r \beta)}{r}
1252  *  \f]
1253  *
1254  * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
1255  *    and you have your potential.
1256  *
1257  * This approximation achieves an error slightly lower than 1e-6
1258  * in single precision and 4e-11 in double precision
1259  * for arguments smaller than 16 (\f$ 0.15 \leq \beta r \leq 4 \f$);
1260  * for \f$ \beta r \leq 0.15\f$ the error can be twice as high;
1261  * when added to \f$1/r\f$ the error will be insignificant.
1262  * For \f$\beta r \geq 7142\f$ the return value can be inf or NaN.
1263  */
1264 static gmx_inline gmx_simd_float_t gmx_simdcall
1265 gmx_simd_pmecorrV_f(gmx_simd_float_t z2)
1266 {
1267     const gmx_simd_float_t  VN6      = gmx_simd_set1_f(1.9296833005951166339e-8f);
1268     const gmx_simd_float_t  VN5      = gmx_simd_set1_f(-1.4213390571557850962e-6f);
1269     const gmx_simd_float_t  VN4      = gmx_simd_set1_f(0.000041603292906656984871f);
1270     const gmx_simd_float_t  VN3      = gmx_simd_set1_f(-0.00013134036773265025626f);
1271     const gmx_simd_float_t  VN2      = gmx_simd_set1_f(0.038657983986041781264f);
1272     const gmx_simd_float_t  VN1      = gmx_simd_set1_f(0.11285044772717598220f);
1273     const gmx_simd_float_t  VN0      = gmx_simd_set1_f(1.1283802385263030286f);
1274
1275     const gmx_simd_float_t  VD3      = gmx_simd_set1_f(0.0066752224023576045451f);
1276     const gmx_simd_float_t  VD2      = gmx_simd_set1_f(0.078647795836373922256f);
1277     const gmx_simd_float_t  VD1      = gmx_simd_set1_f(0.43336185284710920150f);
1278     const gmx_simd_float_t  VD0      = gmx_simd_set1_f(1.0f);
1279
1280     gmx_simd_float_t        z4;
1281     gmx_simd_float_t        polyVN0, polyVN1, polyVD0, polyVD1;
1282
1283     z4             = gmx_simd_mul_f(z2, z2);
1284
1285     polyVD1        = gmx_simd_fmadd_f(VD3, z4, VD1);
1286     polyVD0        = gmx_simd_fmadd_f(VD2, z4, VD0);
1287     polyVD0        = gmx_simd_fmadd_f(polyVD1, z2, polyVD0);
1288
1289     polyVD0        = gmx_simd_inv_f(polyVD0);
1290
1291     polyVN0        = gmx_simd_fmadd_f(VN6, z4, VN4);
1292     polyVN1        = gmx_simd_fmadd_f(VN5, z4, VN3);
1293     polyVN0        = gmx_simd_fmadd_f(polyVN0, z4, VN2);
1294     polyVN1        = gmx_simd_fmadd_f(polyVN1, z4, VN1);
1295     polyVN0        = gmx_simd_fmadd_f(polyVN0, z4, VN0);
1296     polyVN0        = gmx_simd_fmadd_f(polyVN1, z2, polyVN0);
1297
1298     return gmx_simd_mul_f(polyVN0, polyVD0);
1299 }
1300 #endif
1301
1302 /*! \} */
1303
1304 #ifdef GMX_SIMD_HAVE_DOUBLE
1305
1306 /*! \name Double precision SIMD math functions
1307  *
1308  *  \note In most cases you should use the real-precision functions instead.
1309  *  \{
1310  */
1311
1312 /****************************************
1313  * DOUBLE PRECISION SIMD MATH FUNCTIONS *
1314  ****************************************/
1315
1316 /*! \brief SIMD utility function to sum a+b+c+d for SIMD doubles.
1317  *
1318  * \copydetails gmx_simd_sum4_f
1319  */
1320 static gmx_inline gmx_simd_double_t gmx_simdcall
1321 gmx_simd_sum4_d(gmx_simd_double_t a, gmx_simd_double_t b,
1322                 gmx_simd_double_t c, gmx_simd_double_t d)
1323 {
1324     return gmx_simd_add_d(gmx_simd_add_d(a, b), gmx_simd_add_d(c, d));
1325 }
1326
1327 /*! \brief Return -a if b is negative, SIMD double.
1328  *
1329  * You should normally call the real-precision routine \ref gmx_simd_xor_sign_r.
1330  *
1331  * \param a Values to set sign for
1332  * \param b Values used to set sign
1333  * \return if b is negative, the sign of a will be changed.
1334  *
1335  * This is equivalent to doing an xor operation on a with the sign bit of b,
1336  * with the exception that negative zero is not considered to be negative
1337  * on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
1338  */
1339 static gmx_inline gmx_simd_double_t gmx_simdcall
1340 gmx_simd_xor_sign_d(gmx_simd_double_t a, gmx_simd_double_t b)
1341 {
1342 #ifdef GMX_SIMD_HAVE_LOGICAL
1343     return gmx_simd_xor_d(a, gmx_simd_and_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), b));
1344 #else
1345     return gmx_simd_blendv_d(a, gmx_simd_fneg_d(a), gmx_simd_cmplt_d(b, gmx_simd_setzero_d()));
1346 #endif
1347 }
1348
1349 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD double.
1350  *
1351  * \copydetails gmx_simd_rsqrt_iter_f
1352  */
1353 static gmx_inline gmx_simd_double_t gmx_simdcall
1354 gmx_simd_rsqrt_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
1355 {
1356 #ifdef GMX_SIMD_HAVE_FMA
1357     return gmx_simd_fmadd_d(gmx_simd_fnmadd_d(x, gmx_simd_mul_d(lu, lu), gmx_simd_set1_d(1.0)), gmx_simd_mul_d(lu, gmx_simd_set1_d(0.5)), lu);
1358 #else
1359     return gmx_simd_mul_d(gmx_simd_set1_d(0.5), gmx_simd_mul_d(gmx_simd_sub_d(gmx_simd_set1_d(3.0), gmx_simd_mul_d(gmx_simd_mul_d(lu, lu), x)), lu));
1360 #endif
1361 }
1362
1363
1364 /*! \brief Calculate 1/sqrt(x) for SIMD double
1365  *
1366  * \copydetails gmx_simd_invsqrt_f
1367  */
1368 static gmx_inline gmx_simd_double_t gmx_simdcall
1369 gmx_simd_invsqrt_d(gmx_simd_double_t x)
1370 {
1371     gmx_simd_double_t lu = gmx_simd_rsqrt_d(x);
1372 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1373     lu = gmx_simd_rsqrt_iter_d(lu, x);
1374 #endif
1375 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1376     lu = gmx_simd_rsqrt_iter_d(lu, x);
1377 #endif
1378 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1379     lu = gmx_simd_rsqrt_iter_d(lu, x);
1380 #endif
1381 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1382     lu = gmx_simd_rsqrt_iter_d(lu, x);
1383 #endif
1384     return lu;
1385 }
1386
1387 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles.
1388  *
1389  * \copydetails gmx_simd_invsqrt_pair_f
1390  */
1391 static gmx_inline void gmx_simdcall
1392 gmx_simd_invsqrt_pair_d(gmx_simd_double_t x0,    gmx_simd_double_t x1,
1393                         gmx_simd_double_t *out0, gmx_simd_double_t *out1)
1394 {
1395 #if (defined GMX_SIMD_HAVE_FLOAT) && (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH) && (GMX_SIMD_RSQRT_BITS < 22)
1396     gmx_simd_float_t  xf  = gmx_simd_cvt_dd2f(x0, x1);
1397     gmx_simd_float_t  luf = gmx_simd_rsqrt_f(xf);
1398     gmx_simd_double_t lu0, lu1;
1399     /* Intermediate target is single - mantissa+1 bits */
1400 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
1401     luf = gmx_simd_rsqrt_iter_f(luf, xf);
1402 #endif
1403 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
1404     luf = gmx_simd_rsqrt_iter_f(luf, xf);
1405 #endif
1406 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
1407     luf = gmx_simd_rsqrt_iter_f(luf, xf);
1408 #endif
1409     gmx_simd_cvt_f2dd(luf, &lu0, &lu1);
1410     /* Last iteration(s) performed in double - if we had 22 bits, this gets us to 44 (~1e-15) */
1411 #if (GMX_SIMD_MATH_TARGET_SINGLE_BITS < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1412     lu0 = gmx_simd_rsqrt_iter_d(lu0, x0);
1413     lu1 = gmx_simd_rsqrt_iter_d(lu1, x1);
1414 #endif
1415 #if (GMX_SIMD_MATH_TARGET_SINGLE_BITS*2 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1416     lu0 = gmx_simd_rsqrt_iter_d(lu0, x0);
1417     lu1 = gmx_simd_rsqrt_iter_d(lu1, x1);
1418 #endif
1419     *out0 = lu0;
1420     *out1 = lu1;
1421 #else
1422     *out0 = gmx_simd_invsqrt_d(x0);
1423     *out1 = gmx_simd_invsqrt_d(x1);
1424 #endif
1425 }
1426
1427 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD double.
1428  *
1429  * \copydetails gmx_simd_rcp_iter_f
1430  */
1431 static gmx_inline gmx_simd_double_t gmx_simdcall
1432 gmx_simd_rcp_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
1433 {
1434     return gmx_simd_mul_d(lu, gmx_simd_fnmadd_d(lu, x, gmx_simd_set1_d(2.0)));
1435 }
1436
1437 /*! \brief Calculate 1/x for SIMD double.
1438  *
1439  * \copydetails gmx_simd_inv_f
1440  */
1441 static gmx_inline gmx_simd_double_t gmx_simdcall
1442 gmx_simd_inv_d(gmx_simd_double_t x)
1443 {
1444     gmx_simd_double_t lu = gmx_simd_rcp_d(x);
1445 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1446     lu = gmx_simd_rcp_iter_d(lu, x);
1447 #endif
1448 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1449     lu = gmx_simd_rcp_iter_d(lu, x);
1450 #endif
1451 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1452     lu = gmx_simd_rcp_iter_d(lu, x);
1453 #endif
1454 #if (GMX_SIMD_RCP_BITS*8 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1455     lu = gmx_simd_rcp_iter_d(lu, x);
1456 #endif
1457     return lu;
1458 }
1459
1460 /*! \brief Calculate sqrt(x) correctly for SIMD doubles, including argument 0.0.
1461  *
1462  * \copydetails gmx_simd_sqrt_f
1463  */
1464 static gmx_inline gmx_simd_double_t gmx_simdcall
1465 gmx_simd_sqrt_d(gmx_simd_double_t x)
1466 {
1467     gmx_simd_dbool_t   mask;
1468     gmx_simd_double_t  res;
1469
1470     mask = gmx_simd_cmpeq_d(x, gmx_simd_setzero_d());
1471     res  = gmx_simd_blendnotzero_d(gmx_simd_invsqrt_d(x), mask);
1472     return gmx_simd_mul_d(res, x);
1473 }
1474
1475 /*! \brief SIMD double log(x). This is the natural logarithm.
1476  *
1477  * \copydetails gmx_simd_log_f
1478  */
1479 static gmx_inline gmx_simd_double_t gmx_simdcall
1480 gmx_simd_log_d(gmx_simd_double_t x)
1481 {
1482     const gmx_simd_double_t  half       = gmx_simd_set1_d(0.5);
1483     const gmx_simd_double_t  one        = gmx_simd_set1_d(1.0);
1484     const gmx_simd_double_t  sqrt2      = gmx_simd_set1_d(sqrt(2.0));
1485     const gmx_simd_double_t  corr       = gmx_simd_set1_d(0.693147180559945286226764);
1486     const gmx_simd_double_t  CL15       = gmx_simd_set1_d(0.148197055177935105296783);
1487     const gmx_simd_double_t  CL13       = gmx_simd_set1_d(0.153108178020442575739679);
1488     const gmx_simd_double_t  CL11       = gmx_simd_set1_d(0.181837339521549679055568);
1489     const gmx_simd_double_t  CL9        = gmx_simd_set1_d(0.22222194152736701733275);
1490     const gmx_simd_double_t  CL7        = gmx_simd_set1_d(0.285714288030134544449368);
1491     const gmx_simd_double_t  CL5        = gmx_simd_set1_d(0.399999999989941956712869);
1492     const gmx_simd_double_t  CL3        = gmx_simd_set1_d(0.666666666666685503450651);
1493     const gmx_simd_double_t  CL1        = gmx_simd_set1_d(2.0);
1494     gmx_simd_double_t        fexp, x2, p;
1495     gmx_simd_dbool_t         mask;
1496
1497     fexp  = gmx_simd_get_exponent_d(x);
1498     x     = gmx_simd_get_mantissa_d(x);
1499
1500     mask  = gmx_simd_cmplt_d(sqrt2, x);
1501     /* Adjust to non-IEEE format for x>sqrt(2): exponent += 1, mantissa *= 0.5 */
1502     fexp  = gmx_simd_add_d(fexp, gmx_simd_blendzero_d(one, mask));
1503     x     = gmx_simd_mul_d(x, gmx_simd_blendv_d(one, half, mask));
1504
1505     x     = gmx_simd_mul_d( gmx_simd_sub_d(x, one), gmx_simd_inv_d( gmx_simd_add_d(x, one) ) );
1506     x2    = gmx_simd_mul_d(x, x);
1507
1508     p     = gmx_simd_fmadd_d(CL15, x2, CL13);
1509     p     = gmx_simd_fmadd_d(p, x2, CL11);
1510     p     = gmx_simd_fmadd_d(p, x2, CL9);
1511     p     = gmx_simd_fmadd_d(p, x2, CL7);
1512     p     = gmx_simd_fmadd_d(p, x2, CL5);
1513     p     = gmx_simd_fmadd_d(p, x2, CL3);
1514     p     = gmx_simd_fmadd_d(p, x2, CL1);
1515     p     = gmx_simd_fmadd_d(p, x, gmx_simd_mul_d(corr, fexp));
1516
1517     return p;
1518 }
1519
1520 /*! \brief SIMD double 2^x.
1521  *
1522  * \copydetails gmx_simd_exp2_f
1523  */
1524 static gmx_inline gmx_simd_double_t gmx_simdcall
1525 gmx_simd_exp2_d(gmx_simd_double_t x)
1526 {
1527     const gmx_simd_double_t  arglimit      = gmx_simd_set1_d(1022.0);
1528     const gmx_simd_double_t  CE11          = gmx_simd_set1_d(4.435280790452730022081181e-10);
1529     const gmx_simd_double_t  CE10          = gmx_simd_set1_d(7.074105630863314448024247e-09);
1530     const gmx_simd_double_t  CE9           = gmx_simd_set1_d(1.017819803432096698472621e-07);
1531     const gmx_simd_double_t  CE8           = gmx_simd_set1_d(1.321543308956718799557863e-06);
1532     const gmx_simd_double_t  CE7           = gmx_simd_set1_d(0.00001525273348995851746990884);
1533     const gmx_simd_double_t  CE6           = gmx_simd_set1_d(0.0001540353046251466849082632);
1534     const gmx_simd_double_t  CE5           = gmx_simd_set1_d(0.001333355814678995257307880);
1535     const gmx_simd_double_t  CE4           = gmx_simd_set1_d(0.009618129107588335039176502);
1536     const gmx_simd_double_t  CE3           = gmx_simd_set1_d(0.05550410866481992147457793);
1537     const gmx_simd_double_t  CE2           = gmx_simd_set1_d(0.2402265069591015620470894);
1538     const gmx_simd_double_t  CE1           = gmx_simd_set1_d(0.6931471805599453304615075);
1539     const gmx_simd_double_t  one           = gmx_simd_set1_d(1.0);
1540     gmx_simd_double_t        fexppart;
1541     gmx_simd_double_t        intpart;
1542     gmx_simd_double_t        p;
1543     gmx_simd_dbool_t         valuemask;
1544
1545     fexppart  = gmx_simd_set_exponent_d(x);  /* rounds to nearest int internally */
1546     intpart   = gmx_simd_round_d(x);         /* use same rounding mode here */
1547     valuemask = gmx_simd_cmple_d(gmx_simd_fabs_d(x), arglimit);
1548     fexppart  = gmx_simd_blendzero_d(fexppart, valuemask);
1549     x         = gmx_simd_sub_d(x, intpart);
1550
1551     p         = gmx_simd_fmadd_d(CE11, x, CE10);
1552     p         = gmx_simd_fmadd_d(p, x, CE9);
1553     p         = gmx_simd_fmadd_d(p, x, CE8);
1554     p         = gmx_simd_fmadd_d(p, x, CE7);
1555     p         = gmx_simd_fmadd_d(p, x, CE6);
1556     p         = gmx_simd_fmadd_d(p, x, CE5);
1557     p         = gmx_simd_fmadd_d(p, x, CE4);
1558     p         = gmx_simd_fmadd_d(p, x, CE3);
1559     p         = gmx_simd_fmadd_d(p, x, CE2);
1560     p         = gmx_simd_fmadd_d(p, x, CE1);
1561     p         = gmx_simd_fmadd_d(p, x, one);
1562     x         = gmx_simd_mul_d(p, fexppart);
1563     return x;
1564 }
1565
1566 /*! \brief SIMD double exp(x).
1567  *
1568  * \copydetails gmx_simd_exp_f
1569  */
1570 static gmx_inline gmx_simd_double_t gmx_simdcall
1571 gmx_simd_exp_d(gmx_simd_double_t x)
1572 {
1573     const gmx_simd_double_t  argscale      = gmx_simd_set1_d(1.44269504088896340735992468100);
1574     const gmx_simd_double_t  arglimit      = gmx_simd_set1_d(1022.0);
1575     const gmx_simd_double_t  invargscale0  = gmx_simd_set1_d(0.69314718055966295651160180568695068359375);
1576     const gmx_simd_double_t  invargscale1  = gmx_simd_set1_d(2.8235290563031577122588448175013436025525412068e-13);
1577     const gmx_simd_double_t  CE12          = gmx_simd_set1_d(2.078375306791423699350304e-09);
1578     const gmx_simd_double_t  CE11          = gmx_simd_set1_d(2.518173854179933105218635e-08);
1579     const gmx_simd_double_t  CE10          = gmx_simd_set1_d(2.755842049600488770111608e-07);
1580     const gmx_simd_double_t  CE9           = gmx_simd_set1_d(2.755691815216689746619849e-06);
1581     const gmx_simd_double_t  CE8           = gmx_simd_set1_d(2.480158383706245033920920e-05);
1582     const gmx_simd_double_t  CE7           = gmx_simd_set1_d(0.0001984127043518048611841321);
1583     const gmx_simd_double_t  CE6           = gmx_simd_set1_d(0.001388888889360258341755930);
1584     const gmx_simd_double_t  CE5           = gmx_simd_set1_d(0.008333333332907368102819109);
1585     const gmx_simd_double_t  CE4           = gmx_simd_set1_d(0.04166666666663836745814631);
1586     const gmx_simd_double_t  CE3           = gmx_simd_set1_d(0.1666666666666796929434570);
1587     const gmx_simd_double_t  CE2           = gmx_simd_set1_d(0.5);
1588     const gmx_simd_double_t  one           = gmx_simd_set1_d(1.0);
1589     gmx_simd_double_t        fexppart;
1590     gmx_simd_double_t        intpart;
1591     gmx_simd_double_t        y, p;
1592     gmx_simd_dbool_t         valuemask;
1593
1594     y         = gmx_simd_mul_d(x, argscale);
1595     fexppart  = gmx_simd_set_exponent_d(y);  /* rounds to nearest int internally */
1596     intpart   = gmx_simd_round_d(y);         /* use same rounding mode here */
1597     valuemask = gmx_simd_cmple_d(gmx_simd_fabs_d(y), arglimit);
1598     fexppart  = gmx_simd_blendzero_d(fexppart, valuemask);
1599
1600     /* Extended precision arithmetics */
1601     x         = gmx_simd_fnmadd_d(invargscale0, intpart, x);
1602     x         = gmx_simd_fnmadd_d(invargscale1, intpart, x);
1603
1604     p         = gmx_simd_fmadd_d(CE12, x, CE11);
1605     p         = gmx_simd_fmadd_d(p, x, CE10);
1606     p         = gmx_simd_fmadd_d(p, x, CE9);
1607     p         = gmx_simd_fmadd_d(p, x, CE8);
1608     p         = gmx_simd_fmadd_d(p, x, CE7);
1609     p         = gmx_simd_fmadd_d(p, x, CE6);
1610     p         = gmx_simd_fmadd_d(p, x, CE5);
1611     p         = gmx_simd_fmadd_d(p, x, CE4);
1612     p         = gmx_simd_fmadd_d(p, x, CE3);
1613     p         = gmx_simd_fmadd_d(p, x, CE2);
1614     p         = gmx_simd_fmadd_d(p, gmx_simd_mul_d(x, x), gmx_simd_add_d(x, one));
1615     x         = gmx_simd_mul_d(p, fexppart);
1616     return x;
1617 }
1618
1619 /*! \brief SIMD double erf(x).
1620  *
1621  * \copydetails gmx_simd_erf_f
1622  */
1623 static gmx_inline gmx_simd_double_t gmx_simdcall
1624 gmx_simd_erf_d(gmx_simd_double_t x)
1625 {
1626     /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
1627     const gmx_simd_double_t CAP4      = gmx_simd_set1_d(-0.431780540597889301512e-4);
1628     const gmx_simd_double_t CAP3      = gmx_simd_set1_d(-0.00578562306260059236059);
1629     const gmx_simd_double_t CAP2      = gmx_simd_set1_d(-0.028593586920219752446);
1630     const gmx_simd_double_t CAP1      = gmx_simd_set1_d(-0.315924962948621698209);
1631     const gmx_simd_double_t CAP0      = gmx_simd_set1_d(0.14952975608477029151);
1632
1633     const gmx_simd_double_t CAQ5      = gmx_simd_set1_d(-0.374089300177174709737e-5);
1634     const gmx_simd_double_t CAQ4      = gmx_simd_set1_d(0.00015126584532155383535);
1635     const gmx_simd_double_t CAQ3      = gmx_simd_set1_d(0.00536692680669480725423);
1636     const gmx_simd_double_t CAQ2      = gmx_simd_set1_d(0.0668686825594046122636);
1637     const gmx_simd_double_t CAQ1      = gmx_simd_set1_d(0.402604990869284362773);
1638     /* CAQ0 == 1.0 */
1639     const gmx_simd_double_t CAoffset  = gmx_simd_set1_d(0.9788494110107421875);
1640
1641     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
1642     const gmx_simd_double_t CBP6      = gmx_simd_set1_d(2.49650423685462752497647637088e-10);
1643     const gmx_simd_double_t CBP5      = gmx_simd_set1_d(0.00119770193298159629350136085658);
1644     const gmx_simd_double_t CBP4      = gmx_simd_set1_d(0.0164944422378370965881008942733);
1645     const gmx_simd_double_t CBP3      = gmx_simd_set1_d(0.0984581468691775932063932439252);
1646     const gmx_simd_double_t CBP2      = gmx_simd_set1_d(0.317364595806937763843589437418);
1647     const gmx_simd_double_t CBP1      = gmx_simd_set1_d(0.554167062641455850932670067075);
1648     const gmx_simd_double_t CBP0      = gmx_simd_set1_d(0.427583576155807163756925301060);
1649     const gmx_simd_double_t CBQ7      = gmx_simd_set1_d(0.00212288829699830145976198384930);
1650     const gmx_simd_double_t CBQ6      = gmx_simd_set1_d(0.0334810979522685300554606393425);
1651     const gmx_simd_double_t CBQ5      = gmx_simd_set1_d(0.2361713785181450957579508850717);
1652     const gmx_simd_double_t CBQ4      = gmx_simd_set1_d(0.955364736493055670530981883072);
1653     const gmx_simd_double_t CBQ3      = gmx_simd_set1_d(2.36815675631420037315349279199);
1654     const gmx_simd_double_t CBQ2      = gmx_simd_set1_d(3.55261649184083035537184223542);
1655     const gmx_simd_double_t CBQ1      = gmx_simd_set1_d(2.93501136050160872574376997993);
1656     /* CBQ0 == 1.0 */
1657
1658     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
1659     const gmx_simd_double_t CCP6      = gmx_simd_set1_d(-2.8175401114513378771);
1660     const gmx_simd_double_t CCP5      = gmx_simd_set1_d(-3.22729451764143718517);
1661     const gmx_simd_double_t CCP4      = gmx_simd_set1_d(-2.5518551727311523996);
1662     const gmx_simd_double_t CCP3      = gmx_simd_set1_d(-0.687717681153649930619);
1663     const gmx_simd_double_t CCP2      = gmx_simd_set1_d(-0.212652252872804219852);
1664     const gmx_simd_double_t CCP1      = gmx_simd_set1_d(0.0175389834052493308818);
1665     const gmx_simd_double_t CCP0      = gmx_simd_set1_d(0.00628057170626964891937);
1666
1667     const gmx_simd_double_t CCQ6      = gmx_simd_set1_d(5.48409182238641741584);
1668     const gmx_simd_double_t CCQ5      = gmx_simd_set1_d(13.5064170191802889145);
1669     const gmx_simd_double_t CCQ4      = gmx_simd_set1_d(22.9367376522880577224);
1670     const gmx_simd_double_t CCQ3      = gmx_simd_set1_d(15.930646027911794143);
1671     const gmx_simd_double_t CCQ2      = gmx_simd_set1_d(11.0567237927800161565);
1672     const gmx_simd_double_t CCQ1      = gmx_simd_set1_d(2.79257750980575282228);
1673     /* CCQ0 == 1.0 */
1674     const gmx_simd_double_t CCoffset  = gmx_simd_set1_d(0.5579090118408203125);
1675
1676     const gmx_simd_double_t one       = gmx_simd_set1_d(1.0);
1677     const gmx_simd_double_t two       = gmx_simd_set1_d(2.0);
1678
1679     gmx_simd_double_t       xabs, x2, x4, t, t2, w, w2;
1680     gmx_simd_double_t       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
1681     gmx_simd_double_t       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
1682     gmx_simd_double_t       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
1683     gmx_simd_double_t       res_erf, res_erfcB, res_erfcC, res_erfc, res;
1684     gmx_simd_double_t       expmx2;
1685     gmx_simd_dbool_t        mask;
1686
1687     /* Calculate erf() */
1688     xabs     = gmx_simd_fabs_d(x);
1689     x2       = gmx_simd_mul_d(x, x);
1690     x4       = gmx_simd_mul_d(x2, x2);
1691
1692     PolyAP0  = gmx_simd_mul_d(CAP4, x4);
1693     PolyAP1  = gmx_simd_mul_d(CAP3, x4);
1694     PolyAP0  = gmx_simd_add_d(PolyAP0, CAP2);
1695     PolyAP1  = gmx_simd_add_d(PolyAP1, CAP1);
1696     PolyAP0  = gmx_simd_mul_d(PolyAP0, x4);
1697     PolyAP1  = gmx_simd_mul_d(PolyAP1, x2);
1698     PolyAP0  = gmx_simd_add_d(PolyAP0, CAP0);
1699     PolyAP0  = gmx_simd_add_d(PolyAP0, PolyAP1);
1700
1701     PolyAQ1  = gmx_simd_mul_d(CAQ5, x4);
1702     PolyAQ0  = gmx_simd_mul_d(CAQ4, x4);
1703     PolyAQ1  = gmx_simd_add_d(PolyAQ1, CAQ3);
1704     PolyAQ0  = gmx_simd_add_d(PolyAQ0, CAQ2);
1705     PolyAQ1  = gmx_simd_mul_d(PolyAQ1, x4);
1706     PolyAQ0  = gmx_simd_mul_d(PolyAQ0, x4);
1707     PolyAQ1  = gmx_simd_add_d(PolyAQ1, CAQ1);
1708     PolyAQ0  = gmx_simd_add_d(PolyAQ0, one);
1709     PolyAQ1  = gmx_simd_mul_d(PolyAQ1, x2);
1710     PolyAQ0  = gmx_simd_add_d(PolyAQ0, PolyAQ1);
1711
1712     res_erf  = gmx_simd_mul_d(PolyAP0, gmx_simd_inv_d(PolyAQ0));
1713     res_erf  = gmx_simd_add_d(CAoffset, res_erf);
1714     res_erf  = gmx_simd_mul_d(x, res_erf);
1715
1716     /* Calculate erfc() in range [1,4.5] */
1717     t       = gmx_simd_sub_d(xabs, one);
1718     t2      = gmx_simd_mul_d(t, t);
1719
1720     PolyBP0  = gmx_simd_mul_d(CBP6, t2);
1721     PolyBP1  = gmx_simd_mul_d(CBP5, t2);
1722     PolyBP0  = gmx_simd_add_d(PolyBP0, CBP4);
1723     PolyBP1  = gmx_simd_add_d(PolyBP1, CBP3);
1724     PolyBP0  = gmx_simd_mul_d(PolyBP0, t2);
1725     PolyBP1  = gmx_simd_mul_d(PolyBP1, t2);
1726     PolyBP0  = gmx_simd_add_d(PolyBP0, CBP2);
1727     PolyBP1  = gmx_simd_add_d(PolyBP1, CBP1);
1728     PolyBP0  = gmx_simd_mul_d(PolyBP0, t2);
1729     PolyBP1  = gmx_simd_mul_d(PolyBP1, t);
1730     PolyBP0  = gmx_simd_add_d(PolyBP0, CBP0);
1731     PolyBP0  = gmx_simd_add_d(PolyBP0, PolyBP1);
1732
1733     PolyBQ1 = gmx_simd_mul_d(CBQ7, t2);
1734     PolyBQ0 = gmx_simd_mul_d(CBQ6, t2);
1735     PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ5);
1736     PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ4);
1737     PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
1738     PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
1739     PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ3);
1740     PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ2);
1741     PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
1742     PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
1743     PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ1);
1744     PolyBQ0 = gmx_simd_add_d(PolyBQ0, one);
1745     PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t);
1746     PolyBQ0 = gmx_simd_add_d(PolyBQ0, PolyBQ1);
1747
1748     res_erfcB = gmx_simd_mul_d(PolyBP0, gmx_simd_inv_d(PolyBQ0));
1749
1750     res_erfcB = gmx_simd_mul_d(res_erfcB, xabs);
1751
1752     /* Calculate erfc() in range [4.5,inf] */
1753     w       = gmx_simd_inv_d(xabs);
1754     w2      = gmx_simd_mul_d(w, w);
1755
1756     PolyCP0  = gmx_simd_mul_d(CCP6, w2);
1757     PolyCP1  = gmx_simd_mul_d(CCP5, w2);
1758     PolyCP0  = gmx_simd_add_d(PolyCP0, CCP4);
1759     PolyCP1  = gmx_simd_add_d(PolyCP1, CCP3);
1760     PolyCP0  = gmx_simd_mul_d(PolyCP0, w2);
1761     PolyCP1  = gmx_simd_mul_d(PolyCP1, w2);
1762     PolyCP0  = gmx_simd_add_d(PolyCP0, CCP2);
1763     PolyCP1  = gmx_simd_add_d(PolyCP1, CCP1);
1764     PolyCP0  = gmx_simd_mul_d(PolyCP0, w2);
1765     PolyCP1  = gmx_simd_mul_d(PolyCP1, w);
1766     PolyCP0  = gmx_simd_add_d(PolyCP0, CCP0);
1767     PolyCP0  = gmx_simd_add_d(PolyCP0, PolyCP1);
1768
1769     PolyCQ0  = gmx_simd_mul_d(CCQ6, w2);
1770     PolyCQ1  = gmx_simd_mul_d(CCQ5, w2);
1771     PolyCQ0  = gmx_simd_add_d(PolyCQ0, CCQ4);
1772     PolyCQ1  = gmx_simd_add_d(PolyCQ1, CCQ3);
1773     PolyCQ0  = gmx_simd_mul_d(PolyCQ0, w2);
1774     PolyCQ1  = gmx_simd_mul_d(PolyCQ1, w2);
1775     PolyCQ0  = gmx_simd_add_d(PolyCQ0, CCQ2);
1776     PolyCQ1  = gmx_simd_add_d(PolyCQ1, CCQ1);
1777     PolyCQ0  = gmx_simd_mul_d(PolyCQ0, w2);
1778     PolyCQ1  = gmx_simd_mul_d(PolyCQ1, w);
1779     PolyCQ0  = gmx_simd_add_d(PolyCQ0, one);
1780     PolyCQ0  = gmx_simd_add_d(PolyCQ0, PolyCQ1);
1781
1782     expmx2   = gmx_simd_exp_d( gmx_simd_fneg_d(x2) );
1783
1784     res_erfcC = gmx_simd_mul_d(PolyCP0, gmx_simd_inv_d(PolyCQ0));
1785     res_erfcC = gmx_simd_add_d(res_erfcC, CCoffset);
1786     res_erfcC = gmx_simd_mul_d(res_erfcC, w);
1787
1788     mask     = gmx_simd_cmplt_d(gmx_simd_set1_d(4.5), xabs);
1789     res_erfc = gmx_simd_blendv_d(res_erfcB, res_erfcC, mask);
1790
1791     res_erfc = gmx_simd_mul_d(res_erfc, expmx2);
1792
1793     /* erfc(x<0) = 2-erfc(|x|) */
1794     mask     = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
1795     res_erfc = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(two, res_erfc), mask);
1796
1797     /* Select erf() or erfc() */
1798     mask = gmx_simd_cmplt_d(xabs, one);
1799     res  = gmx_simd_blendv_d(gmx_simd_sub_d(one, res_erfc), res_erf, mask);
1800
1801     return res;
1802 }
1803
1804 /*! \brief SIMD double erfc(x).
1805  *
1806  * \copydetails gmx_simd_erfc_f
1807  */
1808 static gmx_inline gmx_simd_double_t gmx_simdcall
1809 gmx_simd_erfc_d(gmx_simd_double_t x)
1810 {
1811     /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
1812     const gmx_simd_double_t CAP4      = gmx_simd_set1_d(-0.431780540597889301512e-4);
1813     const gmx_simd_double_t CAP3      = gmx_simd_set1_d(-0.00578562306260059236059);
1814     const gmx_simd_double_t CAP2      = gmx_simd_set1_d(-0.028593586920219752446);
1815     const gmx_simd_double_t CAP1      = gmx_simd_set1_d(-0.315924962948621698209);
1816     const gmx_simd_double_t CAP0      = gmx_simd_set1_d(0.14952975608477029151);
1817
1818     const gmx_simd_double_t CAQ5      = gmx_simd_set1_d(-0.374089300177174709737e-5);
1819     const gmx_simd_double_t CAQ4      = gmx_simd_set1_d(0.00015126584532155383535);
1820     const gmx_simd_double_t CAQ3      = gmx_simd_set1_d(0.00536692680669480725423);
1821     const gmx_simd_double_t CAQ2      = gmx_simd_set1_d(0.0668686825594046122636);
1822     const gmx_simd_double_t CAQ1      = gmx_simd_set1_d(0.402604990869284362773);
1823     /* CAQ0 == 1.0 */
1824     const gmx_simd_double_t CAoffset  = gmx_simd_set1_d(0.9788494110107421875);
1825
1826     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
1827     const gmx_simd_double_t CBP6      = gmx_simd_set1_d(2.49650423685462752497647637088e-10);
1828     const gmx_simd_double_t CBP5      = gmx_simd_set1_d(0.00119770193298159629350136085658);
1829     const gmx_simd_double_t CBP4      = gmx_simd_set1_d(0.0164944422378370965881008942733);
1830     const gmx_simd_double_t CBP3      = gmx_simd_set1_d(0.0984581468691775932063932439252);
1831     const gmx_simd_double_t CBP2      = gmx_simd_set1_d(0.317364595806937763843589437418);
1832     const gmx_simd_double_t CBP1      = gmx_simd_set1_d(0.554167062641455850932670067075);
1833     const gmx_simd_double_t CBP0      = gmx_simd_set1_d(0.427583576155807163756925301060);
1834     const gmx_simd_double_t CBQ7      = gmx_simd_set1_d(0.00212288829699830145976198384930);
1835     const gmx_simd_double_t CBQ6      = gmx_simd_set1_d(0.0334810979522685300554606393425);
1836     const gmx_simd_double_t CBQ5      = gmx_simd_set1_d(0.2361713785181450957579508850717);
1837     const gmx_simd_double_t CBQ4      = gmx_simd_set1_d(0.955364736493055670530981883072);
1838     const gmx_simd_double_t CBQ3      = gmx_simd_set1_d(2.36815675631420037315349279199);
1839     const gmx_simd_double_t CBQ2      = gmx_simd_set1_d(3.55261649184083035537184223542);
1840     const gmx_simd_double_t CBQ1      = gmx_simd_set1_d(2.93501136050160872574376997993);
1841     /* CBQ0 == 1.0 */
1842
1843     /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
1844     const gmx_simd_double_t CCP6      = gmx_simd_set1_d(-2.8175401114513378771);
1845     const gmx_simd_double_t CCP5      = gmx_simd_set1_d(-3.22729451764143718517);
1846     const gmx_simd_double_t CCP4      = gmx_simd_set1_d(-2.5518551727311523996);
1847     const gmx_simd_double_t CCP3      = gmx_simd_set1_d(-0.687717681153649930619);
1848     const gmx_simd_double_t CCP2      = gmx_simd_set1_d(-0.212652252872804219852);
1849     const gmx_simd_double_t CCP1      = gmx_simd_set1_d(0.0175389834052493308818);
1850     const gmx_simd_double_t CCP0      = gmx_simd_set1_d(0.00628057170626964891937);
1851
1852     const gmx_simd_double_t CCQ6      = gmx_simd_set1_d(5.48409182238641741584);
1853     const gmx_simd_double_t CCQ5      = gmx_simd_set1_d(13.5064170191802889145);
1854     const gmx_simd_double_t CCQ4      = gmx_simd_set1_d(22.9367376522880577224);
1855     const gmx_simd_double_t CCQ3      = gmx_simd_set1_d(15.930646027911794143);
1856     const gmx_simd_double_t CCQ2      = gmx_simd_set1_d(11.0567237927800161565);
1857     const gmx_simd_double_t CCQ1      = gmx_simd_set1_d(2.79257750980575282228);
1858     /* CCQ0 == 1.0 */
1859     const gmx_simd_double_t CCoffset  = gmx_simd_set1_d(0.5579090118408203125);
1860
1861     const gmx_simd_double_t one       = gmx_simd_set1_d(1.0);
1862     const gmx_simd_double_t two       = gmx_simd_set1_d(2.0);
1863
1864     gmx_simd_double_t       xabs, x2, x4, t, t2, w, w2;
1865     gmx_simd_double_t       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
1866     gmx_simd_double_t       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
1867     gmx_simd_double_t       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
1868     gmx_simd_double_t       res_erf, res_erfcB, res_erfcC, res_erfc, res;
1869     gmx_simd_double_t       expmx2;
1870     gmx_simd_dbool_t        mask;
1871
1872     /* Calculate erf() */
1873     xabs     = gmx_simd_fabs_d(x);
1874     x2       = gmx_simd_mul_d(x, x);
1875     x4       = gmx_simd_mul_d(x2, x2);
1876
1877     PolyAP0  = gmx_simd_mul_d(CAP4, x4);
1878     PolyAP1  = gmx_simd_mul_d(CAP3, x4);
1879     PolyAP0  = gmx_simd_add_d(PolyAP0, CAP2);
1880     PolyAP1  = gmx_simd_add_d(PolyAP1, CAP1);
1881     PolyAP0  = gmx_simd_mul_d(PolyAP0, x4);
1882     PolyAP1  = gmx_simd_mul_d(PolyAP1, x2);
1883     PolyAP0  = gmx_simd_add_d(PolyAP0, CAP0);
1884     PolyAP0  = gmx_simd_add_d(PolyAP0, PolyAP1);
1885
1886     PolyAQ1  = gmx_simd_mul_d(CAQ5, x4);
1887     PolyAQ0  = gmx_simd_mul_d(CAQ4, x4);
1888     PolyAQ1  = gmx_simd_add_d(PolyAQ1, CAQ3);
1889     PolyAQ0  = gmx_simd_add_d(PolyAQ0, CAQ2);
1890     PolyAQ1  = gmx_simd_mul_d(PolyAQ1, x4);
1891     PolyAQ0  = gmx_simd_mul_d(PolyAQ0, x4);
1892     PolyAQ1  = gmx_simd_add_d(PolyAQ1, CAQ1);
1893     PolyAQ0  = gmx_simd_add_d(PolyAQ0, one);
1894     PolyAQ1  = gmx_simd_mul_d(PolyAQ1, x2);
1895     PolyAQ0  = gmx_simd_add_d(PolyAQ0, PolyAQ1);
1896
1897     res_erf  = gmx_simd_mul_d(PolyAP0, gmx_simd_inv_d(PolyAQ0));
1898     res_erf  = gmx_simd_add_d(CAoffset, res_erf);
1899     res_erf  = gmx_simd_mul_d(x, res_erf);
1900
1901     /* Calculate erfc() in range [1,4.5] */
1902     t       = gmx_simd_sub_d(xabs, one);
1903     t2      = gmx_simd_mul_d(t, t);
1904
1905     PolyBP0  = gmx_simd_mul_d(CBP6, t2);
1906     PolyBP1  = gmx_simd_mul_d(CBP5, t2);
1907     PolyBP0  = gmx_simd_add_d(PolyBP0, CBP4);
1908     PolyBP1  = gmx_simd_add_d(PolyBP1, CBP3);
1909     PolyBP0  = gmx_simd_mul_d(PolyBP0, t2);
1910     PolyBP1  = gmx_simd_mul_d(PolyBP1, t2);
1911     PolyBP0  = gmx_simd_add_d(PolyBP0, CBP2);
1912     PolyBP1  = gmx_simd_add_d(PolyBP1, CBP1);
1913     PolyBP0  = gmx_simd_mul_d(PolyBP0, t2);
1914     PolyBP1  = gmx_simd_mul_d(PolyBP1, t);
1915     PolyBP0  = gmx_simd_add_d(PolyBP0, CBP0);
1916     PolyBP0  = gmx_simd_add_d(PolyBP0, PolyBP1);
1917
1918     PolyBQ1 = gmx_simd_mul_d(CBQ7, t2);
1919     PolyBQ0 = gmx_simd_mul_d(CBQ6, t2);
1920     PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ5);
1921     PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ4);
1922     PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
1923     PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
1924     PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ3);
1925     PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ2);
1926     PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
1927     PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
1928     PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ1);
1929     PolyBQ0 = gmx_simd_add_d(PolyBQ0, one);
1930     PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t);
1931     PolyBQ0 = gmx_simd_add_d(PolyBQ0, PolyBQ1);
1932
1933     res_erfcB = gmx_simd_mul_d(PolyBP0, gmx_simd_inv_d(PolyBQ0));
1934
1935     res_erfcB = gmx_simd_mul_d(res_erfcB, xabs);
1936
1937     /* Calculate erfc() in range [4.5,inf] */
1938     w       = gmx_simd_inv_d(xabs);
1939     w2      = gmx_simd_mul_d(w, w);
1940
1941     PolyCP0  = gmx_simd_mul_d(CCP6, w2);
1942     PolyCP1  = gmx_simd_mul_d(CCP5, w2);
1943     PolyCP0  = gmx_simd_add_d(PolyCP0, CCP4);
1944     PolyCP1  = gmx_simd_add_d(PolyCP1, CCP3);
1945     PolyCP0  = gmx_simd_mul_d(PolyCP0, w2);
1946     PolyCP1  = gmx_simd_mul_d(PolyCP1, w2);
1947     PolyCP0  = gmx_simd_add_d(PolyCP0, CCP2);
1948     PolyCP1  = gmx_simd_add_d(PolyCP1, CCP1);
1949     PolyCP0  = gmx_simd_mul_d(PolyCP0, w2);
1950     PolyCP1  = gmx_simd_mul_d(PolyCP1, w);
1951     PolyCP0  = gmx_simd_add_d(PolyCP0, CCP0);
1952     PolyCP0  = gmx_simd_add_d(PolyCP0, PolyCP1);
1953
1954     PolyCQ0  = gmx_simd_mul_d(CCQ6, w2);
1955     PolyCQ1  = gmx_simd_mul_d(CCQ5, w2);
1956     PolyCQ0  = gmx_simd_add_d(PolyCQ0, CCQ4);
1957     PolyCQ1  = gmx_simd_add_d(PolyCQ1, CCQ3);
1958     PolyCQ0  = gmx_simd_mul_d(PolyCQ0, w2);
1959     PolyCQ1  = gmx_simd_mul_d(PolyCQ1, w2);
1960     PolyCQ0  = gmx_simd_add_d(PolyCQ0, CCQ2);
1961     PolyCQ1  = gmx_simd_add_d(PolyCQ1, CCQ1);
1962     PolyCQ0  = gmx_simd_mul_d(PolyCQ0, w2);
1963     PolyCQ1  = gmx_simd_mul_d(PolyCQ1, w);
1964     PolyCQ0  = gmx_simd_add_d(PolyCQ0, one);
1965     PolyCQ0  = gmx_simd_add_d(PolyCQ0, PolyCQ1);
1966
1967     expmx2   = gmx_simd_exp_d( gmx_simd_fneg_d(x2) );
1968
1969     res_erfcC = gmx_simd_mul_d(PolyCP0, gmx_simd_inv_d(PolyCQ0));
1970     res_erfcC = gmx_simd_add_d(res_erfcC, CCoffset);
1971     res_erfcC = gmx_simd_mul_d(res_erfcC, w);
1972
1973     mask     = gmx_simd_cmplt_d(gmx_simd_set1_d(4.5), xabs);
1974     res_erfc = gmx_simd_blendv_d(res_erfcB, res_erfcC, mask);
1975
1976     res_erfc = gmx_simd_mul_d(res_erfc, expmx2);
1977
1978     /* erfc(x<0) = 2-erfc(|x|) */
1979     mask     = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
1980     res_erfc = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(two, res_erfc), mask);
1981
1982     /* Select erf() or erfc() */
1983     mask = gmx_simd_cmplt_d(xabs, one);
1984     res  = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(one, res_erf), mask);
1985
1986     return res;
1987 }
1988
1989 /*! \brief SIMD double sin \& cos.
1990  *
1991  * \copydetails gmx_simd_sincos_f
1992  */
1993 static gmx_inline void gmx_simdcall
1994 gmx_simd_sincos_d(gmx_simd_double_t x, gmx_simd_double_t *sinval, gmx_simd_double_t *cosval)
1995 {
1996     /* Constants to subtract Pi/4*x from y while minimizing precision loss */
1997     const gmx_simd_double_t  argred0         = gmx_simd_set1_d(2*0.78539816290140151978);
1998     const gmx_simd_double_t  argred1         = gmx_simd_set1_d(2*4.9604678871439933374e-10);
1999     const gmx_simd_double_t  argred2         = gmx_simd_set1_d(2*1.1258708853173288931e-18);
2000     const gmx_simd_double_t  argred3         = gmx_simd_set1_d(2*1.7607799325916000908e-27);
2001     const gmx_simd_double_t  two_over_pi     = gmx_simd_set1_d(2.0/M_PI);
2002     const gmx_simd_double_t  const_sin5      = gmx_simd_set1_d( 1.58938307283228937328511e-10);
2003     const gmx_simd_double_t  const_sin4      = gmx_simd_set1_d(-2.50506943502539773349318e-08);
2004     const gmx_simd_double_t  const_sin3      = gmx_simd_set1_d( 2.75573131776846360512547e-06);
2005     const gmx_simd_double_t  const_sin2      = gmx_simd_set1_d(-0.000198412698278911770864914);
2006     const gmx_simd_double_t  const_sin1      = gmx_simd_set1_d( 0.0083333333333191845961746);
2007     const gmx_simd_double_t  const_sin0      = gmx_simd_set1_d(-0.166666666666666130709393);
2008
2009     const gmx_simd_double_t  const_cos7      = gmx_simd_set1_d(-1.13615350239097429531523e-11);
2010     const gmx_simd_double_t  const_cos6      = gmx_simd_set1_d( 2.08757471207040055479366e-09);
2011     const gmx_simd_double_t  const_cos5      = gmx_simd_set1_d(-2.75573144028847567498567e-07);
2012     const gmx_simd_double_t  const_cos4      = gmx_simd_set1_d( 2.48015872890001867311915e-05);
2013     const gmx_simd_double_t  const_cos3      = gmx_simd_set1_d(-0.00138888888888714019282329);
2014     const gmx_simd_double_t  const_cos2      = gmx_simd_set1_d( 0.0416666666666665519592062);
2015     const gmx_simd_double_t  half            = gmx_simd_set1_d(0.5);
2016     const gmx_simd_double_t  one             = gmx_simd_set1_d(1.0);
2017     gmx_simd_double_t        ssign, csign;
2018     gmx_simd_double_t        x2, y, z, psin, pcos, sss, ccc;
2019     gmx_simd_dbool_t         mask;
2020 #if (defined GMX_SIMD_HAVE_DINT32) && (defined GMX_SIMD_HAVE_DINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
2021     const gmx_simd_dint32_t  ione            = gmx_simd_set1_di(1);
2022     const gmx_simd_dint32_t  itwo            = gmx_simd_set1_di(2);
2023     gmx_simd_dint32_t        iy;
2024
2025     z       = gmx_simd_mul_d(x, two_over_pi);
2026     iy      = gmx_simd_cvt_d2i(z);
2027     y       = gmx_simd_round_d(z);
2028
2029     mask    = gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, ione), gmx_simd_setzero_di()));
2030     ssign   = gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, itwo), itwo)));
2031     csign   = gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(gmx_simd_add_di(iy, ione), itwo), itwo)));
2032 #else
2033     const gmx_simd_double_t  quarter         = gmx_simd_set1_d(0.25);
2034     const gmx_simd_double_t  minusquarter    = gmx_simd_set1_d(-0.25);
2035     gmx_simd_double_t        q;
2036     gmx_simd_dbool_t         m1, m2, m3;
2037
2038     /* The most obvious way to find the arguments quadrant in the unit circle
2039      * to calculate the sign is to use integer arithmetic, but that is not
2040      * present in all SIMD implementations. As an alternative, we have devised a
2041      * pure floating-point algorithm that uses truncation for argument reduction
2042      * so that we get a new value 0<=q<1 over the unit circle, and then
2043      * do floating-point comparisons with fractions. This is likely to be
2044      * slightly slower (~10%) due to the longer latencies of floating-point, so
2045      * we only use it when integer SIMD arithmetic is not present.
2046      */
2047     ssign   = x;
2048     x       = gmx_simd_fabs_d(x);
2049     /* It is critical that half-way cases are rounded down */
2050     z       = gmx_simd_fmadd_d(x, two_over_pi, half);
2051     y       = gmx_simd_trunc_d(z);
2052     q       = gmx_simd_mul_d(z, quarter);
2053     q       = gmx_simd_sub_d(q, gmx_simd_trunc_d(q));
2054     /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
2055      * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
2056      * This removes the 2*Pi periodicity without using any integer arithmetic.
2057      * First check if y had the value 2 or 3, set csign if true.
2058      */
2059     q       = gmx_simd_sub_d(q, half);
2060     /* If we have logical operations we can work directly on the signbit, which
2061      * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
2062      * Thus, if you are altering defines to debug alternative code paths, the
2063      * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
2064      * active or inactive - you will get errors if only one is used.
2065      */
2066 #    ifdef GMX_SIMD_HAVE_LOGICAL
2067     ssign   = gmx_simd_and_d(ssign, gmx_simd_set1_d(GMX_DOUBLE_NEGZERO));
2068     csign   = gmx_simd_andnot_d(q, gmx_simd_set1_d(GMX_DOUBLE_NEGZERO));
2069     ssign   = gmx_simd_xor_d(ssign, csign);
2070 #    else
2071     csign   = gmx_simd_xor_sign_d(gmx_simd_set1_d(-1.0), q);
2072     ssign   = gmx_simd_xor_sign_d(ssign, csign);    /* swap ssign if csign was set. */
2073 #    endif
2074     /* Check if y had value 1 or 3 (remember we subtracted 0.5 from q) */
2075     m1      = gmx_simd_cmplt_d(q, minusquarter);
2076     m2      = gmx_simd_cmple_d(gmx_simd_setzero_d(), q);
2077     m3      = gmx_simd_cmplt_d(q, quarter);
2078     m2      = gmx_simd_and_db(m2, m3);
2079     mask    = gmx_simd_or_db(m1, m2);
2080     /* where mask is FALSE, set sign. */
2081     csign   = gmx_simd_xor_sign_d(csign, gmx_simd_blendv_d(gmx_simd_set1_d(-1.0), one, mask));
2082 #endif
2083     x       = gmx_simd_fnmadd_d(y, argred0, x);
2084     x       = gmx_simd_fnmadd_d(y, argred1, x);
2085     x       = gmx_simd_fnmadd_d(y, argred2, x);
2086     x       = gmx_simd_fnmadd_d(y, argred3, x);
2087     x2      = gmx_simd_mul_d(x, x);
2088
2089     psin    = gmx_simd_fmadd_d(const_sin5, x2, const_sin4);
2090     psin    = gmx_simd_fmadd_d(psin, x2, const_sin3);
2091     psin    = gmx_simd_fmadd_d(psin, x2, const_sin2);
2092     psin    = gmx_simd_fmadd_d(psin, x2, const_sin1);
2093     psin    = gmx_simd_fmadd_d(psin, x2, const_sin0);
2094     psin    = gmx_simd_fmadd_d(psin, gmx_simd_mul_d(x2, x), x);
2095
2096     pcos    = gmx_simd_fmadd_d(const_cos7, x2, const_cos6);
2097     pcos    = gmx_simd_fmadd_d(pcos, x2, const_cos5);
2098     pcos    = gmx_simd_fmadd_d(pcos, x2, const_cos4);
2099     pcos    = gmx_simd_fmadd_d(pcos, x2, const_cos3);
2100     pcos    = gmx_simd_fmadd_d(pcos, x2, const_cos2);
2101     pcos    = gmx_simd_fmsub_d(pcos, x2, half);
2102     pcos    = gmx_simd_fmadd_d(pcos, x2, one);
2103
2104     sss     = gmx_simd_blendv_d(pcos, psin, mask);
2105     ccc     = gmx_simd_blendv_d(psin, pcos, mask);
2106     /* See comment for GMX_SIMD_HAVE_LOGICAL section above. */
2107 #ifdef GMX_SIMD_HAVE_LOGICAL
2108     *sinval = gmx_simd_xor_d(sss, ssign);
2109     *cosval = gmx_simd_xor_d(ccc, csign);
2110 #else
2111     *sinval = gmx_simd_xor_sign_d(sss, ssign);
2112     *cosval = gmx_simd_xor_sign_d(ccc, csign);
2113 #endif
2114 }
2115
2116 /*! \brief SIMD double sin(x).
2117  *
2118  * \copydetails gmx_simd_sin_f
2119  */
2120 static gmx_inline gmx_simd_double_t gmx_simdcall
2121 gmx_simd_sin_d(gmx_simd_double_t x)
2122 {
2123     gmx_simd_double_t s, c;
2124     gmx_simd_sincos_d(x, &s, &c);
2125     return s;
2126 }
2127
2128 /*! \brief SIMD double cos(x).
2129  *
2130  * \copydetails gmx_simd_cos_f
2131  */
2132 static gmx_inline gmx_simd_double_t gmx_simdcall
2133 gmx_simd_cos_d(gmx_simd_double_t x)
2134 {
2135     gmx_simd_double_t s, c;
2136     gmx_simd_sincos_d(x, &s, &c);
2137     return c;
2138 }
2139
2140 /*! \brief SIMD double tan(x).
2141  *
2142  * \copydetails gmx_simd_tan_f
2143  */
2144 static gmx_inline gmx_simd_double_t gmx_simdcall
2145 gmx_simd_tan_d(gmx_simd_double_t x)
2146 {
2147     const gmx_simd_double_t  argred0         = gmx_simd_set1_d(2*0.78539816290140151978);
2148     const gmx_simd_double_t  argred1         = gmx_simd_set1_d(2*4.9604678871439933374e-10);
2149     const gmx_simd_double_t  argred2         = gmx_simd_set1_d(2*1.1258708853173288931e-18);
2150     const gmx_simd_double_t  argred3         = gmx_simd_set1_d(2*1.7607799325916000908e-27);
2151     const gmx_simd_double_t  two_over_pi     = gmx_simd_set1_d(2.0/M_PI);
2152     const gmx_simd_double_t  CT15            = gmx_simd_set1_d(1.01419718511083373224408e-05);
2153     const gmx_simd_double_t  CT14            = gmx_simd_set1_d(-2.59519791585924697698614e-05);
2154     const gmx_simd_double_t  CT13            = gmx_simd_set1_d(5.23388081915899855325186e-05);
2155     const gmx_simd_double_t  CT12            = gmx_simd_set1_d(-3.05033014433946488225616e-05);
2156     const gmx_simd_double_t  CT11            = gmx_simd_set1_d(7.14707504084242744267497e-05);
2157     const gmx_simd_double_t  CT10            = gmx_simd_set1_d(8.09674518280159187045078e-05);
2158     const gmx_simd_double_t  CT9             = gmx_simd_set1_d(0.000244884931879331847054404);
2159     const gmx_simd_double_t  CT8             = gmx_simd_set1_d(0.000588505168743587154904506);
2160     const gmx_simd_double_t  CT7             = gmx_simd_set1_d(0.00145612788922812427978848);
2161     const gmx_simd_double_t  CT6             = gmx_simd_set1_d(0.00359208743836906619142924);
2162     const gmx_simd_double_t  CT5             = gmx_simd_set1_d(0.00886323944362401618113356);
2163     const gmx_simd_double_t  CT4             = gmx_simd_set1_d(0.0218694882853846389592078);
2164     const gmx_simd_double_t  CT3             = gmx_simd_set1_d(0.0539682539781298417636002);
2165     const gmx_simd_double_t  CT2             = gmx_simd_set1_d(0.133333333333125941821962);
2166     const gmx_simd_double_t  CT1             = gmx_simd_set1_d(0.333333333333334980164153);
2167
2168     gmx_simd_double_t        x2, p, y, z;
2169     gmx_simd_dbool_t         mask;
2170
2171 #if (defined GMX_SIMD_HAVE_DINT32) && (defined GMX_SIMD_HAVE_DINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
2172     gmx_simd_dint32_t  iy;
2173     gmx_simd_dint32_t  ione = gmx_simd_set1_di(1);
2174
2175     z       = gmx_simd_mul_d(x, two_over_pi);
2176     iy      = gmx_simd_cvt_d2i(z);
2177     y       = gmx_simd_round_d(z);
2178     mask    = gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, ione), ione));
2179
2180     x       = gmx_simd_fnmadd_d(y, argred0, x);
2181     x       = gmx_simd_fnmadd_d(y, argred1, x);
2182     x       = gmx_simd_fnmadd_d(y, argred2, x);
2183     x       = gmx_simd_fnmadd_d(y, argred3, x);
2184     x       = gmx_simd_xor_d(gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), mask), x);
2185 #else
2186     const gmx_simd_double_t  quarter         = gmx_simd_set1_d(0.25);
2187     const gmx_simd_double_t  half            = gmx_simd_set1_d(0.5);
2188     const gmx_simd_double_t  threequarter    = gmx_simd_set1_d(0.75);
2189     gmx_simd_double_t        w, q;
2190     gmx_simd_dbool_t         m1, m2, m3;
2191
2192     w       = gmx_simd_fabs_d(x);
2193     z       = gmx_simd_fmadd_d(w, two_over_pi, half);
2194     y       = gmx_simd_trunc_d(z);
2195     q       = gmx_simd_mul_d(z, quarter);
2196     q       = gmx_simd_sub_d(q, gmx_simd_trunc_d(q));
2197     m1      = gmx_simd_cmple_d(quarter, q);
2198     m2      = gmx_simd_cmplt_d(q, half);
2199     m3      = gmx_simd_cmple_d(threequarter, q);
2200     m1      = gmx_simd_and_db(m1, m2);
2201     mask    = gmx_simd_or_db(m1, m3);
2202     w       = gmx_simd_fnmadd_d(y, argred0, w);
2203     w       = gmx_simd_fnmadd_d(y, argred1, w);
2204     w       = gmx_simd_fnmadd_d(y, argred2, w);
2205     w       = gmx_simd_fnmadd_d(y, argred3, w);
2206
2207     w       = gmx_simd_blendv_d(w, gmx_simd_fneg_d(w), mask);
2208     x       = gmx_simd_xor_sign_d(w, x);
2209 #endif
2210     x2      = gmx_simd_mul_d(x, x);
2211     p       = gmx_simd_fmadd_d(CT15, x2, CT14);
2212     p       = gmx_simd_fmadd_d(p, x2, CT13);
2213     p       = gmx_simd_fmadd_d(p, x2, CT12);
2214     p       = gmx_simd_fmadd_d(p, x2, CT11);
2215     p       = gmx_simd_fmadd_d(p, x2, CT10);
2216     p       = gmx_simd_fmadd_d(p, x2, CT9);
2217     p       = gmx_simd_fmadd_d(p, x2, CT8);
2218     p       = gmx_simd_fmadd_d(p, x2, CT7);
2219     p       = gmx_simd_fmadd_d(p, x2, CT6);
2220     p       = gmx_simd_fmadd_d(p, x2, CT5);
2221     p       = gmx_simd_fmadd_d(p, x2, CT4);
2222     p       = gmx_simd_fmadd_d(p, x2, CT3);
2223     p       = gmx_simd_fmadd_d(p, x2, CT2);
2224     p       = gmx_simd_fmadd_d(p, x2, CT1);
2225     p       = gmx_simd_fmadd_d(x2, gmx_simd_mul_d(p, x), x);
2226
2227     p       = gmx_simd_blendv_d( p, gmx_simd_inv_d(p), mask);
2228     return p;
2229 }
2230
2231 /*! \brief SIMD double asin(x).
2232  *
2233  * \copydetails gmx_simd_asin_f
2234  */
2235 static gmx_inline gmx_simd_double_t gmx_simdcall
2236 gmx_simd_asin_d(gmx_simd_double_t x)
2237 {
2238     /* Same algorithm as cephes library */
2239     const gmx_simd_double_t limit1    = gmx_simd_set1_d(0.625);
2240     const gmx_simd_double_t limit2    = gmx_simd_set1_d(1e-8);
2241     const gmx_simd_double_t one       = gmx_simd_set1_d(1.0);
2242     const gmx_simd_double_t quarterpi = gmx_simd_set1_d(M_PI/4.0);
2243     const gmx_simd_double_t morebits  = gmx_simd_set1_d(6.123233995736765886130e-17);
2244
2245     const gmx_simd_double_t P5        = gmx_simd_set1_d(4.253011369004428248960e-3);
2246     const gmx_simd_double_t P4        = gmx_simd_set1_d(-6.019598008014123785661e-1);
2247     const gmx_simd_double_t P3        = gmx_simd_set1_d(5.444622390564711410273e0);
2248     const gmx_simd_double_t P2        = gmx_simd_set1_d(-1.626247967210700244449e1);
2249     const gmx_simd_double_t P1        = gmx_simd_set1_d(1.956261983317594739197e1);
2250     const gmx_simd_double_t P0        = gmx_simd_set1_d(-8.198089802484824371615e0);
2251
2252     const gmx_simd_double_t Q4        = gmx_simd_set1_d(-1.474091372988853791896e1);
2253     const gmx_simd_double_t Q3        = gmx_simd_set1_d(7.049610280856842141659e1);
2254     const gmx_simd_double_t Q2        = gmx_simd_set1_d(-1.471791292232726029859e2);
2255     const gmx_simd_double_t Q1        = gmx_simd_set1_d(1.395105614657485689735e2);
2256     const gmx_simd_double_t Q0        = gmx_simd_set1_d(-4.918853881490881290097e1);
2257
2258     const gmx_simd_double_t R4        = gmx_simd_set1_d(2.967721961301243206100e-3);
2259     const gmx_simd_double_t R3        = gmx_simd_set1_d(-5.634242780008963776856e-1);
2260     const gmx_simd_double_t R2        = gmx_simd_set1_d(6.968710824104713396794e0);
2261     const gmx_simd_double_t R1        = gmx_simd_set1_d(-2.556901049652824852289e1);
2262     const gmx_simd_double_t R0        = gmx_simd_set1_d(2.853665548261061424989e1);
2263
2264     const gmx_simd_double_t S3        = gmx_simd_set1_d(-2.194779531642920639778e1);
2265     const gmx_simd_double_t S2        = gmx_simd_set1_d(1.470656354026814941758e2);
2266     const gmx_simd_double_t S1        = gmx_simd_set1_d(-3.838770957603691357202e2);
2267     const gmx_simd_double_t S0        = gmx_simd_set1_d(3.424398657913078477438e2);
2268
2269     gmx_simd_double_t       xabs;
2270     gmx_simd_double_t       zz, ww, z, q, w, zz2, ww2;
2271     gmx_simd_double_t       PA, PB;
2272     gmx_simd_double_t       QA, QB;
2273     gmx_simd_double_t       RA, RB;
2274     gmx_simd_double_t       SA, SB;
2275     gmx_simd_double_t       nom, denom;
2276     gmx_simd_dbool_t        mask;
2277
2278     xabs  = gmx_simd_fabs_d(x);
2279
2280     mask  = gmx_simd_cmplt_d(limit1, xabs);
2281
2282     zz    = gmx_simd_sub_d(one, xabs);
2283     ww    = gmx_simd_mul_d(xabs, xabs);
2284     zz2   = gmx_simd_mul_d(zz, zz);
2285     ww2   = gmx_simd_mul_d(ww, ww);
2286
2287     /* R */
2288     RA    = gmx_simd_mul_d(R4, zz2);
2289     RB    = gmx_simd_mul_d(R3, zz2);
2290     RA    = gmx_simd_add_d(RA, R2);
2291     RB    = gmx_simd_add_d(RB, R1);
2292     RA    = gmx_simd_mul_d(RA, zz2);
2293     RB    = gmx_simd_mul_d(RB, zz);
2294     RA    = gmx_simd_add_d(RA, R0);
2295     RA    = gmx_simd_add_d(RA, RB);
2296
2297     /* S, SA = zz2 */
2298     SB    = gmx_simd_mul_d(S3, zz2);
2299     SA    = gmx_simd_add_d(zz2, S2);
2300     SB    = gmx_simd_add_d(SB, S1);
2301     SA    = gmx_simd_mul_d(SA, zz2);
2302     SB    = gmx_simd_mul_d(SB, zz);
2303     SA    = gmx_simd_add_d(SA, S0);
2304     SA    = gmx_simd_add_d(SA, SB);
2305
2306     /* P */
2307     PA    = gmx_simd_mul_d(P5, ww2);
2308     PB    = gmx_simd_mul_d(P4, ww2);
2309     PA    = gmx_simd_add_d(PA, P3);
2310     PB    = gmx_simd_add_d(PB, P2);
2311     PA    = gmx_simd_mul_d(PA, ww2);
2312     PB    = gmx_simd_mul_d(PB, ww2);
2313     PA    = gmx_simd_add_d(PA, P1);
2314     PB    = gmx_simd_add_d(PB, P0);
2315     PA    = gmx_simd_mul_d(PA, ww);
2316     PA    = gmx_simd_add_d(PA, PB);
2317
2318     /* Q, QA = ww2 */
2319     QB    = gmx_simd_mul_d(Q4, ww2);
2320     QA    = gmx_simd_add_d(ww2, Q3);
2321     QB    = gmx_simd_add_d(QB, Q2);
2322     QA    = gmx_simd_mul_d(QA, ww2);
2323     QB    = gmx_simd_mul_d(QB, ww2);
2324     QA    = gmx_simd_add_d(QA, Q1);
2325     QB    = gmx_simd_add_d(QB, Q0);
2326     QA    = gmx_simd_mul_d(QA, ww);
2327     QA    = gmx_simd_add_d(QA, QB);
2328
2329     RA    = gmx_simd_mul_d(RA, zz);
2330     PA    = gmx_simd_mul_d(PA, ww);
2331
2332     nom   = gmx_simd_blendv_d( PA, RA, mask );
2333     denom = gmx_simd_blendv_d( QA, SA, mask );
2334
2335     q     = gmx_simd_mul_d( nom, gmx_simd_inv_d(denom) );
2336
2337     zz    = gmx_simd_add_d(zz, zz);
2338     zz    = gmx_simd_sqrt_d(zz);
2339     z     = gmx_simd_sub_d(quarterpi, zz);
2340     zz    = gmx_simd_mul_d(zz, q);
2341     zz    = gmx_simd_sub_d(zz, morebits);
2342     z     = gmx_simd_sub_d(z, zz);
2343     z     = gmx_simd_add_d(z, quarterpi);
2344
2345     w     = gmx_simd_mul_d(xabs, q);
2346     w     = gmx_simd_add_d(w, xabs);
2347
2348     z     = gmx_simd_blendv_d( w, z, mask );
2349
2350     mask  = gmx_simd_cmplt_d(limit2, xabs);
2351     z     = gmx_simd_blendv_d( xabs, z, mask );
2352
2353     z = gmx_simd_xor_sign_d(z, x);
2354
2355     return z;
2356 }
2357
2358 /*! \brief SIMD double acos(x).
2359  *
2360  * \copydetails gmx_simd_acos_f
2361  */
2362 static gmx_inline gmx_simd_double_t gmx_simdcall
2363 gmx_simd_acos_d(gmx_simd_double_t x)
2364 {
2365     const gmx_simd_double_t one        = gmx_simd_set1_d(1.0);
2366     const gmx_simd_double_t half       = gmx_simd_set1_d(0.5);
2367     const gmx_simd_double_t quarterpi0 = gmx_simd_set1_d(7.85398163397448309616e-1);
2368     const gmx_simd_double_t quarterpi1 = gmx_simd_set1_d(6.123233995736765886130e-17);
2369
2370     gmx_simd_dbool_t        mask1;
2371     gmx_simd_double_t       z, z1, z2;
2372
2373     mask1 = gmx_simd_cmplt_d(half, x);
2374     z1    = gmx_simd_mul_d(half, gmx_simd_sub_d(one, x));
2375     z1    = gmx_simd_sqrt_d(z1);
2376     z     = gmx_simd_blendv_d( x, z1, mask1 );
2377
2378     z     = gmx_simd_asin_d(z);
2379
2380     z1    = gmx_simd_add_d(z, z);
2381
2382     z2    = gmx_simd_sub_d(quarterpi0, z);
2383     z2    = gmx_simd_add_d(z2, quarterpi1);
2384     z2    = gmx_simd_add_d(z2, quarterpi0);
2385
2386     z     = gmx_simd_blendv_d(z2, z1, mask1);
2387
2388     return z;
2389 }
2390
2391 /*! \brief SIMD double atan(x).
2392  *
2393  * \copydetails gmx_simd_atan_f
2394  */
2395 static gmx_inline gmx_simd_double_t gmx_simdcall
2396 gmx_simd_atan_d(gmx_simd_double_t x)
2397 {
2398     /* Same algorithm as cephes library */
2399     const gmx_simd_double_t limit1    = gmx_simd_set1_d(0.66);
2400     const gmx_simd_double_t limit2    = gmx_simd_set1_d(2.41421356237309504880);
2401     const gmx_simd_double_t quarterpi = gmx_simd_set1_d(M_PI/4.0);
2402     const gmx_simd_double_t halfpi    = gmx_simd_set1_d(M_PI/2.0);
2403     const gmx_simd_double_t mone      = gmx_simd_set1_d(-1.0);
2404     const gmx_simd_double_t morebits1 = gmx_simd_set1_d(0.5*6.123233995736765886130E-17);
2405     const gmx_simd_double_t morebits2 = gmx_simd_set1_d(6.123233995736765886130E-17);
2406
2407     const gmx_simd_double_t P4        = gmx_simd_set1_d(-8.750608600031904122785E-1);
2408     const gmx_simd_double_t P3        = gmx_simd_set1_d(-1.615753718733365076637E1);
2409     const gmx_simd_double_t P2        = gmx_simd_set1_d(-7.500855792314704667340E1);
2410     const gmx_simd_double_t P1        = gmx_simd_set1_d(-1.228866684490136173410E2);
2411     const gmx_simd_double_t P0        = gmx_simd_set1_d(-6.485021904942025371773E1);
2412
2413     const gmx_simd_double_t Q4        = gmx_simd_set1_d(2.485846490142306297962E1);
2414     const gmx_simd_double_t Q3        = gmx_simd_set1_d(1.650270098316988542046E2);
2415     const gmx_simd_double_t Q2        = gmx_simd_set1_d(4.328810604912902668951E2);
2416     const gmx_simd_double_t Q1        = gmx_simd_set1_d(4.853903996359136964868E2);
2417     const gmx_simd_double_t Q0        = gmx_simd_set1_d(1.945506571482613964425E2);
2418
2419     gmx_simd_double_t       y, xabs, t1, t2;
2420     gmx_simd_double_t       z, z2;
2421     gmx_simd_double_t       P_A, P_B, Q_A, Q_B;
2422     gmx_simd_dbool_t        mask1, mask2;
2423
2424     xabs   = gmx_simd_fabs_d(x);
2425
2426     mask1  = gmx_simd_cmplt_d(limit1, xabs);
2427     mask2  = gmx_simd_cmplt_d(limit2, xabs);
2428
2429     t1     = gmx_simd_mul_d(gmx_simd_add_d(xabs, mone), gmx_simd_inv_d(gmx_simd_sub_d(xabs, mone)));
2430     t2     = gmx_simd_mul_d(mone, gmx_simd_inv_d(xabs));
2431
2432     y      = gmx_simd_blendzero_d(quarterpi, mask1);
2433     y      = gmx_simd_blendv_d(y, halfpi, mask2);
2434     xabs   = gmx_simd_blendv_d(xabs, t1, mask1);
2435     xabs   = gmx_simd_blendv_d(xabs, t2, mask2);
2436
2437     z      = gmx_simd_mul_d(xabs, xabs);
2438     z2     = gmx_simd_mul_d(z, z);
2439
2440     P_A    = gmx_simd_mul_d(P4, z2);
2441     P_B    = gmx_simd_mul_d(P3, z2);
2442     P_A    = gmx_simd_add_d(P_A, P2);
2443     P_B    = gmx_simd_add_d(P_B, P1);
2444     P_A    = gmx_simd_mul_d(P_A, z2);
2445     P_B    = gmx_simd_mul_d(P_B, z);
2446     P_A    = gmx_simd_add_d(P_A, P0);
2447     P_A    = gmx_simd_add_d(P_A, P_B);
2448
2449     /* Q_A = z2 */
2450     Q_B    = gmx_simd_mul_d(Q4, z2);
2451     Q_A    = gmx_simd_add_d(z2, Q3);
2452     Q_B    = gmx_simd_add_d(Q_B, Q2);
2453     Q_A    = gmx_simd_mul_d(Q_A, z2);
2454     Q_B    = gmx_simd_mul_d(Q_B, z2);
2455     Q_A    = gmx_simd_add_d(Q_A, Q1);
2456     Q_B    = gmx_simd_add_d(Q_B, Q0);
2457     Q_A    = gmx_simd_mul_d(Q_A, z);
2458     Q_A    = gmx_simd_add_d(Q_A, Q_B);
2459
2460     z      = gmx_simd_mul_d(z, P_A);
2461     z      = gmx_simd_mul_d(z, gmx_simd_inv_d(Q_A));
2462     z      = gmx_simd_mul_d(z, xabs);
2463     z      = gmx_simd_add_d(z, xabs);
2464
2465     t1     = gmx_simd_blendzero_d(morebits1, mask1);
2466     t1     = gmx_simd_blendv_d(t1, morebits2, mask2);
2467
2468     z      = gmx_simd_add_d(z, t1);
2469     y      = gmx_simd_add_d(y, z);
2470
2471     y      = gmx_simd_xor_sign_d(y, x);
2472
2473     return y;
2474 }
2475
2476 /*! \brief SIMD double atan2(y,x).
2477  *
2478  * \copydetails gmx_simd_atan2_f
2479  */
2480 static gmx_inline gmx_simd_double_t gmx_simdcall
2481 gmx_simd_atan2_d(gmx_simd_double_t y, gmx_simd_double_t x)
2482 {
2483     const gmx_simd_double_t pi          = gmx_simd_set1_d(M_PI);
2484     const gmx_simd_double_t halfpi      = gmx_simd_set1_d(M_PI/2.0);
2485     gmx_simd_double_t       xinv, p, aoffset;
2486     gmx_simd_dbool_t        mask_x0, mask_y0, mask_xlt0, mask_ylt0;
2487
2488     mask_x0   = gmx_simd_cmpeq_d(x, gmx_simd_setzero_d());
2489     mask_y0   = gmx_simd_cmpeq_d(y, gmx_simd_setzero_d());
2490     mask_xlt0 = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
2491     mask_ylt0 = gmx_simd_cmplt_d(y, gmx_simd_setzero_d());
2492
2493     aoffset   = gmx_simd_blendzero_d(halfpi, mask_x0);
2494     aoffset   = gmx_simd_blendnotzero_d(aoffset, mask_y0);
2495
2496     aoffset   = gmx_simd_blendv_d(aoffset, pi, mask_xlt0);
2497     aoffset   = gmx_simd_blendv_d(aoffset, gmx_simd_fneg_d(aoffset), mask_ylt0);
2498
2499     xinv      = gmx_simd_blendnotzero_d(gmx_simd_inv_d(x), mask_x0);
2500     p         = gmx_simd_mul_d(y, xinv);
2501     p         = gmx_simd_atan_d(p);
2502     p         = gmx_simd_add_d(p, aoffset);
2503
2504     return p;
2505 }
2506
2507
2508 /*! \brief Calculate the force correction due to PME analytically for SIMD double.
2509  *
2510  * \copydetails gmx_simd_pmecorrF_f
2511  */
2512 static gmx_inline gmx_simd_double_t gmx_simdcall
2513 gmx_simd_pmecorrF_d(gmx_simd_double_t z2)
2514 {
2515     const gmx_simd_double_t  FN10     = gmx_simd_set1_d(-8.0072854618360083154e-14);
2516     const gmx_simd_double_t  FN9      = gmx_simd_set1_d(1.1859116242260148027e-11);
2517     const gmx_simd_double_t  FN8      = gmx_simd_set1_d(-8.1490406329798423616e-10);
2518     const gmx_simd_double_t  FN7      = gmx_simd_set1_d(3.4404793543907847655e-8);
2519     const gmx_simd_double_t  FN6      = gmx_simd_set1_d(-9.9471420832602741006e-7);
2520     const gmx_simd_double_t  FN5      = gmx_simd_set1_d(0.000020740315999115847456);
2521     const gmx_simd_double_t  FN4      = gmx_simd_set1_d(-0.00031991745139313364005);
2522     const gmx_simd_double_t  FN3      = gmx_simd_set1_d(0.0035074449373659008203);
2523     const gmx_simd_double_t  FN2      = gmx_simd_set1_d(-0.031750380176100813405);
2524     const gmx_simd_double_t  FN1      = gmx_simd_set1_d(0.13884101728898463426);
2525     const gmx_simd_double_t  FN0      = gmx_simd_set1_d(-0.75225277815249618847);
2526
2527     const gmx_simd_double_t  FD5      = gmx_simd_set1_d(0.000016009278224355026701);
2528     const gmx_simd_double_t  FD4      = gmx_simd_set1_d(0.00051055686934806966046);
2529     const gmx_simd_double_t  FD3      = gmx_simd_set1_d(0.0081803507497974289008);
2530     const gmx_simd_double_t  FD2      = gmx_simd_set1_d(0.077181146026670287235);
2531     const gmx_simd_double_t  FD1      = gmx_simd_set1_d(0.41543303143712535988);
2532     const gmx_simd_double_t  FD0      = gmx_simd_set1_d(1.0);
2533
2534     gmx_simd_double_t        z4;
2535     gmx_simd_double_t        polyFN0, polyFN1, polyFD0, polyFD1;
2536
2537     z4             = gmx_simd_mul_d(z2, z2);
2538
2539     polyFD1        = gmx_simd_fmadd_d(FD5, z4, FD3);
2540     polyFD1        = gmx_simd_fmadd_d(polyFD1, z4, FD1);
2541     polyFD1        = gmx_simd_mul_d(polyFD1, z2);
2542     polyFD0        = gmx_simd_fmadd_d(FD4, z4, FD2);
2543     polyFD0        = gmx_simd_fmadd_d(polyFD0, z4, FD0);
2544     polyFD0        = gmx_simd_add_d(polyFD0, polyFD1);
2545
2546     polyFD0        = gmx_simd_inv_d(polyFD0);
2547
2548     polyFN0        = gmx_simd_fmadd_d(FN10, z4, FN8);
2549     polyFN0        = gmx_simd_fmadd_d(polyFN0, z4, FN6);
2550     polyFN0        = gmx_simd_fmadd_d(polyFN0, z4, FN4);
2551     polyFN0        = gmx_simd_fmadd_d(polyFN0, z4, FN2);
2552     polyFN0        = gmx_simd_fmadd_d(polyFN0, z4, FN0);
2553     polyFN1        = gmx_simd_fmadd_d(FN9, z4, FN7);
2554     polyFN1        = gmx_simd_fmadd_d(polyFN1, z4, FN5);
2555     polyFN1        = gmx_simd_fmadd_d(polyFN1, z4, FN3);
2556     polyFN1        = gmx_simd_fmadd_d(polyFN1, z4, FN1);
2557     polyFN0        = gmx_simd_fmadd_d(polyFN1, z2, polyFN0);
2558
2559
2560     return gmx_simd_mul_d(polyFN0, polyFD0);
2561 }
2562
2563
2564
2565 /*! \brief Calculate the potential correction due to PME analytically for SIMD double.
2566  *
2567  * \copydetails gmx_simd_pmecorrV_f
2568  */
2569 static gmx_inline gmx_simd_double_t gmx_simdcall
2570 gmx_simd_pmecorrV_d(gmx_simd_double_t z2)
2571 {
2572     const gmx_simd_double_t  VN9      = gmx_simd_set1_d(-9.3723776169321855475e-13);
2573     const gmx_simd_double_t  VN8      = gmx_simd_set1_d(1.2280156762674215741e-10);
2574     const gmx_simd_double_t  VN7      = gmx_simd_set1_d(-7.3562157912251309487e-9);
2575     const gmx_simd_double_t  VN6      = gmx_simd_set1_d(2.6215886208032517509e-7);
2576     const gmx_simd_double_t  VN5      = gmx_simd_set1_d(-4.9532491651265819499e-6);
2577     const gmx_simd_double_t  VN4      = gmx_simd_set1_d(0.00025907400778966060389);
2578     const gmx_simd_double_t  VN3      = gmx_simd_set1_d(0.0010585044856156469792);
2579     const gmx_simd_double_t  VN2      = gmx_simd_set1_d(0.045247661136833092885);
2580     const gmx_simd_double_t  VN1      = gmx_simd_set1_d(0.11643931522926034421);
2581     const gmx_simd_double_t  VN0      = gmx_simd_set1_d(1.1283791671726767970);
2582
2583     const gmx_simd_double_t  VD5      = gmx_simd_set1_d(0.000021784709867336150342);
2584     const gmx_simd_double_t  VD4      = gmx_simd_set1_d(0.00064293662010911388448);
2585     const gmx_simd_double_t  VD3      = gmx_simd_set1_d(0.0096311444822588683504);
2586     const gmx_simd_double_t  VD2      = gmx_simd_set1_d(0.085608012351550627051);
2587     const gmx_simd_double_t  VD1      = gmx_simd_set1_d(0.43652499166614811084);
2588     const gmx_simd_double_t  VD0      = gmx_simd_set1_d(1.0);
2589
2590     gmx_simd_double_t        z4;
2591     gmx_simd_double_t        polyVN0, polyVN1, polyVD0, polyVD1;
2592
2593     z4             = gmx_simd_mul_d(z2, z2);
2594
2595     polyVD1        = gmx_simd_fmadd_d(VD5, z4, VD3);
2596     polyVD0        = gmx_simd_fmadd_d(VD4, z4, VD2);
2597     polyVD1        = gmx_simd_fmadd_d(polyVD1, z4, VD1);
2598     polyVD0        = gmx_simd_fmadd_d(polyVD0, z4, VD0);
2599     polyVD0        = gmx_simd_fmadd_d(polyVD1, z2, polyVD0);
2600
2601     polyVD0        = gmx_simd_inv_d(polyVD0);
2602
2603     polyVN1        = gmx_simd_fmadd_d(VN9, z4, VN7);
2604     polyVN0        = gmx_simd_fmadd_d(VN8, z4, VN6);
2605     polyVN1        = gmx_simd_fmadd_d(polyVN1, z4, VN5);
2606     polyVN0        = gmx_simd_fmadd_d(polyVN0, z4, VN4);
2607     polyVN1        = gmx_simd_fmadd_d(polyVN1, z4, VN3);
2608     polyVN0        = gmx_simd_fmadd_d(polyVN0, z4, VN2);
2609     polyVN1        = gmx_simd_fmadd_d(polyVN1, z4, VN1);
2610     polyVN0        = gmx_simd_fmadd_d(polyVN0, z4, VN0);
2611     polyVN0        = gmx_simd_fmadd_d(polyVN1, z2, polyVN0);
2612
2613     return gmx_simd_mul_d(polyVN0, polyVD0);
2614 }
2615
2616 /*! \} */
2617
2618 #endif
2619
2620
2621 /*! \name SIMD4 math functions
2622  *
2623  * \note Only a subset of the math functions are implemented for SIMD4.
2624  *  \{
2625  */
2626
2627
2628 #ifdef GMX_SIMD4_HAVE_FLOAT
2629
2630 /*************************************************************************
2631  * SINGLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
2632  *************************************************************************/
2633
2634 /*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 floats.
2635  *
2636  * \copydetails gmx_simd_sum4_f
2637  */
2638 static gmx_inline gmx_simd4_float_t gmx_simdcall
2639 gmx_simd4_sum4_f(gmx_simd4_float_t a, gmx_simd4_float_t b,
2640                  gmx_simd4_float_t c, gmx_simd4_float_t d)
2641 {
2642     return gmx_simd4_add_f(gmx_simd4_add_f(a, b), gmx_simd4_add_f(c, d));
2643 }
2644
2645 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 float.
2646  *
2647  * \copydetails gmx_simd_rsqrt_iter_f
2648  */
2649 static gmx_inline gmx_simd4_float_t gmx_simdcall
2650 gmx_simd4_rsqrt_iter_f(gmx_simd4_float_t lu, gmx_simd4_float_t x)
2651 {
2652 #    ifdef GMX_SIMD_HAVE_FMA
2653     return gmx_simd4_fmadd_f(gmx_simd4_fnmadd_f(x, gmx_simd4_mul_f(lu, lu), gmx_simd4_set1_f(1.0f)), gmx_simd4_mul_f(lu, gmx_simd4_set1_f(0.5f)), lu);
2654 #    else
2655     return gmx_simd4_mul_f(gmx_simd4_set1_f(0.5f), gmx_simd4_mul_f(gmx_simd4_sub_f(gmx_simd4_set1_f(3.0f), gmx_simd4_mul_f(gmx_simd4_mul_f(lu, lu), x)), lu));
2656 #    endif
2657 }
2658
2659 /*! \brief Calculate 1/sqrt(x) for SIMD4 float.
2660  *
2661  * \copydetails gmx_simd_invsqrt_f
2662  */
2663 static gmx_inline gmx_simd4_float_t gmx_simdcall
2664 gmx_simd4_invsqrt_f(gmx_simd4_float_t x)
2665 {
2666     gmx_simd4_float_t lu = gmx_simd4_rsqrt_f(x);
2667 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
2668     lu = gmx_simd4_rsqrt_iter_f(lu, x);
2669 #endif
2670 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
2671     lu = gmx_simd4_rsqrt_iter_f(lu, x);
2672 #endif
2673 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
2674     lu = gmx_simd4_rsqrt_iter_f(lu, x);
2675 #endif
2676     return lu;
2677 }
2678
2679 #endif /* GMX_SIMD4_HAVE_FLOAT */
2680
2681
2682
2683 #ifdef GMX_SIMD4_HAVE_DOUBLE
2684 /*************************************************************************
2685  * DOUBLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
2686  *************************************************************************/
2687
2688
2689 /*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 doubles.
2690  *
2691  * \copydetails gmx_simd_sum4_f
2692  */
2693 static gmx_inline gmx_simd4_double_t gmx_simdcall
2694 gmx_simd4_sum4_d(gmx_simd4_double_t a, gmx_simd4_double_t b,
2695                  gmx_simd4_double_t c, gmx_simd4_double_t d)
2696 {
2697     return gmx_simd4_add_d(gmx_simd4_add_d(a, b), gmx_simd4_add_d(c, d));
2698 }
2699
2700 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 double.
2701  *
2702  * \copydetails gmx_simd_rsqrt_iter_f
2703  */
2704 static gmx_inline gmx_simd4_double_t gmx_simdcall
2705 gmx_simd4_rsqrt_iter_d(gmx_simd4_double_t lu, gmx_simd4_double_t x)
2706 {
2707 #ifdef GMX_SIMD_HAVE_FMA
2708     return gmx_simd4_fmadd_d(gmx_simd4_fnmadd_d(x, gmx_simd4_mul_d(lu, lu), gmx_simd4_set1_d(1.0)), gmx_simd4_mul_d(lu, gmx_simd4_set1_d(0.5)), lu);
2709 #else
2710     return gmx_simd4_mul_d(gmx_simd4_set1_d(0.5), gmx_simd4_mul_d(gmx_simd4_sub_d(gmx_simd4_set1_d(3.0), gmx_simd4_mul_d(gmx_simd4_mul_d(lu, lu), x)), lu));
2711 #endif
2712 }
2713
2714 /*! \brief Calculate 1/sqrt(x) for SIMD4 double.
2715  *
2716  * \copydetails gmx_simd_invsqrt_f
2717  */
2718 static gmx_inline gmx_simd4_double_t gmx_simdcall
2719 gmx_simd4_invsqrt_d(gmx_simd4_double_t x)
2720 {
2721     gmx_simd4_double_t lu = gmx_simd4_rsqrt_d(x);
2722 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
2723     lu = gmx_simd4_rsqrt_iter_d(lu, x);
2724 #endif
2725 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
2726     lu = gmx_simd4_rsqrt_iter_d(lu, x);
2727 #endif
2728 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
2729     lu = gmx_simd4_rsqrt_iter_d(lu, x);
2730 #endif
2731 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
2732     lu = gmx_simd4_rsqrt_iter_d(lu, x);
2733 #endif
2734     return lu;
2735 }
2736 #endif /* GMX_SIMD4_HAVE_DOUBLE */
2737
2738 /*! \} */
2739
2740
/* Select the real-precision ("_r") aliases from the default Gromacs
 * precision: each one resolves to the "_f" routine unless GMX_DOUBLE is
 * defined, in which case it resolves to the "_d" routine instead.
 */
#ifndef GMX_DOUBLE

/*! \name Real-precision SIMD math functions
 *
 *  These are the ones you should typically call in Gromacs.
 * \{
 */

/*! \brief Sum a+b+c+d for SIMD reals.
 *
 * \copydetails gmx_simd_sum4_f
 */
#    define gmx_simd_sum4_r           gmx_simd_sum4_f

/*! \brief SIMD real: return -a if b is negative.
 *
 * \copydetails gmx_simd_xor_sign_f
 */
#    define gmx_simd_xor_sign_r       gmx_simd_xor_sign_f

/*! \brief SIMD real 1/sqrt(x).
 *
 * \copydetails gmx_simd_invsqrt_f
 */
#    define gmx_simd_invsqrt_r        gmx_simd_invsqrt_f

/*! \brief 1/sqrt(x) for two SIMD reals.
 *
 * \copydetails gmx_simd_invsqrt_pair_f
 */
#    define gmx_simd_invsqrt_pair_r   gmx_simd_invsqrt_pair_f

/*! \brief SIMD real sqrt(x), correct also for argument 0.0.
 *
 * \copydetails gmx_simd_sqrt_f
 */
#    define gmx_simd_sqrt_r           gmx_simd_sqrt_f

/*! \brief SIMD real 1/x.
 *
 * \copydetails gmx_simd_inv_f
 */
#    define gmx_simd_inv_r            gmx_simd_inv_f

/*! \brief SIMD real natural logarithm, log(x).
 *
 * \copydetails gmx_simd_log_f
 */
#    define gmx_simd_log_r            gmx_simd_log_f

/*! \brief SIMD real base-2 exponential, 2^x.
 *
 * \copydetails gmx_simd_exp2_f
 */
#    define gmx_simd_exp2_r           gmx_simd_exp2_f

/*! \brief SIMD real exponential, e^x.
 *
 * \copydetails gmx_simd_exp_f
 */
#    define gmx_simd_exp_r            gmx_simd_exp_f

/*! \brief SIMD real error function, erf(x).
 *
 * \copydetails gmx_simd_erf_f
 */
#    define gmx_simd_erf_r            gmx_simd_erf_f

/*! \brief SIMD real complementary error function, erfc(x).
 *
 * \copydetails gmx_simd_erfc_f
 */
#    define gmx_simd_erfc_r           gmx_simd_erfc_f

/*! \brief SIMD real sin \& cos in one call.
 *
 * \copydetails gmx_simd_sincos_f
 */
#    define gmx_simd_sincos_r         gmx_simd_sincos_f

/*! \brief SIMD real sine, sin(x).
 *
 * \copydetails gmx_simd_sin_f
 */
#    define gmx_simd_sin_r            gmx_simd_sin_f

/*! \brief SIMD real cosine, cos(x).
 *
 * \copydetails gmx_simd_cos_f
 */
#    define gmx_simd_cos_r            gmx_simd_cos_f

/*! \brief SIMD real tangent, tan(x).
 *
 * \copydetails gmx_simd_tan_f
 */
#    define gmx_simd_tan_r            gmx_simd_tan_f

/*! \brief SIMD real arcsine, asin(x).
 *
 * \copydetails gmx_simd_asin_f
 */
#    define gmx_simd_asin_r           gmx_simd_asin_f

/*! \brief SIMD real arccosine, acos(x).
 *
 * \copydetails gmx_simd_acos_f
 */
#    define gmx_simd_acos_r           gmx_simd_acos_f

/*! \brief SIMD real arctangent, atan(x).
 *
 * \copydetails gmx_simd_atan_f
 */
#    define gmx_simd_atan_r           gmx_simd_atan_f

/*! \brief SIMD real two-argument arctangent, atan2(y,x).
 *
 * \copydetails gmx_simd_atan2_f
 */
#    define gmx_simd_atan2_r          gmx_simd_atan2_f

/*! \brief Analytic PME force correction, SIMD real.
 *
 * \copydetails gmx_simd_pmecorrF_f
 */
#    define gmx_simd_pmecorrF_r       gmx_simd_pmecorrF_f

/*! \brief Analytic PME potential correction, SIMD real.
 *
 * \copydetails gmx_simd_pmecorrV_f
 */
#    define gmx_simd_pmecorrV_r       gmx_simd_pmecorrV_f

/*! \}
 * \name SIMD4 math functions
 * \{
 */

/*! \brief Sum a+b+c+d for SIMD4 reals.
 *
 * \copydetails gmx_simd_sum4_f
 */
#    define gmx_simd4_sum4_r          gmx_simd4_sum4_f

/*! \brief SIMD4 real 1/sqrt(x).
 *
 * \copydetails gmx_simd_invsqrt_f
 */
#    define gmx_simd4_invsqrt_r       gmx_simd4_invsqrt_f

/*! \} */

#else /* GMX_DOUBLE */

/* Double precision: same alias names, documented in the single branch above */
#    define gmx_simd_sum4_r           gmx_simd_sum4_d
#    define gmx_simd_xor_sign_r       gmx_simd_xor_sign_d
#    define gmx_simd_invsqrt_r        gmx_simd_invsqrt_d
#    define gmx_simd_invsqrt_pair_r   gmx_simd_invsqrt_pair_d
#    define gmx_simd_sqrt_r           gmx_simd_sqrt_d
#    define gmx_simd_inv_r            gmx_simd_inv_d
#    define gmx_simd_log_r            gmx_simd_log_d
#    define gmx_simd_exp2_r           gmx_simd_exp2_d
#    define gmx_simd_exp_r            gmx_simd_exp_d
#    define gmx_simd_erf_r            gmx_simd_erf_d
#    define gmx_simd_erfc_r           gmx_simd_erfc_d
#    define gmx_simd_sincos_r         gmx_simd_sincos_d
#    define gmx_simd_sin_r            gmx_simd_sin_d
#    define gmx_simd_cos_r            gmx_simd_cos_d
#    define gmx_simd_tan_r            gmx_simd_tan_d
#    define gmx_simd_asin_r           gmx_simd_asin_d
#    define gmx_simd_acos_r           gmx_simd_acos_d
#    define gmx_simd_atan_r           gmx_simd_atan_d
#    define gmx_simd_atan2_r          gmx_simd_atan2_d
#    define gmx_simd_pmecorrF_r       gmx_simd_pmecorrF_d
#    define gmx_simd_pmecorrV_r       gmx_simd_pmecorrV_d
#    define gmx_simd4_sum4_r          gmx_simd4_sum4_d
#    define gmx_simd4_invsqrt_r       gmx_simd4_invsqrt_d

#endif /* GMX_DOUBLE */
2922
2923 /*! \} */
2924 /*! \endcond */
2925
2926 #endif /* GMX_SIMD_SIMD_MATH_H_ */