2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #ifndef GMX_SIMD_SIMD_MATH_H
36 #define GMX_SIMD_SIMD_MATH_H
38 /*! \libinternal \file
40 * \brief Math functions for SIMD datatypes.
42 * \attention This file is generic for all SIMD architectures, so you cannot
43 * assume that any of the optional SIMD features (as defined in simd.h) are
44 * present. In particular, this means you cannot assume support for integers,
45 * logical operations (neither on floating-point nor integer values), shifts,
46 * and the architecture might only have SIMD for either float or double.
47 * Second, to keep this file clean and general, any additions to this file
48 * must work for all possible SIMD architectures in both single and double
49 * precision (if they support it), and you cannot make any assumptions about
52 * \author Erik Lindahl <erik.lindahl@scilifelab.se>
55 * \ingroup module_simd
62 #include "gromacs/math/utilities.h"
63 #include "gromacs/simd/simd.h"
64 #include "gromacs/utility/basedefinitions.h"
65 #include "gromacs/utility/real.h"
73 /*! \addtogroup module_simd */
76 /*! \name Implementation accuracy settings
82 #if GMX_SIMD_HAVE_FLOAT
84 /*! \name Single precision SIMD math functions
86 * \note In most cases you should use the real-precision functions instead.
90 /****************************************
91 * SINGLE PRECISION SIMD MATH FUNCTIONS *
92 ****************************************/
94 #if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_FLOAT
95 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
97 * \param x Values to set sign for
98 * \param y Values used to set sign
99 * \return Magnitude of x, sign of y
101 static inline SimdFloat gmx_simdcall
102 copysign(SimdFloat x, SimdFloat y)
104 #if GMX_SIMD_HAVE_LOGICAL
105 return abs(x) | ( SimdFloat(GMX_FLOAT_NEGZERO) & y );
107 return blend(abs(x), -abs(x), y < setZero());
112 #if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_FLOAT
113 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD float.
115 * This is a low-level routine that should only be used by SIMD math routine
116 * that evaluates the inverse square root.
118 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
119 * \param x The reference (starting) value x for which we want 1/sqrt(x).
120 * \return An improved approximation with roughly twice as many bits of accuracy.
122 static inline SimdFloat gmx_simdcall
123 rsqrtIter(SimdFloat lu, SimdFloat x)
125 SimdFloat tmp1 = x*lu;
126 SimdFloat tmp2 = SimdFloat(-0.5f)*lu;
127 tmp1 = fma(tmp1, lu, SimdFloat(-3.0f));
132 /*! \brief Calculate 1/sqrt(x) for SIMD float.
134 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
135 * GMX_FLOAT_MAX, i.e. within the range of single precision.
136 * For the single precision implementation this is obviously always
137 * true for positive values, but for double precision it adds an
138 * extra restriction since the first lookup step might have to be
139 * performed in single precision on some architectures. Note that the
140 * responsibility for checking falls on you - this routine does not
143 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
145 static inline SimdFloat gmx_simdcall
148 SimdFloat lu = rsqrt(x);
149 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
150 lu = rsqrtIter(lu, x);
152 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
153 lu = rsqrtIter(lu, x);
155 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
156 lu = rsqrtIter(lu, x);
161 /*! \brief Calculate 1/sqrt(x) for two SIMD floats.
163 * \param x0 First set of arguments, x0 must be in single range (see below).
164 * \param x1 Second set of arguments, x1 must be in single range (see below).
165 * \param[out] out0 Result 1/sqrt(x0)
166 * \param[out] out1 Result 1/sqrt(x1)
168 * In particular for double precision we can sometimes calculate square root
169 * pairs slightly faster by using single precision until the very last step.
171 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
172 * GMX_FLOAT_MAX, i.e. within the range of single precision.
173 * For the single precision implementation this is obviously always
174 * true for positive values, but for double precision it adds an
175 * extra restriction since the first lookup step might have to be
176 * performed in single precision on some architectures. Note that the
177 * responsibility for checking falls on you - this routine does not
180 static inline void gmx_simdcall
181 invsqrtPair(SimdFloat x0, SimdFloat x1,
182 SimdFloat *out0, SimdFloat *out1)
188 #if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_FLOAT
189 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD float.
191 * This is a low-level routine that should only be used by SIMD math routine
192 * that evaluates the reciprocal.
194 * \param lu Approximation of 1/x, typically obtained from lookup.
195 * \param x The reference (starting) value x for which we want 1/x.
196 * \return An improved approximation with roughly twice as many bits of accuracy.
198 static inline SimdFloat gmx_simdcall
199 rcpIter(SimdFloat lu, SimdFloat x)
201 return lu*fnma(lu, x, SimdFloat(2.0f));
205 /*! \brief Calculate 1/x for SIMD float.
207 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
208 * GMX_FLOAT_MAX, i.e. within the range of single precision.
209 * For the single precision implementation this is obviously always
210 * true for positive values, but for double precision it adds an
211 * extra restriction since the first lookup step might have to be
212 * performed in single precision on some architectures. Note that the
213 * responsibility for checking falls on you - this routine does not
216 * \return 1/x. Result is undefined if your argument was invalid.
218 static inline SimdFloat gmx_simdcall
221 SimdFloat lu = rcp(x);
222 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
225 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
228 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
234 /*! \brief Division for SIMD floats
236 * \param nom Nominator
237 * \param denom Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
238 * For single precision this is equivalent to a nonzero argument,
239 * but in double precision it adds an extra restriction since
240 * the first lookup step might have to be performed in single
241 * precision on some architectures. Note that the responsibility
242 * for checking falls on you - this routine does not check arguments.
246 * \note This function does not use any masking to avoid problems with
247 * zero values in the denominator.
249 static inline SimdFloat gmx_simdcall
250 operator/(SimdFloat nom, SimdFloat denom)
252 return nom*inv(denom);
255 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD float.
257 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
258 * Illegal values in the masked-out elements will not lead to
259 * floating-point exceptions.
261 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
262 * GMX_FLOAT_MAX for masked-in entries.
263 * See \ref invsqrt for the discussion about argument restrictions.
265 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
266 * entry was not masked, and 0.0 for masked-out entries.
268 static inline SimdFloat
269 maskzInvsqrt(SimdFloat x, SimdFBool m)
271 SimdFloat lu = maskzRsqrt(x, m);
272 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
273 lu = rsqrtIter(lu, x);
275 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
276 lu = rsqrtIter(lu, x);
278 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
279 lu = rsqrtIter(lu, x);
284 /*! \brief Calculate 1/x for SIMD float, masked version.
286 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
287 * GMX_FLOAT_MAX for masked-in entries.
288 * See \ref invsqrt for the discussion about argument restrictions.
290 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
292 static inline SimdFloat gmx_simdcall
293 maskzInv(SimdFloat x, SimdFBool m)
295 SimdFloat lu = maskzRcp(x, m);
296 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
299 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
302 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
308 /*! \brief Calculate sqrt(x) for SIMD floats
310 * \tparam opt By default, this function checks if the input value is 0.0
311 * and masks this to return the correct result. If you are certain
312 * your argument will never be zero, and you know you need to save
313 * every single cycle you can, you can alternatively call the
314 * function as sqrt<MathOptimization::Unsafe>(x).
316 * \param x Argument that must be in range 0 <=x <= GMX_FLOAT_MAX, since the
317 * lookup step often has to be implemented in single precision.
318 * Arguments smaller than GMX_FLOAT_MIN will always lead to a zero
319 * result, even in double precision. If you are using the unsafe
320 * math optimization parameter, the argument must be in the range
321 * GMX_FLOAT_MIN <= x <= GMX_FLOAT_MAX.
323 * \return sqrt(x). The result is undefined if the input value does not fall
324 * in the allowed range specified for the argument.
326 template <MathOptimization opt = MathOptimization::Safe>
327 static inline SimdFloat gmx_simdcall
330 if (opt == MathOptimization::Safe)
332 SimdFloat res = maskzInvsqrt(x, setZero() < x);
337 return x * invsqrt(x);
341 #if !GMX_SIMD_HAVE_NATIVE_LOG_FLOAT
342 /*! \brief SIMD float log(x). This is the natural logarithm.
344 * \param x Argument, should be >0.
345 * \result The natural logarithm of x. Undefined if argument is invalid.
347 static inline SimdFloat gmx_simdcall
350 const SimdFloat one(1.0f);
351 const SimdFloat two(2.0f);
352 const SimdFloat invsqrt2(1.0f/std::sqrt(2.0f));
353 const SimdFloat corr(0.693147180559945286226764f);
354 const SimdFloat CL9(0.2371599674224853515625f);
355 const SimdFloat CL7(0.285279005765914916992188f);
356 const SimdFloat CL5(0.400005519390106201171875f);
357 const SimdFloat CL3(0.666666567325592041015625f);
358 const SimdFloat CL1(2.0f);
359 SimdFloat fExp, x2, p;
367 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
368 fExp = fExp - selectByMask(one, m);
369 x = x * blend(one, two, m);
371 x = (x-one) * inv( x+one );
374 p = fma(CL9, x2, CL7);
378 p = fma(p, x, corr*fExp);
384 #if !GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT
385 /*! \brief SIMD float 2^x
387 * \tparam opt If this is changed from the default (safe) into the unsafe
388 * option, input values that would otherwise lead to zero-clamped
389 * results are not allowed and will lead to undefined results.
391 * \param x Argument. For the default (safe) function version this can be
392 * arbitrarily small value, but the routine might clamp the result to
393 * zero for arguments that would produce subnormal IEEE754-2008 results.
394 * This corresponds to inputs below -126 in single or -1022 in double,
395 * and it might overflow for arguments reaching 127 (single) or
396 * 1023 (double). If you enable the unsafe math optimization,
397 * very small arguments will not necessarily be zero-clamped, but
398 * can produce undefined results.
400 * \result 2^x. The result is undefined for very large arguments that cause
401 * internal floating-point overflow. If unsafe optimizations are enabled,
402 * this is also true for very small values.
404 * \note The definition range of this function is just-so-slightly smaller
405 * than the allowed IEEE exponents for many architectures. This is due
406 * to the implementation, which will hopefully improve in the future.
408 * \warning You cannot rely on this implementation returning inf for arguments
409 * that cause overflow. If you have some very large
410 * values and need to rely on getting a valid numerical output,
411 * take the minimum of your variable and the largest valid argument
412 * before calling this routine.
414 template <MathOptimization opt = MathOptimization::Safe>
415 static inline SimdFloat gmx_simdcall
418 // Lower bound: Clamp args that would lead to an IEEE fp exponent below -127.
419 // Note that we use a slightly wider range inside this *implementation* compared
420 // to that guaranteed by the API documentation in the comment above (which has
421 // to cover all architectures).
422 const SimdFloat smallArgLimit(-127.0f);
423 const SimdFloat CC6(0.0001534581200287996416911311f);
424 const SimdFloat CC5(0.001339993121934088894618990f);
425 const SimdFloat CC4(0.009618488957115180159497841f);
426 const SimdFloat CC3(0.05550328776964726865751735f);
427 const SimdFloat CC2(0.2402264689063408646490722f);
428 const SimdFloat CC1(0.6931472057372680777553816f);
429 const SimdFloat one(1.0f);
435 // Large negative values are valid arguments to exp2(), so there are two
436 // things we need to account for:
437 // 1. When the exponents reaches -127, the (biased) exponent field will be
438 // zero and we can no longer multiply with it. There are special IEEE
439 // formats to handle this range, but for now we have to accept that
440 // we cannot handle those arguments. If input value becomes even more
441 // negative, it will start to loop and we would end up with invalid
442 // exponents. Thus, we need to limit or mask this.
443 // 2. For VERY large negative values, we will have problems that the
444 // subtraction to get the fractional part loses accuracy, and then we
445 // can end up with overflows in the polynomial.
447 // For now, we handle this by clamping smaller arguments to -127. At this
448 // point we will already return zero, so we don't need to do anything
449 // extra for the exponent.
451 if (opt == MathOptimization::Safe)
453 x = max(x, smallArgLimit);
456 fexppart = ldexp(one, cvtR2I(x));
460 p = fma(CC6, x, CC5);
471 #if !GMX_SIMD_HAVE_NATIVE_EXP_FLOAT
472 /*! \brief SIMD float exp(x).
474 * In addition to scaling the argument for 2^x this routine correctly does
475 * extended precision arithmetics to improve accuracy.
477 * \tparam opt If this is changed from the default (safe) into the unsafe
478 * option, input values that would otherwise lead to zero-clamped
479 * results are not allowed and will lead to undefined results.
481 * \param x Argument. For the default (safe) function version this can be
482 * arbitrarily small value, but the routine might clamp the result to
483 * zero for arguments that would produce subnormal IEEE754-2008 results.
484 * This corresponds to input arguments reaching
485 * -126*ln(2)=-87.3 in single, or -1022*ln(2)=-708.4 (double).
486 * Similarly, it might overflow for arguments reaching
487 * 127*ln(2)=88.0 (single) or 1023*ln(2)=709.1 (double). If the
488 * unsafe math optimizations are enabled, small input values that would
489 * result in zero-clamped output are not allowed.
491 * \result exp(x). Overflowing arguments are likely to either return 0 or inf,
492 * depending on the underlying implementation. If unsafe optimizations
493 * are enabled, this is also true for very small values.
495 * \note The definition range of this function is just-so-slightly smaller
496 * than the allowed IEEE exponents for many architectures. This is due
497 * to the implementation, which will hopefully improve in the future.
499 * \warning You cannot rely on this implementation returning inf for arguments
500 * that cause overflow. If you have some very large
501 * values and need to rely on getting a valid numerical output,
502 * take the minimum of your variable and the largest valid argument
503 * before calling this routine.
505 template <MathOptimization opt = MathOptimization::Safe>
506 static inline SimdFloat gmx_simdcall
509 const SimdFloat argscale(1.44269504088896341f);
510 // Lower bound: Clamp args that would lead to an IEEE fp exponent below -127.
511 // Note that we use a slightly wider range inside this *implementation* compared
512 // to that guaranteed by the API documentation in the comment above (which has
513 // to cover all architectures).
514 const SimdFloat smallArgLimit(-88.0296919311f);
515 const SimdFloat invargscale0(-0.693145751953125f);
516 const SimdFloat invargscale1(-1.428606765330187045e-06f);
517 const SimdFloat CC4(0.00136324646882712841033936f);
518 const SimdFloat CC3(0.00836596917361021041870117f);
519 const SimdFloat CC2(0.0416710823774337768554688f);
520 const SimdFloat CC1(0.166665524244308471679688f);
521 const SimdFloat CC0(0.499999850988388061523438f);
522 const SimdFloat one(1.0f);
527 // Large negative values are valid arguments to exp2(), so there are two
528 // things we need to account for:
529 // 1. When the IEEE exponents reaches -127, the (biased) exponent field will be
530 // zero and we can no longer multiply with it. There are special IEEE
531 // formats to handle this range, but for now we have to accept that
532 // we cannot handle those arguments. If input value becomes even more
533 // negative, it will start to loop and we would end up with invalid
534 // exponents. Thus, we need to limit or mask this.
535 // 2. For VERY large negative values, we will have problems that the
536 // subtraction to get the fractional part loses accuracy, and then we
537 // can end up with overflows in the polynomial.
539 // For now, we handle this by clamping smaller arguments to -127*ln(2). At this
540 // point we will already return zero, so we don't need to do anything
541 // extra for the exponent.
543 if (opt == MathOptimization::Safe)
545 x = max(x, smallArgLimit);
549 fexppart = ldexp(one, cvtR2I(y));
552 // Extended precision arithmetics
553 x = fma(invargscale0, intpart, x);
554 x = fma(invargscale1, intpart, x);
556 p = fma(CC4, x, CC3);
561 x = fma(p, fexppart, fexppart);
566 /*! \brief SIMD float erf(x).
568 * \param x The value to calculate erf(x) for.
571 * This routine achieves very close to full precision, but we do not care about
572 * the last bit or the subnormal result range.
574 static inline SimdFloat gmx_simdcall
577 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
578 const SimdFloat CA6(7.853861353153693e-5f);
579 const SimdFloat CA5(-8.010193625184903e-4f);
580 const SimdFloat CA4(5.188327685732524e-3f);
581 const SimdFloat CA3(-2.685381193529856e-2f);
582 const SimdFloat CA2(1.128358514861418e-1f);
583 const SimdFloat CA1(-3.761262582423300e-1f);
584 const SimdFloat CA0(1.128379165726710f);
585 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
586 const SimdFloat CB9(-0.0018629930017603923f);
587 const SimdFloat CB8(0.003909821287598495f);
588 const SimdFloat CB7(-0.0052094582210355615f);
589 const SimdFloat CB6(0.005685614362160572f);
590 const SimdFloat CB5(-0.0025367682853477272f);
591 const SimdFloat CB4(-0.010199799682318782f);
592 const SimdFloat CB3(0.04369575504816542f);
593 const SimdFloat CB2(-0.11884063474674492f);
594 const SimdFloat CB1(0.2732120154030589f);
595 const SimdFloat CB0(0.42758357702025784f);
596 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
597 const SimdFloat CC10(-0.0445555913112064f);
598 const SimdFloat CC9(0.21376355144663348f);
599 const SimdFloat CC8(-0.3473187200259257f);
600 const SimdFloat CC7(0.016690861551248114f);
601 const SimdFloat CC6(0.7560973182491192f);
602 const SimdFloat CC5(-1.2137903600145787f);
603 const SimdFloat CC4(0.8411872321232948f);
604 const SimdFloat CC3(-0.08670413896296343f);
605 const SimdFloat CC2(-0.27124782687240334f);
606 const SimdFloat CC1(-0.0007502488047806069f);
607 const SimdFloat CC0(0.5642114853803148f);
608 const SimdFloat one(1.0f);
609 const SimdFloat two(2.0f);
612 SimdFloat t, t2, w, w2;
613 SimdFloat pA0, pA1, pB0, pB1, pC0, pC1;
615 SimdFloat res_erf, res_erfc, res;
616 SimdFBool m, maskErf;
622 pA0 = fma(CA6, x4, CA4);
623 pA1 = fma(CA5, x4, CA3);
624 pA0 = fma(pA0, x4, CA2);
625 pA1 = fma(pA1, x4, CA1);
627 pA0 = fma(pA1, x2, pA0);
628 // Constant term must come last for precision reasons
635 maskErf = SimdFloat(0.75f) <= y;
636 t = maskzInv(y, maskErf);
641 // No need for a floating-point sieve here (as in erfc), since erf()
642 // will never return values that are extremely small for large args.
643 expmx2 = exp( -y*y );
645 pB1 = fma(CB9, w2, CB7);
646 pB0 = fma(CB8, w2, CB6);
647 pB1 = fma(pB1, w2, CB5);
648 pB0 = fma(pB0, w2, CB4);
649 pB1 = fma(pB1, w2, CB3);
650 pB0 = fma(pB0, w2, CB2);
651 pB1 = fma(pB1, w2, CB1);
652 pB0 = fma(pB0, w2, CB0);
653 pB0 = fma(pB1, w, pB0);
655 pC0 = fma(CC10, t2, CC8);
656 pC1 = fma(CC9, t2, CC7);
657 pC0 = fma(pC0, t2, CC6);
658 pC1 = fma(pC1, t2, CC5);
659 pC0 = fma(pC0, t2, CC4);
660 pC1 = fma(pC1, t2, CC3);
661 pC0 = fma(pC0, t2, CC2);
662 pC1 = fma(pC1, t2, CC1);
664 pC0 = fma(pC0, t2, CC0);
665 pC0 = fma(pC1, t, pC0);
668 // Select pB0 or pC0 for erfc()
670 res_erfc = blend(pB0, pC0, m);
671 res_erfc = res_erfc * expmx2;
673 // erfc(x<0) = 2-erfc(|x|)
675 res_erfc = blend(res_erfc, two-res_erfc, m);
677 // Select erf() or erfc()
678 res = blend(res_erf, one-res_erfc, maskErf);
683 /*! \brief SIMD float erfc(x).
685 * \param x The value to calculate erfc(x) for.
688 * This routine achieves full precision (bar the last bit) over most of the
689 * input range, but for large arguments where the result is getting close
690 * to the minimum representable numbers we accept slightly larger errors
691 * (think results that are in the ballpark of 10^-30 for single precision)
692 * since that is not relevant for MD.
694 static inline SimdFloat gmx_simdcall
697 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
698 const SimdFloat CA6(7.853861353153693e-5f);
699 const SimdFloat CA5(-8.010193625184903e-4f);
700 const SimdFloat CA4(5.188327685732524e-3f);
701 const SimdFloat CA3(-2.685381193529856e-2f);
702 const SimdFloat CA2(1.128358514861418e-1f);
703 const SimdFloat CA1(-3.761262582423300e-1f);
704 const SimdFloat CA0(1.128379165726710f);
705 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
706 const SimdFloat CB9(-0.0018629930017603923f);
707 const SimdFloat CB8(0.003909821287598495f);
708 const SimdFloat CB7(-0.0052094582210355615f);
709 const SimdFloat CB6(0.005685614362160572f);
710 const SimdFloat CB5(-0.0025367682853477272f);
711 const SimdFloat CB4(-0.010199799682318782f);
712 const SimdFloat CB3(0.04369575504816542f);
713 const SimdFloat CB2(-0.11884063474674492f);
714 const SimdFloat CB1(0.2732120154030589f);
715 const SimdFloat CB0(0.42758357702025784f);
716 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
717 const SimdFloat CC10(-0.0445555913112064f);
718 const SimdFloat CC9(0.21376355144663348f);
719 const SimdFloat CC8(-0.3473187200259257f);
720 const SimdFloat CC7(0.016690861551248114f);
721 const SimdFloat CC6(0.7560973182491192f);
722 const SimdFloat CC5(-1.2137903600145787f);
723 const SimdFloat CC4(0.8411872321232948f);
724 const SimdFloat CC3(-0.08670413896296343f);
725 const SimdFloat CC2(-0.27124782687240334f);
726 const SimdFloat CC1(-0.0007502488047806069f);
727 const SimdFloat CC0(0.5642114853803148f);
728 // Coefficients for expansion of exp(x) in [0,0.1]
729 // CD0 and CD1 are both 1.0, so no need to declare them separately
730 const SimdFloat CD2(0.5000066608081202f);
731 const SimdFloat CD3(0.1664795422874624f);
732 const SimdFloat CD4(0.04379839977652482f);
733 const SimdFloat one(1.0f);
734 const SimdFloat two(2.0f);
736 /* We need to use a small trick here, since we cannot assume all SIMD
737 * architectures support integers, and the flag we want (0xfffff000) would
738 * evaluate to NaN (i.e., it cannot be expressed as a floating-point num).
739 * Instead, we represent the flags 0xf0f0f000 and 0x0f0f0000 as valid
740 * fp numbers, and perform a logical or. Since the expression is constant,
741 * we can at least hope it is evaluated at compile-time.
743 #if GMX_SIMD_HAVE_LOGICAL
744 const SimdFloat sieve(SimdFloat(-5.965323564e+29f) | SimdFloat(7.05044434e-30f));
746 const int isieve = 0xFFFFF000;
747 GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH) mem[GMX_SIMD_FLOAT_WIDTH];
756 SimdFloat q, z, t, t2, w, w2;
757 SimdFloat pA0, pA1, pB0, pB1, pC0, pC1;
758 SimdFloat expmx2, corr;
759 SimdFloat res_erf, res_erfc, res;
760 SimdFBool m, msk_erf;
766 pA0 = fma(CA6, x4, CA4);
767 pA1 = fma(CA5, x4, CA3);
768 pA0 = fma(pA0, x4, CA2);
769 pA1 = fma(pA1, x4, CA1);
771 pA0 = fma(pA0, x4, pA1);
772 // Constant term must come last for precision reasons
779 msk_erf = SimdFloat(0.75f) <= y;
780 t = maskzInv(y, msk_erf);
785 * We cannot simply calculate exp(-y2) directly in single precision, since
786 * that will lose a couple of bits of precision due to the multiplication.
787 * Instead, we introduce y=z+w, where the last 12 bits of precision are in w.
788 * Then we get exp(-y2) = exp(-z2)*exp((z-y)*(z+y)).
790 * The only drawback with this is that it requires TWO separate exponential
791 * evaluations, which would be horrible performance-wise. However, the argument
792 * for the second exp() call is always small, so there we simply use a
793 * low-order minimax expansion on [0,0.1].
795 * However, this neat idea requires support for logical ops (and) on
796 * FP numbers, which some vendors decided isn't necessary in their SIMD
797 * instruction sets (Hi, IBM VSX!). In principle we could use some tricks
798 * in double, but we still need memory as a backup when that is not available,
799 * and this case is rare enough that we go directly there...
801 #if GMX_SIMD_HAVE_LOGICAL
805 for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
808 conv.i = conv.i & isieve;
814 corr = fma(CD4, q, CD3);
815 corr = fma(corr, q, CD2);
816 corr = fma(corr, q, one);
817 corr = fma(corr, q, one);
819 expmx2 = exp( -z*z );
820 expmx2 = expmx2 * corr;
822 pB1 = fma(CB9, w2, CB7);
823 pB0 = fma(CB8, w2, CB6);
824 pB1 = fma(pB1, w2, CB5);
825 pB0 = fma(pB0, w2, CB4);
826 pB1 = fma(pB1, w2, CB3);
827 pB0 = fma(pB0, w2, CB2);
828 pB1 = fma(pB1, w2, CB1);
829 pB0 = fma(pB0, w2, CB0);
830 pB0 = fma(pB1, w, pB0);
832 pC0 = fma(CC10, t2, CC8);
833 pC1 = fma(CC9, t2, CC7);
834 pC0 = fma(pC0, t2, CC6);
835 pC1 = fma(pC1, t2, CC5);
836 pC0 = fma(pC0, t2, CC4);
837 pC1 = fma(pC1, t2, CC3);
838 pC0 = fma(pC0, t2, CC2);
839 pC1 = fma(pC1, t2, CC1);
841 pC0 = fma(pC0, t2, CC0);
842 pC0 = fma(pC1, t, pC0);
845 // Select pB0 or pC0 for erfc()
847 res_erfc = blend(pB0, pC0, m);
848 res_erfc = res_erfc * expmx2;
850 // erfc(x<0) = 2-erfc(|x|)
852 res_erfc = blend(res_erfc, two-res_erfc, m);
854 // Select erf() or erfc()
855 res = blend(one-res_erf, res_erfc, msk_erf);
860 /*! \brief SIMD float sin \& cos.
862 * \param x The argument to evaluate sin/cos for
863 * \param[out] sinval Sin(x)
864 * \param[out] cosval Cos(x)
866 * This version achieves close to machine precision, but for very large
867 * magnitudes of the argument we inherently begin to lose accuracy due to the
868 * argument reduction, despite using extended precision arithmetics internally.
870 static inline void gmx_simdcall
871 sincos(SimdFloat x, SimdFloat *sinval, SimdFloat *cosval)
873 // Constants to subtract Pi/4*x from y while minimizing precision loss
874 const SimdFloat argred0(-1.5703125);
875 const SimdFloat argred1(-4.83751296997070312500e-04f);
876 const SimdFloat argred2(-7.54953362047672271729e-08f);
877 const SimdFloat argred3(-2.56334406825708960298e-12f);
878 const SimdFloat two_over_pi(static_cast<float>(2.0f/M_PI));
879 const SimdFloat const_sin2(-1.9515295891e-4f);
880 const SimdFloat const_sin1( 8.3321608736e-3f);
881 const SimdFloat const_sin0(-1.6666654611e-1f);
882 const SimdFloat const_cos2( 2.443315711809948e-5f);
883 const SimdFloat const_cos1(-1.388731625493765e-3f);
884 const SimdFloat const_cos0( 4.166664568298827e-2f);
885 const SimdFloat half(0.5f);
886 const SimdFloat one(1.0f);
887 SimdFloat ssign, csign;
888 SimdFloat x2, y, z, psin, pcos, sss, ccc;
891 #if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
892 const SimdFInt32 ione(1);
893 const SimdFInt32 itwo(2);
900 m = cvtIB2B((iy & ione) == SimdFInt32(0));
901 ssign = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), cvtIB2B((iy & itwo) == itwo));
902 csign = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), cvtIB2B(((iy+ione) & itwo) == itwo));
904 const SimdFloat quarter(0.25f);
905 const SimdFloat minusquarter(-0.25f);
907 SimdFBool m1, m2, m3;
909 /* The most obvious way to find the arguments quadrant in the unit circle
910 * to calculate the sign is to use integer arithmetic, but that is not
911 * present in all SIMD implementations. As an alternative, we have devised a
912 * pure floating-point algorithm that uses truncation for argument reduction
913 * so that we get a new value 0<=q<1 over the unit circle, and then
914 * do floating-point comparisons with fractions. This is likely to be
915 * slightly slower (~10%) due to the longer latencies of floating-point, so
916 * we only use it when integer SIMD arithmetic is not present.
920 // It is critical that half-way cases are rounded down
921 z = fma(x, two_over_pi, half);
925 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
926 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
927 * This removes the 2*Pi periodicity without using any integer arithmetic.
928 * First check if y had the value 2 or 3, set csign if true.
931 /* If we have logical operations we can work directly on the signbit, which
932 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
933 * Thus, if you are altering defines to debug alternative code paths, the
934 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
935 * active or inactive - you will get errors if only one is used.
937 # if GMX_SIMD_HAVE_LOGICAL
938 ssign = ssign & SimdFloat(GMX_FLOAT_NEGZERO);
939 csign = andNot(q, SimdFloat(GMX_FLOAT_NEGZERO));
940 ssign = ssign ^ csign;
942 ssign = copysign(SimdFloat(1.0f), ssign);
943 csign = copysign(SimdFloat(1.0f), q);
945 ssign = ssign * csign; // swap ssign if csign was set.
947 // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
948 m1 = (q < minusquarter);
949 m2 = (setZero() <= q);
953 // where mask is FALSE, swap sign.
954 csign = csign * blend(SimdFloat(-1.0f), one, m);
956 x = fma(y, argred0, x);
957 x = fma(y, argred1, x);
958 x = fma(y, argred2, x);
959 x = fma(y, argred3, x);
962 psin = fma(const_sin2, x2, const_sin1);
963 psin = fma(psin, x2, const_sin0);
964 psin = fma(psin, x * x2, x);
965 pcos = fma(const_cos2, x2, const_cos1);
966 pcos = fma(pcos, x2, const_cos0);
967 pcos = fms(pcos, x2, half);
968 pcos = fma(pcos, x2, one);
970 sss = blend(pcos, psin, m);
971 ccc = blend(psin, pcos, m);
972 // See comment for GMX_SIMD_HAVE_LOGICAL section above.
973 #if GMX_SIMD_HAVE_LOGICAL
974 *sinval = sss ^ ssign;
975 *cosval = ccc ^ csign;
977 *sinval = sss * ssign;
978 *cosval = ccc * csign;
982 /*! \brief SIMD float sin(x).
984 * \param x The argument to evaluate sin for
987 * \attention Do NOT call both sin & cos if you need both results, since each of them
988 * will then call \ref sincos and waste a factor 2 in performance.
990 static inline SimdFloat gmx_simdcall
998 /*! \brief SIMD float cos(x).
1000 * \param x The argument to evaluate cos for
1003 * \attention Do NOT call both sin & cos if you need both results, since each of them
1004 * will then call \ref sincos and waste a factor 2 in performance.
1006 static inline SimdFloat gmx_simdcall
1014 /*! \brief SIMD float tan(x).
1016 * \param x The argument to evaluate tan for
1019 static inline SimdFloat gmx_simdcall
1022 const SimdFloat argred0(-1.5703125);
1023 const SimdFloat argred1(-4.83751296997070312500e-04f);
1024 const SimdFloat argred2(-7.54953362047672271729e-08f);
1025 const SimdFloat argred3(-2.56334406825708960298e-12f);
1026 const SimdFloat two_over_pi(static_cast<float>(2.0f/M_PI));
1027 const SimdFloat CT6(0.009498288995810566122993911);
1028 const SimdFloat CT5(0.002895755790837379295226923);
1029 const SimdFloat CT4(0.02460087336161924491836265);
1030 const SimdFloat CT3(0.05334912882656359828045988);
1031 const SimdFloat CT2(0.1333989091464957704418495);
1032 const SimdFloat CT1(0.3333307599244198227797507);
1034 SimdFloat x2, p, y, z;
1037 #if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
1041 z = x * two_over_pi;
1044 m = cvtIB2B((iy & ione) == ione);
1046 x = fma(y, argred0, x);
1047 x = fma(y, argred1, x);
1048 x = fma(y, argred2, x);
1049 x = fma(y, argred3, x);
1050 x = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), m) ^ x;
1052 const SimdFloat quarter(0.25f);
1053 const SimdFloat half(0.5f);
1054 const SimdFloat threequarter(0.75f);
1056 SimdFBool m1, m2, m3;
1059 z = fma(w, two_over_pi, half);
1065 m3 = threequarter <= q;
1068 w = fma(y, argred0, w);
1069 w = fma(y, argred1, w);
1070 w = fma(y, argred2, w);
1071 w = fma(y, argred3, w);
1072 w = blend(w, -w, m);
1073 x = w * copysign( SimdFloat(1.0), x );
1076 p = fma(CT6, x2, CT5);
1077 p = fma(p, x2, CT4);
1078 p = fma(p, x2, CT3);
1079 p = fma(p, x2, CT2);
1080 p = fma(p, x2, CT1);
1081 p = fma(x2 * p, x, x);
1083 p = blend( p, maskzInv(p, m), m);
1087 /*! \brief SIMD float asin(x).
1089 * \param x The argument to evaluate asin for
1092 static inline SimdFloat gmx_simdcall
1095 const SimdFloat limitlow(1e-4f);
1096 const SimdFloat half(0.5f);
1097 const SimdFloat one(1.0f);
1098 const SimdFloat halfpi(static_cast<float>(M_PI/2.0f));
1099 const SimdFloat CC5(4.2163199048E-2f);
1100 const SimdFloat CC4(2.4181311049E-2f);
1101 const SimdFloat CC3(4.5470025998E-2f);
1102 const SimdFloat CC2(7.4953002686E-2f);
1103 const SimdFloat CC1(1.6666752422E-1f);
1105 SimdFloat z, z1, z2, q, q1, q2;
1111 z1 = half * (one-xabs);
1113 q1 = z1 * maskzInvsqrt(z1, m2);
1116 z = blend(z2, z1, m);
1117 q = blend(q2, q1, m);
1120 pA = fma(CC5, z2, CC3);
1121 pB = fma(CC4, z2, CC2);
1122 pA = fma(pA, z2, CC1);
1124 z = fma(pB, z2, pA);
1128 z = blend(z, q2, m);
1130 m = limitlow < xabs;
1131 z = blend( xabs, z, m );
1137 /*! \brief SIMD float acos(x).
1139 * \param x The argument to evaluate acos for
1142 static inline SimdFloat gmx_simdcall
1145 const SimdFloat one(1.0f);
1146 const SimdFloat half(0.5f);
1147 const SimdFloat pi(static_cast<float>(M_PI));
1148 const SimdFloat halfpi(static_cast<float>(M_PI/2.0f));
1150 SimdFloat z, z1, z2, z3;
1151 SimdFBool m1, m2, m3;
1157 z = fnma(half, xabs, half);
1159 z = z * maskzInvsqrt(z, m3);
1160 z = blend(x, z, m1);
1166 z = blend(z1, z2, m2);
1167 z = blend(z3, z, m1);
1172 /*! \brief SIMD float asin(x).
1174 * \param x The argument to evaluate atan for
1175 * \result Atan(x), same argument/value range as standard math library.
1177 static inline SimdFloat gmx_simdcall
1180 const SimdFloat halfpi(static_cast<float>(M_PI/2.0f));
1181 const SimdFloat CA17(0.002823638962581753730774f);
1182 const SimdFloat CA15(-0.01595690287649631500244f);
1183 const SimdFloat CA13(0.04250498861074447631836f);
1184 const SimdFloat CA11(-0.07489009201526641845703f);
1185 const SimdFloat CA9 (0.1063479334115982055664f);
1186 const SimdFloat CA7 (-0.1420273631811141967773f);
1187 const SimdFloat CA5 (0.1999269574880599975585f);
1188 const SimdFloat CA3 (-0.3333310186862945556640f);
1189 const SimdFloat one (1.0f);
1190 SimdFloat x2, x3, x4, pA, pB;
1196 x = blend(x, maskzInv(x, m2), m2);
1201 pA = fma(CA17, x4, CA13);
1202 pB = fma(CA15, x4, CA11);
1203 pA = fma(pA, x4, CA9);
1204 pB = fma(pB, x4, CA7);
1205 pA = fma(pA, x4, CA5);
1206 pB = fma(pB, x4, CA3);
1207 pA = fma(pA, x2, pB);
1208 pA = fma(pA, x3, x);
1210 pA = blend(pA, halfpi-pA, m2);
1211 pA = blend(pA, -pA, m);
1216 /*! \brief SIMD float atan2(y,x).
1218 * \param y Y component of vector, any quartile
1219 * \param x X component of vector, any quartile
1220 * \result Atan(y,x), same argument/value range as standard math library.
1222 * \note This routine should provide correct results for all finite
1223 * non-zero or positive-zero arguments. However, negative zero arguments will
1224 * be treated as positive zero, which means the return value will deviate from
1225 * the standard math library atan2(y,x) for those cases. That should not be
1226 * of any concern in Gromacs, and in particular it will not affect calculations
1227 * of angles from vectors.
1229 static inline SimdFloat gmx_simdcall
1230 atan2(SimdFloat y, SimdFloat x)
1232 const SimdFloat pi(static_cast<float>(M_PI));
1233 const SimdFloat halfpi(static_cast<float>(M_PI/2.0));
1234 SimdFloat xinv, p, aoffset;
1235 SimdFBool mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
1237 mask_xnz = x != setZero();
1238 mask_ynz = y != setZero();
1239 mask_xlt0 = x < setZero();
1240 mask_ylt0 = y < setZero();
1242 aoffset = selectByNotMask(halfpi, mask_xnz);
1243 aoffset = selectByMask(aoffset, mask_ynz);
1245 aoffset = blend(aoffset, pi, mask_xlt0);
1246 aoffset = blend(aoffset, -aoffset, mask_ylt0);
1248 xinv = maskzInv(x, mask_xnz);
1256 /*! \brief Calculate the force correction due to PME analytically in SIMD float.
1258 * \param z2 \f$(r \beta)^2\f$ - see below for details.
1259 * \result Correction factor to coulomb force - see below for details.
1261 * This routine is meant to enable analytical evaluation of the
1262 * direct-space PME electrostatic force to avoid tables.
1264 * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
1265 * are some problems evaluating that:
1267 * First, the error function is difficult (read: expensive) to
1268 * approxmiate accurately for intermediate to large arguments, and
1269 * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
1270 * Second, we now try to avoid calculating potentials in Gromacs but
1271 * use forces directly.
1273 * We can simply things slight by noting that the PME part is really
1274 * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
1276 * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
1278 * The first term we already have from the inverse square root, so
1279 * that we can leave out of this routine.
1281 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1282 * the argument \f$beta r\f$ will be in the range 0.15 to ~4, which is
1283 * the range used for the minimax fit. Use your favorite plotting program
1284 * to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is in this range!
1286 * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
1287 * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
1288 * then only use even powers. This is another minor optimization, since
1289 * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
1290 * the vector between the two atoms to get the vectorial force. The
1291 * fastest flops are the ones we can avoid calculating!
1293 * So, here's how it should be used:
1295 * 1. Calculate \f$r^2\f$.
1296 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
1297 * 3. Evaluate this routine with \f$z^2\f$ as the argument.
1298 * 4. The return value is the expression:
1301 * \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
1304 * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
1307 * \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
1310 * or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
1313 * \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
1316 * With a bit of math exercise you should be able to confirm that
1320 * \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
1323 * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
1324 * and you have your force (divided by \f$r\f$). A final multiplication
1325 * with the vector connecting the two particles and you have your
1326 * vectorial force to add to the particles.
1328 * This approximation achieves an error slightly lower than 1e-6
1329 * in single precision and 1e-11 in double precision
1330 * for arguments smaller than 16 (\f$\beta r \leq 4 \f$);
1331 * when added to \f$1/r\f$ the error will be insignificant.
1332 * For \f$\beta r \geq 7206\f$ the return value can be inf or NaN.
1335 static inline SimdFloat gmx_simdcall
1336 pmeForceCorrection(SimdFloat z2)
1338 const SimdFloat FN6(-1.7357322914161492954e-8f);
1339 const SimdFloat FN5(1.4703624142580877519e-6f);
1340 const SimdFloat FN4(-0.000053401640219807709149f);
1341 const SimdFloat FN3(0.0010054721316683106153f);
1342 const SimdFloat FN2(-0.019278317264888380590f);
1343 const SimdFloat FN1(0.069670166153766424023f);
1344 const SimdFloat FN0(-0.75225204789749321333f);
1346 const SimdFloat FD4(0.0011193462567257629232f);
1347 const SimdFloat FD3(0.014866955030185295499f);
1348 const SimdFloat FD2(0.11583842382862377919f);
1349 const SimdFloat FD1(0.50736591960530292870f);
1350 const SimdFloat FD0(1.0f);
1353 SimdFloat polyFN0, polyFN1, polyFD0, polyFD1;
1357 polyFD0 = fma(FD4, z4, FD2);
1358 polyFD1 = fma(FD3, z4, FD1);
1359 polyFD0 = fma(polyFD0, z4, FD0);
1360 polyFD0 = fma(polyFD1, z2, polyFD0);
1362 polyFD0 = inv(polyFD0);
1364 polyFN0 = fma(FN6, z4, FN4);
1365 polyFN1 = fma(FN5, z4, FN3);
1366 polyFN0 = fma(polyFN0, z4, FN2);
1367 polyFN1 = fma(polyFN1, z4, FN1);
1368 polyFN0 = fma(polyFN0, z4, FN0);
1369 polyFN0 = fma(polyFN1, z2, polyFN0);
1371 return polyFN0 * polyFD0;
1376 /*! \brief Calculate the potential correction due to PME analytically in SIMD float.
1378 * \param z2 \f$(r \beta)^2\f$ - see below for details.
1379 * \result Correction factor to coulomb potential - see below for details.
1381 * See \ref pmeForceCorrection for details about the approximation.
1383 * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
1384 * as the input argument.
1386 * Here's how it should be used:
1388 * 1. Calculate \f$r^2\f$.
1389 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
1390 * 3. Evaluate this routine with z^2 as the argument.
1391 * 4. The return value is the expression:
1394 * \frac{\mbox{erf}(z)}{z}
1397 * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
1400 * \frac{\mbox{erf}(r \beta)}{r}
1403 * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
1404 * and you have your potential.
1406 * This approximation achieves an error slightly lower than 1e-6
1407 * in single precision and 4e-11 in double precision
1408 * for arguments smaller than 16 (\f$ 0.15 \leq \beta r \leq 4 \f$);
1409 * for \f$ \beta r \leq 0.15\f$ the error can be twice as high;
1410 * when added to \f$1/r\f$ the error will be insignificant.
1411 * For \f$\beta r \geq 7142\f$ the return value can be inf or NaN.
1413 static inline SimdFloat gmx_simdcall
1414 pmePotentialCorrection(SimdFloat z2)
1416 const SimdFloat VN6(1.9296833005951166339e-8f);
1417 const SimdFloat VN5(-1.4213390571557850962e-6f);
1418 const SimdFloat VN4(0.000041603292906656984871f);
1419 const SimdFloat VN3(-0.00013134036773265025626f);
1420 const SimdFloat VN2(0.038657983986041781264f);
1421 const SimdFloat VN1(0.11285044772717598220f);
1422 const SimdFloat VN0(1.1283802385263030286f);
1424 const SimdFloat VD3(0.0066752224023576045451f);
1425 const SimdFloat VD2(0.078647795836373922256f);
1426 const SimdFloat VD1(0.43336185284710920150f);
1427 const SimdFloat VD0(1.0f);
1430 SimdFloat polyVN0, polyVN1, polyVD0, polyVD1;
1434 polyVD1 = fma(VD3, z4, VD1);
1435 polyVD0 = fma(VD2, z4, VD0);
1436 polyVD0 = fma(polyVD1, z2, polyVD0);
1438 polyVD0 = inv(polyVD0);
1440 polyVN0 = fma(VN6, z4, VN4);
1441 polyVN1 = fma(VN5, z4, VN3);
1442 polyVN0 = fma(polyVN0, z4, VN2);
1443 polyVN1 = fma(polyVN1, z4, VN1);
1444 polyVN0 = fma(polyVN0, z4, VN0);
1445 polyVN0 = fma(polyVN1, z2, polyVN0);
1447 return polyVN0 * polyVD0;
1453 #if GMX_SIMD_HAVE_DOUBLE
1456 /*! \name Double precision SIMD math functions
1458 * \note In most cases you should use the real-precision functions instead.
1462 /****************************************
1463 * DOUBLE PRECISION SIMD MATH FUNCTIONS *
1464 ****************************************/
1466 #if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_DOUBLE
1467 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
1469 * \param x Values to set sign for
1470 * \param y Values used to set sign
1471 * \return Magnitude of x, sign of y
1473 static inline SimdDouble gmx_simdcall
1474 copysign(SimdDouble x, SimdDouble y)
1476 #if GMX_SIMD_HAVE_LOGICAL
1477 return abs(x) | (SimdDouble(GMX_DOUBLE_NEGZERO) & y);
1479 return blend(abs(x), -abs(x), (y < setZero()));
1484 #if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_DOUBLE
1485 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD double.
1487 * This is a low-level routine that should only be used by SIMD math routine
1488 * that evaluates the inverse square root.
1490 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
1491 * \param x The reference (starting) value x for which we want 1/sqrt(x).
1492 * \return An improved approximation with roughly twice as many bits of accuracy.
1494 static inline SimdDouble gmx_simdcall
1495 rsqrtIter(SimdDouble lu, SimdDouble x)
1497 SimdDouble tmp1 = x*lu;
1498 SimdDouble tmp2 = SimdDouble(-0.5)*lu;
1499 tmp1 = fma(tmp1, lu, SimdDouble(-3.0));
1504 /*! \brief Calculate 1/sqrt(x) for SIMD double.
1506 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
1507 * GMX_FLOAT_MAX, i.e. within the range of single precision.
1508 * For the single precision implementation this is obviously always
1509 * true for positive values, but for double precision it adds an
1510 * extra restriction since the first lookup step might have to be
1511 * performed in single precision on some architectures. Note that the
1512 * responsibility for checking falls on you - this routine does not
1515 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
1517 static inline SimdDouble gmx_simdcall
1518 invsqrt(SimdDouble x)
1520 SimdDouble lu = rsqrt(x);
1521 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1522 lu = rsqrtIter(lu, x);
1524 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1525 lu = rsqrtIter(lu, x);
1527 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1528 lu = rsqrtIter(lu, x);
1530 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1531 lu = rsqrtIter(lu, x);
1536 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles.
1538 * \param x0 First set of arguments, x0 must be in single range (see below).
1539 * \param x1 Second set of arguments, x1 must be in single range (see below).
1540 * \param[out] out0 Result 1/sqrt(x0)
1541 * \param[out] out1 Result 1/sqrt(x1)
1543 * In particular for double precision we can sometimes calculate square root
1544 * pairs slightly faster by using single precision until the very last step.
1546 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
1547 * GMX_FLOAT_MAX, i.e. within the range of single precision.
1548 * For the single precision implementation this is obviously always
1549 * true for positive values, but for double precision it adds an
1550 * extra restriction since the first lookup step might have to be
1551 * performed in single precision on some architectures. Note that the
1552 * responsibility for checking falls on you - this routine does not
1555 static inline void gmx_simdcall
1556 invsqrtPair(SimdDouble x0, SimdDouble x1,
1557 SimdDouble *out0, SimdDouble *out1)
1559 #if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH) && (GMX_SIMD_RSQRT_BITS < 22)
1560 SimdFloat xf = cvtDD2F(x0, x1);
1561 SimdFloat luf = rsqrt(xf);
1562 SimdDouble lu0, lu1;
1563 // Intermediate target is single - mantissa+1 bits
1564 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
1565 luf = rsqrtIter(luf, xf);
1567 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1568 luf = rsqrtIter(luf, xf);
1570 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1571 luf = rsqrtIter(luf, xf);
1573 cvtF2DD(luf, &lu0, &lu1);
1574 // Last iteration(s) performed in double - if we had 22 bits, this gets us to 44 (~1e-15)
1575 #if (GMX_SIMD_ACCURACY_BITS_SINGLE < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1576 lu0 = rsqrtIter(lu0, x0);
1577 lu1 = rsqrtIter(lu1, x1);
1579 #if (GMX_SIMD_ACCURACY_BITS_SINGLE*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1580 lu0 = rsqrtIter(lu0, x0);
1581 lu1 = rsqrtIter(lu1, x1);
1586 *out0 = invsqrt(x0);
1587 *out1 = invsqrt(x1);
1591 #if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_DOUBLE
1592 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD double.
1594 * This is a low-level routine that should only be used by SIMD math routine
1595 * that evaluates the reciprocal.
1597 * \param lu Approximation of 1/x, typically obtained from lookup.
1598 * \param x The reference (starting) value x for which we want 1/x.
1599 * \return An improved approximation with roughly twice as many bits of accuracy.
1601 static inline SimdDouble gmx_simdcall
1602 rcpIter(SimdDouble lu, SimdDouble x)
1604 return lu*fnma(lu, x, SimdDouble(2.0));
1608 /*! \brief Calculate 1/x for SIMD double.
1610 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
1611 * GMX_FLOAT_MAX, i.e. within the range of single precision.
1612 * For the single precision implementation this is obviously always
1613 * true for positive values, but for double precision it adds an
1614 * extra restriction since the first lookup step might have to be
1615 * performed in single precision on some architectures. Note that the
1616 * responsibility for checking falls on you - this routine does not
1619 * \return 1/x. Result is undefined if your argument was invalid.
1621 static inline SimdDouble gmx_simdcall
1624 SimdDouble lu = rcp(x);
1625 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1626 lu = rcpIter(lu, x);
1628 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1629 lu = rcpIter(lu, x);
1631 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1632 lu = rcpIter(lu, x);
1634 #if (GMX_SIMD_RCP_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1635 lu = rcpIter(lu, x);
1640 /*! \brief Division for SIMD doubles
1642 * \param nom Nominator
1643 * \param denom Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
1644 * For single precision this is equivalent to a nonzero argument,
1645 * but in double precision it adds an extra restriction since
1646 * the first lookup step might have to be performed in single
1647 * precision on some architectures. Note that the responsibility
1648 * for checking falls on you - this routine does not check arguments.
1652 * \note This function does not use any masking to avoid problems with
1653 * zero values in the denominator.
1655 static inline SimdDouble gmx_simdcall
1656 operator/(SimdDouble nom, SimdDouble denom)
1658 return nom*inv(denom);
1662 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD double.
1664 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
1665 * Illegal values in the masked-out elements will not lead to
1666 * floating-point exceptions.
1668 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
1669 * GMX_FLOAT_MAX for masked-in entries.
1670 * See \ref invsqrt for the discussion about argument restrictions.
1672 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
1673 * entry was not masked, and 0.0 for masked-out entries.
1675 static inline SimdDouble
1676 maskzInvsqrt(SimdDouble x, SimdDBool m)
1678 SimdDouble lu = maskzRsqrt(x, m);
1679 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1680 lu = rsqrtIter(lu, x);
1682 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1683 lu = rsqrtIter(lu, x);
1685 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1686 lu = rsqrtIter(lu, x);
1688 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1689 lu = rsqrtIter(lu, x);
1694 /*! \brief Calculate 1/x for SIMD double, masked version.
1696 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
1697 * GMX_FLOAT_MAX for masked-in entries.
1698 * See \ref invsqrt for the discussion about argument restrictions.
1700 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
1702 static inline SimdDouble gmx_simdcall
1703 maskzInv(SimdDouble x, SimdDBool m)
1705 SimdDouble lu = maskzRcp(x, m);
1706 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1707 lu = rcpIter(lu, x);
1709 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1710 lu = rcpIter(lu, x);
1712 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1713 lu = rcpIter(lu, x);
1715 #if (GMX_SIMD_RCP_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1716 lu = rcpIter(lu, x);
1722 /*! \brief Calculate sqrt(x) for SIMD doubles.
1724 * \copydetails sqrt(SimdFloat)
1726 template <MathOptimization opt = MathOptimization::Safe>
1727 static inline SimdDouble gmx_simdcall
1730 if (opt == MathOptimization::Safe)
1732 SimdDouble res = maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) < x);
1737 return x * invsqrt(x);
1741 #if !GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
1742 /*! \brief SIMD double log(x). This is the natural logarithm.
1744 * \param x Argument, should be >0.
1745 * \result The natural logarithm of x. Undefined if argument is invalid.
1747 static inline SimdDouble gmx_simdcall
1750 const SimdDouble one(1.0);
1751 const SimdDouble two(2.0);
1752 const SimdDouble invsqrt2(1.0/std::sqrt(2.0));
1753 const SimdDouble corr(0.693147180559945286226764);
1754 const SimdDouble CL15(0.148197055177935105296783);
1755 const SimdDouble CL13(0.153108178020442575739679);
1756 const SimdDouble CL11(0.181837339521549679055568);
1757 const SimdDouble CL9(0.22222194152736701733275);
1758 const SimdDouble CL7(0.285714288030134544449368);
1759 const SimdDouble CL5(0.399999999989941956712869);
1760 const SimdDouble CL3(0.666666666666685503450651);
1761 const SimdDouble CL1(2.0);
1762 SimdDouble fExp, x2, p;
1766 x = frexp(x, &iExp);
1767 fExp = cvtI2R(iExp);
1770 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
1771 fExp = fExp - selectByMask(one, m);
1772 x = x * blend(one, two, m);
1774 x = (x-one) * inv( x+one );
1777 p = fma(CL15, x2, CL13);
1778 p = fma(p, x2, CL11);
1779 p = fma(p, x2, CL9);
1780 p = fma(p, x2, CL7);
1781 p = fma(p, x2, CL5);
1782 p = fma(p, x2, CL3);
1783 p = fma(p, x2, CL1);
1784 p = fma(p, x, corr * fExp);
1790 #if !GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE
1791 /*! \brief SIMD double 2^x.
1793 * \copydetails exp2(SimdFloat)
1795 template <MathOptimization opt = MathOptimization::Safe>
1796 static inline SimdDouble gmx_simdcall
1799 // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
1800 const SimdDouble smallArgLimit(-1023.0);
1801 const SimdDouble CE11(4.435280790452730022081181e-10);
1802 const SimdDouble CE10(7.074105630863314448024247e-09);
1803 const SimdDouble CE9(1.017819803432096698472621e-07);
1804 const SimdDouble CE8(1.321543308956718799557863e-06);
1805 const SimdDouble CE7(0.00001525273348995851746990884);
1806 const SimdDouble CE6(0.0001540353046251466849082632);
1807 const SimdDouble CE5(0.001333355814678995257307880);
1808 const SimdDouble CE4(0.009618129107588335039176502);
1809 const SimdDouble CE3(0.05550410866481992147457793);
1810 const SimdDouble CE2(0.2402265069591015620470894);
1811 const SimdDouble CE1(0.6931471805599453304615075);
1812 const SimdDouble one(1.0);
1815 SimdDouble fexppart;
1818 // Large negative values are valid arguments to exp2(), so there are two
1819 // things we need to account for:
1820 // 1. When the exponents reaches -1023, the (biased) exponent field will be
1821 // zero and we can no longer multiply with it. There are special IEEE
1822 // formats to handle this range, but for now we have to accept that
1823 // we cannot handle those arguments. If input value becomes even more
1824 // negative, it will start to loop and we would end up with invalid
1825 // exponents. Thus, we need to limit or mask this.
1826 // 2. For VERY large negative values, we will have problems that the
1827 // subtraction to get the fractional part loses accuracy, and then we
1828 // can end up with overflows in the polynomial.
1830 // For now, we handle this by clamping smaller arguments to -1023. At this
1831 // point we will already return zero, so we don't need to do anything
1832 // extra for the exponent.
1834 if (opt == MathOptimization::Safe)
1836 x = max(x, smallArgLimit);
1839 fexppart = ldexp(one, cvtR2I(x));
1843 p = fma(CE11, x, CE10);
1859 #if !GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE
1860 /*! \brief SIMD double exp(x).
1862 * \copydetails exp(SimdFloat)
1864 template <MathOptimization opt = MathOptimization::Safe>
1865 static inline SimdDouble gmx_simdcall
1868 const SimdDouble argscale(1.44269504088896340735992468100);
1869 // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
1870 const SimdDouble smallArgLimit(-709.0895657128);
1871 const SimdDouble invargscale0(-0.69314718055966295651160180568695068359375);
1872 const SimdDouble invargscale1(-2.8235290563031577122588448175013436025525412068e-13);
1873 const SimdDouble CE12(2.078375306791423699350304e-09);
1874 const SimdDouble CE11(2.518173854179933105218635e-08);
1875 const SimdDouble CE10(2.755842049600488770111608e-07);
1876 const SimdDouble CE9(2.755691815216689746619849e-06);
1877 const SimdDouble CE8(2.480158383706245033920920e-05);
1878 const SimdDouble CE7(0.0001984127043518048611841321);
1879 const SimdDouble CE6(0.001388888889360258341755930);
1880 const SimdDouble CE5(0.008333333332907368102819109);
1881 const SimdDouble CE4(0.04166666666663836745814631);
1882 const SimdDouble CE3(0.1666666666666796929434570);
1883 const SimdDouble CE2(0.5);
1884 const SimdDouble one(1.0);
1885 SimdDouble fexppart;
1889 // Large negative values are valid arguments to exp2(), so there are two
1890 // things we need to account for:
1891 // 1. When the exponents reaches -1023, the (biased) exponent field will be
1892 // zero and we can no longer multiply with it. There are special IEEE
1893 // formats to handle this range, but for now we have to accept that
1894 // we cannot handle those arguments. If input value becomes even more
1895 // negative, it will start to loop and we would end up with invalid
1896 // exponents. Thus, we need to limit or mask this.
1897 // 2. For VERY large negative values, we will have problems that the
1898 // subtraction to get the fractional part loses accuracy, and then we
1899 // can end up with overflows in the polynomial.
1901 // For now, we handle this by clamping smaller arguments to -1023*ln(2). At this
1902 // point we will already return zero, so we don't need to do anything
1903 // extra for the exponent.
1905 if (opt == MathOptimization::Safe)
1907 x = max(x, smallArgLimit);
1911 fexppart = ldexp(one, cvtR2I(y));
1914 // Extended precision arithmetics
1915 x = fma(invargscale0, intpart, x);
1916 x = fma(invargscale1, intpart, x);
1918 p = fma(CE12, x, CE11);
1919 p = fma(p, x, CE10);
1928 p = fma(p, x * x, x);
1929 x = fma(p, fexppart, fexppart);
1935 /*! \brief SIMD double erf(x).
1937 * \param x The value to calculate erf(x) for.
1940 * This routine achieves very close to full precision, but we do not care about
1941 * the last bit or the subnormal result range.
1943 static inline SimdDouble gmx_simdcall
1946 // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
1947 const SimdDouble CAP4(-0.431780540597889301512e-4);
1948 const SimdDouble CAP3(-0.00578562306260059236059);
1949 const SimdDouble CAP2(-0.028593586920219752446);
1950 const SimdDouble CAP1(-0.315924962948621698209);
1951 const SimdDouble CAP0(0.14952975608477029151);
1953 const SimdDouble CAQ5(-0.374089300177174709737e-5);
1954 const SimdDouble CAQ4(0.00015126584532155383535);
1955 const SimdDouble CAQ3(0.00536692680669480725423);
1956 const SimdDouble CAQ2(0.0668686825594046122636);
1957 const SimdDouble CAQ1(0.402604990869284362773);
1959 const SimdDouble CAoffset(0.9788494110107421875);
1961 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
1962 const SimdDouble CBP6(2.49650423685462752497647637088e-10);
1963 const SimdDouble CBP5(0.00119770193298159629350136085658);
1964 const SimdDouble CBP4(0.0164944422378370965881008942733);
1965 const SimdDouble CBP3(0.0984581468691775932063932439252);
1966 const SimdDouble CBP2(0.317364595806937763843589437418);
1967 const SimdDouble CBP1(0.554167062641455850932670067075);
1968 const SimdDouble CBP0(0.427583576155807163756925301060);
1969 const SimdDouble CBQ7(0.00212288829699830145976198384930);
1970 const SimdDouble CBQ6(0.0334810979522685300554606393425);
1971 const SimdDouble CBQ5(0.2361713785181450957579508850717);
1972 const SimdDouble CBQ4(0.955364736493055670530981883072);
1973 const SimdDouble CBQ3(2.36815675631420037315349279199);
1974 const SimdDouble CBQ2(3.55261649184083035537184223542);
1975 const SimdDouble CBQ1(2.93501136050160872574376997993);
1978 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
1979 const SimdDouble CCP6(-2.8175401114513378771);
1980 const SimdDouble CCP5(-3.22729451764143718517);
1981 const SimdDouble CCP4(-2.5518551727311523996);
1982 const SimdDouble CCP3(-0.687717681153649930619);
1983 const SimdDouble CCP2(-0.212652252872804219852);
1984 const SimdDouble CCP1(0.0175389834052493308818);
1985 const SimdDouble CCP0(0.00628057170626964891937);
1987 const SimdDouble CCQ6(5.48409182238641741584);
1988 const SimdDouble CCQ5(13.5064170191802889145);
1989 const SimdDouble CCQ4(22.9367376522880577224);
1990 const SimdDouble CCQ3(15.930646027911794143);
1991 const SimdDouble CCQ2(11.0567237927800161565);
1992 const SimdDouble CCQ1(2.79257750980575282228);
1994 const SimdDouble CCoffset(0.5579090118408203125);
1996 const SimdDouble one(1.0);
1997 const SimdDouble two(2.0);
1999 SimdDouble xabs, x2, x4, t, t2, w, w2;
2000 SimdDouble PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
2001 SimdDouble PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
2002 SimdDouble PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
2003 SimdDouble res_erf, res_erfcB, res_erfcC, res_erfc, res;
2005 SimdDBool mask, mask_erf, notmask_erf;
2009 mask_erf = (xabs < one);
2010 notmask_erf = (one <= xabs);
2014 PolyAP0 = fma(CAP4, x4, CAP2);
2015 PolyAP1 = fma(CAP3, x4, CAP1);
2016 PolyAP0 = fma(PolyAP0, x4, CAP0);
2017 PolyAP0 = fma(PolyAP1, x2, PolyAP0);
2019 PolyAQ1 = fma(CAQ5, x4, CAQ3);
2020 PolyAQ0 = fma(CAQ4, x4, CAQ2);
2021 PolyAQ1 = fma(PolyAQ1, x4, CAQ1);
2022 PolyAQ0 = fma(PolyAQ0, x4, one);
2023 PolyAQ0 = fma(PolyAQ1, x2, PolyAQ0);
2025 res_erf = PolyAP0 * maskzInv(PolyAQ0, mask_erf);
2026 res_erf = CAoffset + res_erf;
2027 res_erf = x * res_erf;
2029 // Calculate erfc() in range [1,4.5]
2033 PolyBP0 = fma(CBP6, t2, CBP4);
2034 PolyBP1 = fma(CBP5, t2, CBP3);
2035 PolyBP0 = fma(PolyBP0, t2, CBP2);
2036 PolyBP1 = fma(PolyBP1, t2, CBP1);
2037 PolyBP0 = fma(PolyBP0, t2, CBP0);
2038 PolyBP0 = fma(PolyBP1, t, PolyBP0);
2040 PolyBQ1 = fma(CBQ7, t2, CBQ5);
2041 PolyBQ0 = fma(CBQ6, t2, CBQ4);
2042 PolyBQ1 = fma(PolyBQ1, t2, CBQ3);
2043 PolyBQ0 = fma(PolyBQ0, t2, CBQ2);
2044 PolyBQ1 = fma(PolyBQ1, t2, CBQ1);
2045 PolyBQ0 = fma(PolyBQ0, t2, one);
2046 PolyBQ0 = fma(PolyBQ1, t, PolyBQ0);
2048 // The denominator polynomial can be zero outside the range
2049 res_erfcB = PolyBP0 * maskzInv(PolyBQ0, notmask_erf);
2051 res_erfcB = res_erfcB * xabs;
2053 // Calculate erfc() in range [4.5,inf]
2054 w = maskzInv(xabs, notmask_erf);
2057 PolyCP0 = fma(CCP6, w2, CCP4);
2058 PolyCP1 = fma(CCP5, w2, CCP3);
2059 PolyCP0 = fma(PolyCP0, w2, CCP2);
2060 PolyCP1 = fma(PolyCP1, w2, CCP1);
2061 PolyCP0 = fma(PolyCP0, w2, CCP0);
2062 PolyCP0 = fma(PolyCP1, w, PolyCP0);
2064 PolyCQ0 = fma(CCQ6, w2, CCQ4);
2065 PolyCQ1 = fma(CCQ5, w2, CCQ3);
2066 PolyCQ0 = fma(PolyCQ0, w2, CCQ2);
2067 PolyCQ1 = fma(PolyCQ1, w2, CCQ1);
2068 PolyCQ0 = fma(PolyCQ0, w2, one);
2069 PolyCQ0 = fma(PolyCQ1, w, PolyCQ0);
2071 expmx2 = exp( -x2 );
2073 // The denominator polynomial can be zero outside the range
2074 res_erfcC = PolyCP0 * maskzInv(PolyCQ0, notmask_erf);
2075 res_erfcC = res_erfcC + CCoffset;
2076 res_erfcC = res_erfcC * w;
2078 mask = (SimdDouble(4.5) < xabs);
2079 res_erfc = blend(res_erfcB, res_erfcC, mask);
2081 res_erfc = res_erfc * expmx2;
2083 // erfc(x<0) = 2-erfc(|x|)
2084 mask = (x < setZero());
2085 res_erfc = blend(res_erfc, two - res_erfc, mask);
2087 // Select erf() or erfc()
2088 res = blend(one - res_erfc, res_erf, mask_erf);
2093 /*! \brief SIMD double erfc(x).
2095 * \param x The value to calculate erfc(x) for.
2098 * This routine achieves full precision (bar the last bit) over most of the
2099 * input range, but for large arguments where the result is getting close
2100 * to the minimum representable numbers we accept slightly larger errors
2101 * (think results that are in the ballpark of 10^-200 for double)
2102 * since that is not relevant for MD.
2104 static inline SimdDouble gmx_simdcall
2107 // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
2108 const SimdDouble CAP4(-0.431780540597889301512e-4);
2109 const SimdDouble CAP3(-0.00578562306260059236059);
2110 const SimdDouble CAP2(-0.028593586920219752446);
2111 const SimdDouble CAP1(-0.315924962948621698209);
2112 const SimdDouble CAP0(0.14952975608477029151);
2114 const SimdDouble CAQ5(-0.374089300177174709737e-5);
2115 const SimdDouble CAQ4(0.00015126584532155383535);
2116 const SimdDouble CAQ3(0.00536692680669480725423);
2117 const SimdDouble CAQ2(0.0668686825594046122636);
2118 const SimdDouble CAQ1(0.402604990869284362773);
2120 const SimdDouble CAoffset(0.9788494110107421875);
2122 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
2123 const SimdDouble CBP6(2.49650423685462752497647637088e-10);
2124 const SimdDouble CBP5(0.00119770193298159629350136085658);
2125 const SimdDouble CBP4(0.0164944422378370965881008942733);
2126 const SimdDouble CBP3(0.0984581468691775932063932439252);
2127 const SimdDouble CBP2(0.317364595806937763843589437418);
2128 const SimdDouble CBP1(0.554167062641455850932670067075);
2129 const SimdDouble CBP0(0.427583576155807163756925301060);
2130 const SimdDouble CBQ7(0.00212288829699830145976198384930);
2131 const SimdDouble CBQ6(0.0334810979522685300554606393425);
2132 const SimdDouble CBQ5(0.2361713785181450957579508850717);
2133 const SimdDouble CBQ4(0.955364736493055670530981883072);
2134 const SimdDouble CBQ3(2.36815675631420037315349279199);
2135 const SimdDouble CBQ2(3.55261649184083035537184223542);
2136 const SimdDouble CBQ1(2.93501136050160872574376997993);
2139 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
2140 const SimdDouble CCP6(-2.8175401114513378771);
2141 const SimdDouble CCP5(-3.22729451764143718517);
2142 const SimdDouble CCP4(-2.5518551727311523996);
2143 const SimdDouble CCP3(-0.687717681153649930619);
2144 const SimdDouble CCP2(-0.212652252872804219852);
2145 const SimdDouble CCP1(0.0175389834052493308818);
2146 const SimdDouble CCP0(0.00628057170626964891937);
2148 const SimdDouble CCQ6(5.48409182238641741584);
2149 const SimdDouble CCQ5(13.5064170191802889145);
2150 const SimdDouble CCQ4(22.9367376522880577224);
2151 const SimdDouble CCQ3(15.930646027911794143);
2152 const SimdDouble CCQ2(11.0567237927800161565);
2153 const SimdDouble CCQ1(2.79257750980575282228);
2155 const SimdDouble CCoffset(0.5579090118408203125);
2157 const SimdDouble one(1.0);
2158 const SimdDouble two(2.0);
2160 SimdDouble xabs, x2, x4, t, t2, w, w2;
2161 SimdDouble PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
2162 SimdDouble PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
2163 SimdDouble PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
2164 SimdDouble res_erf, res_erfcB, res_erfcC, res_erfc, res;
2166 SimdDBool mask, mask_erf, notmask_erf;
2170 mask_erf = (xabs < one);
2171 notmask_erf = (one <= xabs);
2175 PolyAP0 = fma(CAP4, x4, CAP2);
2176 PolyAP1 = fma(CAP3, x4, CAP1);
2177 PolyAP0 = fma(PolyAP0, x4, CAP0);
2178 PolyAP0 = fma(PolyAP1, x2, PolyAP0);
2179 PolyAQ1 = fma(CAQ5, x4, CAQ3);
2180 PolyAQ0 = fma(CAQ4, x4, CAQ2);
2181 PolyAQ1 = fma(PolyAQ1, x4, CAQ1);
2182 PolyAQ0 = fma(PolyAQ0, x4, one);
2183 PolyAQ0 = fma(PolyAQ1, x2, PolyAQ0);
2185 res_erf = PolyAP0 * maskzInv(PolyAQ0, mask_erf);
2186 res_erf = CAoffset + res_erf;
2187 res_erf = x * res_erf;
2189 // Calculate erfc() in range [1,4.5]
2193 PolyBP0 = fma(CBP6, t2, CBP4);
2194 PolyBP1 = fma(CBP5, t2, CBP3);
2195 PolyBP0 = fma(PolyBP0, t2, CBP2);
2196 PolyBP1 = fma(PolyBP1, t2, CBP1);
2197 PolyBP0 = fma(PolyBP0, t2, CBP0);
2198 PolyBP0 = fma(PolyBP1, t, PolyBP0);
2200 PolyBQ1 = fma(CBQ7, t2, CBQ5);
2201 PolyBQ0 = fma(CBQ6, t2, CBQ4);
2202 PolyBQ1 = fma(PolyBQ1, t2, CBQ3);
2203 PolyBQ0 = fma(PolyBQ0, t2, CBQ2);
2204 PolyBQ1 = fma(PolyBQ1, t2, CBQ1);
2205 PolyBQ0 = fma(PolyBQ0, t2, one);
2206 PolyBQ0 = fma(PolyBQ1, t, PolyBQ0);
2208 // The denominator polynomial can be zero outside the range
2209 res_erfcB = PolyBP0 * maskzInv(PolyBQ0, notmask_erf);
2211 res_erfcB = res_erfcB * xabs;
2213 // Calculate erfc() in range [4.5,inf]
2214 w = maskzInv(xabs, xabs != setZero());
2217 PolyCP0 = fma(CCP6, w2, CCP4);
2218 PolyCP1 = fma(CCP5, w2, CCP3);
2219 PolyCP0 = fma(PolyCP0, w2, CCP2);
2220 PolyCP1 = fma(PolyCP1, w2, CCP1);
2221 PolyCP0 = fma(PolyCP0, w2, CCP0);
2222 PolyCP0 = fma(PolyCP1, w, PolyCP0);
2224 PolyCQ0 = fma(CCQ6, w2, CCQ4);
2225 PolyCQ1 = fma(CCQ5, w2, CCQ3);
2226 PolyCQ0 = fma(PolyCQ0, w2, CCQ2);
2227 PolyCQ1 = fma(PolyCQ1, w2, CCQ1);
2228 PolyCQ0 = fma(PolyCQ0, w2, one);
2229 PolyCQ0 = fma(PolyCQ1, w, PolyCQ0);
2231 expmx2 = exp( -x2 );
2233 // The denominator polynomial can be zero outside the range
2234 res_erfcC = PolyCP0 * maskzInv(PolyCQ0, notmask_erf);
2235 res_erfcC = res_erfcC + CCoffset;
2236 res_erfcC = res_erfcC * w;
2238 mask = (SimdDouble(4.5) < xabs);
2239 res_erfc = blend(res_erfcB, res_erfcC, mask);
2241 res_erfc = res_erfc * expmx2;
2243 // erfc(x<0) = 2-erfc(|x|)
2244 mask = (x < setZero());
2245 res_erfc = blend(res_erfc, two - res_erfc, mask);
2247 // Select erf() or erfc()
2248 res = blend(res_erfc, one - res_erf, mask_erf);
2253 /*! \brief SIMD double sin \& cos.
2255 * \param x The argument to evaluate sin/cos for
2256 * \param[out] sinval Sin(x)
2257 * \param[out] cosval Cos(x)
2259 * This version achieves close to machine precision, but for very large
2260 * magnitudes of the argument we inherently begin to lose accuracy due to the
2261 * argument reduction, despite using extended precision arithmetics internally.
2263 static inline void gmx_simdcall
2264 sincos(SimdDouble x, SimdDouble *sinval, SimdDouble *cosval)
2266 // Constants to subtract Pi/4*x from y while minimizing precision loss
2267 const SimdDouble argred0(-2*0.78539816290140151978);
2268 const SimdDouble argred1(-2*4.9604678871439933374e-10);
2269 const SimdDouble argred2(-2*1.1258708853173288931e-18);
2270 const SimdDouble argred3(-2*1.7607799325916000908e-27);
2271 const SimdDouble two_over_pi(2.0/M_PI);
2272 const SimdDouble const_sin5( 1.58938307283228937328511e-10);
2273 const SimdDouble const_sin4(-2.50506943502539773349318e-08);
2274 const SimdDouble const_sin3( 2.75573131776846360512547e-06);
2275 const SimdDouble const_sin2(-0.000198412698278911770864914);
2276 const SimdDouble const_sin1( 0.0083333333333191845961746);
2277 const SimdDouble const_sin0(-0.166666666666666130709393);
2279 const SimdDouble const_cos7(-1.13615350239097429531523e-11);
2280 const SimdDouble const_cos6( 2.08757471207040055479366e-09);
2281 const SimdDouble const_cos5(-2.75573144028847567498567e-07);
2282 const SimdDouble const_cos4( 2.48015872890001867311915e-05);
2283 const SimdDouble const_cos3(-0.00138888888888714019282329);
2284 const SimdDouble const_cos2( 0.0416666666666665519592062);
2285 const SimdDouble half(0.5);
2286 const SimdDouble one(1.0);
2287 SimdDouble ssign, csign;
2288 SimdDouble x2, y, z, psin, pcos, sss, ccc;
2290 #if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2291 const SimdDInt32 ione(1);
2292 const SimdDInt32 itwo(2);
2295 z = x * two_over_pi;
2299 mask = cvtIB2B((iy & ione) == setZero());
2300 ssign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B((iy & itwo) == itwo));
2301 csign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
2303 const SimdDouble quarter(0.25);
2304 const SimdDouble minusquarter(-0.25);
2306 SimdDBool m1, m2, m3;
2308 /* The most obvious way to find the arguments quadrant in the unit circle
2309 * to calculate the sign is to use integer arithmetic, but that is not
2310 * present in all SIMD implementations. As an alternative, we have devised a
2311 * pure floating-point algorithm that uses truncation for argument reduction
2312 * so that we get a new value 0<=q<1 over the unit circle, and then
2313 * do floating-point comparisons with fractions. This is likely to be
2314 * slightly slower (~10%) due to the longer latencies of floating-point, so
2315 * we only use it when integer SIMD arithmetic is not present.
2319 // It is critical that half-way cases are rounded down
2320 z = fma(x, two_over_pi, half);
2324 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
2325 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
2326 * This removes the 2*Pi periodicity without using any integer arithmetic.
2327 * First check if y had the value 2 or 3, set csign if true.
2330 /* If we have logical operations we can work directly on the signbit, which
2331 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
2332 * Thus, if you are altering defines to debug alternative code paths, the
2333 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
2334 * active or inactive - you will get errors if only one is used.
2336 # if GMX_SIMD_HAVE_LOGICAL
2337 ssign = ssign & SimdDouble(GMX_DOUBLE_NEGZERO);
2338 csign = andNot(q, SimdDouble(GMX_DOUBLE_NEGZERO));
2339 ssign = ssign ^ csign;
2341 ssign = copysign(SimdDouble(1.0), ssign);
2342 csign = copysign(SimdDouble(1.0), q);
2344 ssign = ssign * csign; // swap ssign if csign was set.
2346 // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
2347 m1 = (q < minusquarter);
2348 m2 = (setZero() <= q);
2352 // where mask is FALSE, swap sign.
2353 csign = csign * blend(SimdDouble(-1.0), one, mask);
2355 x = fma(y, argred0, x);
2356 x = fma(y, argred1, x);
2357 x = fma(y, argred2, x);
2358 x = fma(y, argred3, x);
2361 psin = fma(const_sin5, x2, const_sin4);
2362 psin = fma(psin, x2, const_sin3);
2363 psin = fma(psin, x2, const_sin2);
2364 psin = fma(psin, x2, const_sin1);
2365 psin = fma(psin, x2, const_sin0);
2366 psin = fma(psin, x2 * x, x);
2368 pcos = fma(const_cos7, x2, const_cos6);
2369 pcos = fma(pcos, x2, const_cos5);
2370 pcos = fma(pcos, x2, const_cos4);
2371 pcos = fma(pcos, x2, const_cos3);
2372 pcos = fma(pcos, x2, const_cos2);
2373 pcos = fms(pcos, x2, half);
2374 pcos = fma(pcos, x2, one);
2376 sss = blend(pcos, psin, mask);
2377 ccc = blend(psin, pcos, mask);
2378 // See comment for GMX_SIMD_HAVE_LOGICAL section above.
2379 #if GMX_SIMD_HAVE_LOGICAL
2380 *sinval = sss ^ ssign;
2381 *cosval = ccc ^ csign;
2383 *sinval = sss * ssign;
2384 *cosval = ccc * csign;
2388 /*! \brief SIMD double sin(x).
2390 * \param x The argument to evaluate sin for
2393 * \attention Do NOT call both sin & cos if you need both results, since each of them
2394 * will then call \ref sincos and waste a factor 2 in performance.
2396 static inline SimdDouble gmx_simdcall
2404 /*! \brief SIMD double cos(x).
2406 * \param x The argument to evaluate cos for
2409 * \attention Do NOT call both sin & cos if you need both results, since each of them
2410 * will then call \ref sincos and waste a factor 2 in performance.
2412 static inline SimdDouble gmx_simdcall
2420 /*! \brief SIMD double tan(x).
2422 * \param x The argument to evaluate tan for
2425 static inline SimdDouble gmx_simdcall
2428 const SimdDouble argred0(-2*0.78539816290140151978);
2429 const SimdDouble argred1(-2*4.9604678871439933374e-10);
2430 const SimdDouble argred2(-2*1.1258708853173288931e-18);
2431 const SimdDouble argred3(-2*1.7607799325916000908e-27);
2432 const SimdDouble two_over_pi(2.0/M_PI);
2433 const SimdDouble CT15(1.01419718511083373224408e-05);
2434 const SimdDouble CT14(-2.59519791585924697698614e-05);
2435 const SimdDouble CT13(5.23388081915899855325186e-05);
2436 const SimdDouble CT12(-3.05033014433946488225616e-05);
2437 const SimdDouble CT11(7.14707504084242744267497e-05);
2438 const SimdDouble CT10(8.09674518280159187045078e-05);
2439 const SimdDouble CT9(0.000244884931879331847054404);
2440 const SimdDouble CT8(0.000588505168743587154904506);
2441 const SimdDouble CT7(0.00145612788922812427978848);
2442 const SimdDouble CT6(0.00359208743836906619142924);
2443 const SimdDouble CT5(0.00886323944362401618113356);
2444 const SimdDouble CT4(0.0218694882853846389592078);
2445 const SimdDouble CT3(0.0539682539781298417636002);
2446 const SimdDouble CT2(0.133333333333125941821962);
2447 const SimdDouble CT1(0.333333333333334980164153);
2449 SimdDouble x2, p, y, z;
2452 #if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2456 z = x * two_over_pi;
2459 m = cvtIB2B((iy & ione) == ione);
2461 x = fma(y, argred0, x);
2462 x = fma(y, argred1, x);
2463 x = fma(y, argred2, x);
2464 x = fma(y, argred3, x);
2465 x = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), m) ^ x;
2467 const SimdDouble quarter(0.25);
2468 const SimdDouble half(0.5);
2469 const SimdDouble threequarter(0.75);
2471 SimdDBool m1, m2, m3;
2474 z = fma(w, two_over_pi, half);
2478 m1 = (quarter <= q);
2480 m3 = (threequarter <= q);
2483 w = fma(y, argred0, w);
2484 w = fma(y, argred1, w);
2485 w = fma(y, argred2, w);
2486 w = fma(y, argred3, w);
2488 w = blend(w, -w, m);
2489 x = w * copysign( SimdDouble(1.0), x );
2492 p = fma(CT15, x2, CT14);
2493 p = fma(p, x2, CT13);
2494 p = fma(p, x2, CT12);
2495 p = fma(p, x2, CT11);
2496 p = fma(p, x2, CT10);
2497 p = fma(p, x2, CT9);
2498 p = fma(p, x2, CT8);
2499 p = fma(p, x2, CT7);
2500 p = fma(p, x2, CT6);
2501 p = fma(p, x2, CT5);
2502 p = fma(p, x2, CT4);
2503 p = fma(p, x2, CT3);
2504 p = fma(p, x2, CT2);
2505 p = fma(p, x2, CT1);
2506 p = fma(x2, p * x, x);
2508 p = blend( p, maskzInv(p, m), m);
2512 /*! \brief SIMD double asin(x).
2514 * \param x The argument to evaluate asin for
2517 static inline SimdDouble gmx_simdcall
2520 // Same algorithm as cephes library
2521 const SimdDouble limit1(0.625);
2522 const SimdDouble limit2(1e-8);
2523 const SimdDouble one(1.0);
2524 const SimdDouble quarterpi(M_PI/4.0);
2525 const SimdDouble morebits(6.123233995736765886130e-17);
2527 const SimdDouble P5(4.253011369004428248960e-3);
2528 const SimdDouble P4(-6.019598008014123785661e-1);
2529 const SimdDouble P3(5.444622390564711410273e0);
2530 const SimdDouble P2(-1.626247967210700244449e1);
2531 const SimdDouble P1(1.956261983317594739197e1);
2532 const SimdDouble P0(-8.198089802484824371615e0);
2534 const SimdDouble Q4(-1.474091372988853791896e1);
2535 const SimdDouble Q3(7.049610280856842141659e1);
2536 const SimdDouble Q2(-1.471791292232726029859e2);
2537 const SimdDouble Q1(1.395105614657485689735e2);
2538 const SimdDouble Q0(-4.918853881490881290097e1);
2540 const SimdDouble R4(2.967721961301243206100e-3);
2541 const SimdDouble R3(-5.634242780008963776856e-1);
2542 const SimdDouble R2(6.968710824104713396794e0);
2543 const SimdDouble R1(-2.556901049652824852289e1);
2544 const SimdDouble R0(2.853665548261061424989e1);
2546 const SimdDouble S3(-2.194779531642920639778e1);
2547 const SimdDouble S2(1.470656354026814941758e2);
2548 const SimdDouble S1(-3.838770957603691357202e2);
2549 const SimdDouble S0(3.424398657913078477438e2);
2552 SimdDouble zz, ww, z, q, w, zz2, ww2;
2557 SimdDouble nom, denom;
2558 SimdDBool mask, mask2;
2562 mask = (limit1 < xabs);
2570 RA = fma(R4, zz2, R2);
2571 RB = fma(R3, zz2, R1);
2572 RA = fma(RA, zz2, R0);
2573 RA = fma(RB, zz, RA);
2576 SB = fma(S3, zz2, S1);
2578 SA = fma(SA, zz2, S0);
2579 SA = fma(SB, zz, SA);
2582 PA = fma(P5, ww2, P3);
2583 PB = fma(P4, ww2, P2);
2584 PA = fma(PA, ww2, P1);
2585 PB = fma(PB, ww2, P0);
2586 PA = fma(PA, ww, PB);
2589 QB = fma(Q4, ww2, Q2);
2591 QA = fma(QA, ww2, Q1);
2592 QB = fma(QB, ww2, Q0);
2593 QA = fma(QA, ww, QB);
2598 nom = blend( PA, RA, mask );
2599 denom = blend( QA, SA, mask );
2601 mask2 = (limit2 < xabs);
2602 q = nom * maskzInv(denom, mask2);
2607 zz = fms(zz, q, morebits);
2614 z = blend( w, z, mask );
2616 z = blend( xabs, z, mask2 );
2623 /*! \brief SIMD double acos(x).
2625 * \param x The argument to evaluate acos for
2628 static inline SimdDouble gmx_simdcall
2631 const SimdDouble one(1.0);
2632 const SimdDouble half(0.5);
2633 const SimdDouble quarterpi0(7.85398163397448309616e-1);
2634 const SimdDouble quarterpi1(6.123233995736765886130e-17);
2637 SimdDouble z, z1, z2;
2640 z1 = half * (one - x);
2642 z = blend( x, z1, mask1 );
2648 z2 = quarterpi0 - z;
2649 z2 = z2 + quarterpi1;
2650 z2 = z2 + quarterpi0;
2652 z = blend(z2, z1, mask1);
2657 /*! \brief SIMD double asin(x).
2659 * \param x The argument to evaluate atan for
2660 * \result Atan(x), same argument/value range as standard math library.
2662 static inline SimdDouble gmx_simdcall
2665 // Same algorithm as cephes library
2666 const SimdDouble limit1(0.66);
2667 const SimdDouble limit2(2.41421356237309504880);
2668 const SimdDouble quarterpi(M_PI/4.0);
2669 const SimdDouble halfpi(M_PI/2.0);
2670 const SimdDouble mone(-1.0);
2671 const SimdDouble morebits1(0.5*6.123233995736765886130E-17);
2672 const SimdDouble morebits2(6.123233995736765886130E-17);
2674 const SimdDouble P4(-8.750608600031904122785E-1);
2675 const SimdDouble P3(-1.615753718733365076637E1);
2676 const SimdDouble P2(-7.500855792314704667340E1);
2677 const SimdDouble P1(-1.228866684490136173410E2);
2678 const SimdDouble P0(-6.485021904942025371773E1);
2680 const SimdDouble Q4(2.485846490142306297962E1);
2681 const SimdDouble Q3(1.650270098316988542046E2);
2682 const SimdDouble Q2(4.328810604912902668951E2);
2683 const SimdDouble Q1(4.853903996359136964868E2);
2684 const SimdDouble Q0(1.945506571482613964425E2);
2686 SimdDouble y, xabs, t1, t2;
2688 SimdDouble P_A, P_B, Q_A, Q_B;
2689 SimdDBool mask1, mask2;
2693 mask1 = (limit1 < xabs);
2694 mask2 = (limit2 < xabs);
2696 t1 = (xabs + mone) * maskzInv(xabs - mone, mask1);
2697 t2 = mone * maskzInv(xabs, mask2);
2699 y = selectByMask(quarterpi, mask1);
2700 y = blend(y, halfpi, mask2);
2701 xabs = blend(xabs, t1, mask1);
2702 xabs = blend(xabs, t2, mask2);
2707 P_A = fma(P4, z2, P2);
2708 P_B = fma(P3, z2, P1);
2709 P_A = fma(P_A, z2, P0);
2710 P_A = fma(P_B, z, P_A);
2713 Q_B = fma(Q4, z2, Q2);
2715 Q_A = fma(Q_A, z2, Q1);
2716 Q_B = fma(Q_B, z2, Q0);
2717 Q_A = fma(Q_A, z, Q_B);
2721 z = fma(z, xabs, xabs);
2723 t1 = selectByMask(morebits1, mask1);
2724 t1 = blend(t1, morebits2, mask2);
2734 /*! \brief SIMD double atan2(y,x).
2736 * \param y Y component of vector, any quartile
2737 * \param x X component of vector, any quartile
2738 * \result Atan(y,x), same argument/value range as standard math library.
2740 * \note This routine should provide correct results for all finite
2741 * non-zero or positive-zero arguments. However, negative zero arguments will
2742 * be treated as positive zero, which means the return value will deviate from
2743 * the standard math library atan2(y,x) for those cases. That should not be
2744 * of any concern in Gromacs, and in particular it will not affect calculations
2745 * of angles from vectors.
2747 static inline SimdDouble gmx_simdcall
2748 atan2(SimdDouble y, SimdDouble x)
2750 const SimdDouble pi(M_PI);
2751 const SimdDouble halfpi(M_PI/2.0);
2752 SimdDouble xinv, p, aoffset;
2753 SimdDBool mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
2755 mask_xnz = x != setZero();
2756 mask_ynz = y != setZero();
2757 mask_xlt0 = (x < setZero());
2758 mask_ylt0 = (y < setZero());
2760 aoffset = selectByNotMask(halfpi, mask_xnz);
2761 aoffset = selectByMask(aoffset, mask_ynz);
2763 aoffset = blend(aoffset, pi, mask_xlt0);
2764 aoffset = blend(aoffset, -aoffset, mask_ylt0);
2766 xinv = maskzInv(x, mask_xnz);
2775 /*! \brief Calculate the force correction due to PME analytically in SIMD double.
2777 * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
2778 * interaction distance and beta the ewald splitting parameters.
2779 * \result Correction factor to coulomb force.
2781 * This routine is meant to enable analytical evaluation of the
2782 * direct-space PME electrostatic force to avoid tables. For details, see the
2783 * single precision function.
2785 static inline SimdDouble gmx_simdcall
2786 pmeForceCorrection(SimdDouble z2)
2788 const SimdDouble FN10(-8.0072854618360083154e-14);
2789 const SimdDouble FN9(1.1859116242260148027e-11);
2790 const SimdDouble FN8(-8.1490406329798423616e-10);
2791 const SimdDouble FN7(3.4404793543907847655e-8);
2792 const SimdDouble FN6(-9.9471420832602741006e-7);
2793 const SimdDouble FN5(0.000020740315999115847456);
2794 const SimdDouble FN4(-0.00031991745139313364005);
2795 const SimdDouble FN3(0.0035074449373659008203);
2796 const SimdDouble FN2(-0.031750380176100813405);
2797 const SimdDouble FN1(0.13884101728898463426);
2798 const SimdDouble FN0(-0.75225277815249618847);
2800 const SimdDouble FD5(0.000016009278224355026701);
2801 const SimdDouble FD4(0.00051055686934806966046);
2802 const SimdDouble FD3(0.0081803507497974289008);
2803 const SimdDouble FD2(0.077181146026670287235);
2804 const SimdDouble FD1(0.41543303143712535988);
2805 const SimdDouble FD0(1.0);
2808 SimdDouble polyFN0, polyFN1, polyFD0, polyFD1;
2812 polyFD1 = fma(FD5, z4, FD3);
2813 polyFD1 = fma(polyFD1, z4, FD1);
2814 polyFD1 = polyFD1 * z2;
2815 polyFD0 = fma(FD4, z4, FD2);
2816 polyFD0 = fma(polyFD0, z4, FD0);
2817 polyFD0 = polyFD0 + polyFD1;
2819 polyFD0 = inv(polyFD0);
2821 polyFN0 = fma(FN10, z4, FN8);
2822 polyFN0 = fma(polyFN0, z4, FN6);
2823 polyFN0 = fma(polyFN0, z4, FN4);
2824 polyFN0 = fma(polyFN0, z4, FN2);
2825 polyFN0 = fma(polyFN0, z4, FN0);
2826 polyFN1 = fma(FN9, z4, FN7);
2827 polyFN1 = fma(polyFN1, z4, FN5);
2828 polyFN1 = fma(polyFN1, z4, FN3);
2829 polyFN1 = fma(polyFN1, z4, FN1);
2830 polyFN0 = fma(polyFN1, z2, polyFN0);
2833 return polyFN0 * polyFD0;
2838 /*! \brief Calculate the potential correction due to PME analytically in SIMD double.
2840 * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
2841 * interaction distance and beta the ewald splitting parameters.
2842 * \result Correction factor to coulomb force.
2844 * This routine is meant to enable analytical evaluation of the
2845 * direct-space PME electrostatic potential to avoid tables. For details, see the
2846 * single precision function.
2848 static inline SimdDouble gmx_simdcall
2849 pmePotentialCorrection(SimdDouble z2)
2851 const SimdDouble VN9(-9.3723776169321855475e-13);
2852 const SimdDouble VN8(1.2280156762674215741e-10);
2853 const SimdDouble VN7(-7.3562157912251309487e-9);
2854 const SimdDouble VN6(2.6215886208032517509e-7);
2855 const SimdDouble VN5(-4.9532491651265819499e-6);
2856 const SimdDouble VN4(0.00025907400778966060389);
2857 const SimdDouble VN3(0.0010585044856156469792);
2858 const SimdDouble VN2(0.045247661136833092885);
2859 const SimdDouble VN1(0.11643931522926034421);
2860 const SimdDouble VN0(1.1283791671726767970);
2862 const SimdDouble VD5(0.000021784709867336150342);
2863 const SimdDouble VD4(0.00064293662010911388448);
2864 const SimdDouble VD3(0.0096311444822588683504);
2865 const SimdDouble VD2(0.085608012351550627051);
2866 const SimdDouble VD1(0.43652499166614811084);
2867 const SimdDouble VD0(1.0);
2870 SimdDouble polyVN0, polyVN1, polyVD0, polyVD1;
2874 polyVD1 = fma(VD5, z4, VD3);
2875 polyVD0 = fma(VD4, z4, VD2);
2876 polyVD1 = fma(polyVD1, z4, VD1);
2877 polyVD0 = fma(polyVD0, z4, VD0);
2878 polyVD0 = fma(polyVD1, z2, polyVD0);
2880 polyVD0 = inv(polyVD0);
2882 polyVN1 = fma(VN9, z4, VN7);
2883 polyVN0 = fma(VN8, z4, VN6);
2884 polyVN1 = fma(polyVN1, z4, VN5);
2885 polyVN0 = fma(polyVN0, z4, VN4);
2886 polyVN1 = fma(polyVN1, z4, VN3);
2887 polyVN0 = fma(polyVN0, z4, VN2);
2888 polyVN1 = fma(polyVN1, z4, VN1);
2889 polyVN0 = fma(polyVN0, z4, VN0);
2890 polyVN0 = fma(polyVN1, z2, polyVN0);
2892 return polyVN0 * polyVD0;
2898 /*! \name SIMD math functions for double prec. data, single prec. accuracy
2900 * \note In some cases we do not need full double accuracy of individual
2901 * SIMD math functions, although the data is stored in double precision
2902 * SIMD registers. This might be the case for special algorithms, or
2903 * if the architecture does not support single precision.
2904 * Since the full double precision evaluation of math functions
2905 * typically require much more expensive polynomial approximations
2906 * these functions implement the algorithms used in the single precision
2907 * SIMD math functions, but they operate on double precision
2913 /*********************************************************************
2914 * SIMD MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
2915 *********************************************************************/
2917 /*! \brief Calculate 1/sqrt(x) for SIMD double, but in single accuracy.
2919 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
2920 * GMX_FLOAT_MAX, i.e. within the range of single precision.
2921 * For the single precision implementation this is obviously always
2922 * true for positive values, but for double precision it adds an
2923 * extra restriction since the first lookup step might have to be
2924 * performed in single precision on some architectures. Note that the
2925 * responsibility for checking falls on you - this routine does not
2928 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
2930 static inline SimdDouble gmx_simdcall
2931 invsqrtSingleAccuracy(SimdDouble x)
2933 SimdDouble lu = rsqrt(x);
2934 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2935 lu = rsqrtIter(lu, x);
2937 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2938 lu = rsqrtIter(lu, x);
2940 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2941 lu = rsqrtIter(lu, x);
2946 /*! \brief 1/sqrt(x) for masked-in entries of SIMD double, but in single accuracy.
2948 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
2949 * Illegal values in the masked-out elements will not lead to
2950 * floating-point exceptions.
2952 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
2953 * GMX_FLOAT_MAX, i.e. within the range of single precision.
2954 * For the single precision implementation this is obviously always
2955 * true for positive values, but for double precision it adds an
2956 * extra restriction since the first lookup step might have to be
2957 * performed in single precision on some architectures. Note that the
2958 * responsibility for checking falls on you - this routine does not
2962 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
2963 * entry was not masked, and 0.0 for masked-out entries.
2965 static inline SimdDouble
2966 maskzInvsqrtSingleAccuracy(SimdDouble x, SimdDBool m)
2968 SimdDouble lu = maskzRsqrt(x, m);
2969 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2970 lu = rsqrtIter(lu, x);
2972 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2973 lu = rsqrtIter(lu, x);
2975 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2976 lu = rsqrtIter(lu, x);
2981 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles, but single accuracy.
2983 * \param x0 First set of arguments, x0 must be in single range (see below).
2984 * \param x1 Second set of arguments, x1 must be in single range (see below).
2985 * \param[out] out0 Result 1/sqrt(x0)
2986 * \param[out] out1 Result 1/sqrt(x1)
2988 * In particular for double precision we can sometimes calculate square root
2989 * pairs slightly faster by using single precision until the very last step.
2991 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
2992 * GMX_FLOAT_MAX, i.e. within the range of single precision.
2993 * For the single precision implementation this is obviously always
2994 * true for positive values, but for double precision it adds an
2995 * extra restriction since the first lookup step might have to be
2996 * performed in single precision on some architectures. Note that the
2997 * responsibility for checking falls on you - this routine does not
3000 static inline void gmx_simdcall
3001 invsqrtPairSingleAccuracy(SimdDouble x0, SimdDouble x1,
3002 SimdDouble *out0, SimdDouble *out1)
3004 #if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH) && (GMX_SIMD_RSQRT_BITS < 22)
3005 SimdFloat xf = cvtDD2F(x0, x1);
3006 SimdFloat luf = rsqrt(xf);
3007 SimdDouble lu0, lu1;
3008 // Intermediate target is single - mantissa+1 bits
3009 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3010 luf = rsqrtIter(luf, xf);
3012 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3013 luf = rsqrtIter(luf, xf);
3015 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3016 luf = rsqrtIter(luf, xf);
3018 cvtF2DD(luf, &lu0, &lu1);
3019 // We now have single-precision accuracy values in lu0/lu1
3023 *out0 = invsqrtSingleAccuracy(x0);
3024 *out1 = invsqrtSingleAccuracy(x1);
3028 /*! \brief Calculate 1/x for SIMD double, but in single accuracy.
3030 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
3031 * GMX_FLOAT_MAX, i.e. within the range of single precision.
3032 * For the single precision implementation this is obviously always
3033 * true for positive values, but for double precision it adds an
3034 * extra restriction since the first lookup step might have to be
3035 * performed in single precision on some architectures. Note that the
3036 * responsibility for checking falls on you - this routine does not
3039 * \return 1/x. Result is undefined if your argument was invalid.
3041 static inline SimdDouble gmx_simdcall
3042 invSingleAccuracy(SimdDouble x)
3044 SimdDouble lu = rcp(x);
3045 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3046 lu = rcpIter(lu, x);
3048 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3049 lu = rcpIter(lu, x);
3051 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3052 lu = rcpIter(lu, x);
3057 /*! \brief 1/x for masked entries of SIMD double, single accuracy.
3059 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
3060 * GMX_FLOAT_MAX, i.e. within the range of single precision.
3061 * For the single precision implementation this is obviously always
3062 * true for positive values, but for double precision it adds an
3063 * extra restriction since the first lookup step might have to be
3064 * performed in single precision on some architectures. Note that the
3065 * responsibility for checking falls on you - this routine does not
3069 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
3071 static inline SimdDouble gmx_simdcall
3072 maskzInvSingleAccuracy(SimdDouble x, SimdDBool m)
3074 SimdDouble lu = maskzRcp(x, m);
3075 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3076 lu = rcpIter(lu, x);
3078 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3079 lu = rcpIter(lu, x);
3081 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3082 lu = rcpIter(lu, x);
3088 /*! \brief Calculate sqrt(x) (correct for 0.0) for SIMD double, with single accuracy.
3090 * \copydetails sqrt(SimdFloat)
3092 template <MathOptimization opt = MathOptimization::Safe>
3093 static inline SimdDouble gmx_simdcall
3094 sqrtSingleAccuracy(SimdDouble x)
3096 if (opt == MathOptimization::Safe)
3098 SimdDouble res = maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) < x);
3103 return x * invsqrtSingleAccuracy(x);
3108 /*! \brief SIMD log(x). Double precision SIMD data, single accuracy.
3110 * \param x Argument, should be >0.
3111 * \result The natural logarithm of x. Undefined if argument is invalid.
3113 static inline SimdDouble gmx_simdcall
3114 logSingleAccuracy(SimdDouble x)
3116 const SimdDouble one(1.0);
3117 const SimdDouble two(2.0);
3118 const SimdDouble sqrt2(std::sqrt(2.0));
3119 const SimdDouble corr(0.693147180559945286226764);
3120 const SimdDouble CL9(0.2371599674224853515625);
3121 const SimdDouble CL7(0.285279005765914916992188);
3122 const SimdDouble CL5(0.400005519390106201171875);
3123 const SimdDouble CL3(0.666666567325592041015625);
3124 const SimdDouble CL1(2.0);
3125 SimdDouble fexp, x2, p;
3129 x = frexp(x, &iexp);
3130 fexp = cvtI2R(iexp);
3133 // Adjust to non-IEEE format for x<sqrt(2): exponent -= 1, mantissa *= 2.0
3134 fexp = fexp - selectByMask(one, mask);
3135 x = x * blend(one, two, mask);
3137 x = (x - one) * invSingleAccuracy( x + one );
3140 p = fma(CL9, x2, CL7);
3141 p = fma(p, x2, CL5);
3142 p = fma(p, x2, CL3);
3143 p = fma(p, x2, CL1);
3144 p = fma(p, x, corr * fexp);
3149 /*! \brief SIMD 2^x. Double precision SIMD, single accuracy.
3151 * \copydetails exp2(SimdFloat)
3153 template <MathOptimization opt = MathOptimization::Safe>
3154 static inline SimdDouble gmx_simdcall
3155 exp2SingleAccuracy(SimdDouble x)
3157 // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
3158 const SimdDouble smallArgLimit(-1023.0);
3159 const SimdDouble CC6(0.0001534581200287996416911311);
3160 const SimdDouble CC5(0.001339993121934088894618990);
3161 const SimdDouble CC4(0.009618488957115180159497841);
3162 const SimdDouble CC3(0.05550328776964726865751735);
3163 const SimdDouble CC2(0.2402264689063408646490722);
3164 const SimdDouble CC1(0.6931472057372680777553816);
3165 const SimdDouble one(1.0);
3171 // Large negative values are valid arguments to exp2(), so there are two
3172 // things we need to account for:
3173 // 1. When the exponents reaches -1023, the (biased) exponent field will be
3174 // zero and we can no longer multiply with it. There are special IEEE
3175 // formats to handle this range, but for now we have to accept that
3176 // we cannot handle those arguments. If input value becomes even more
3177 // negative, it will start to loop and we would end up with invalid
3178 // exponents. Thus, we need to limit or mask this.
3179 // 2. For VERY large negative values, we will have problems that the
3180 // subtraction to get the fractional part loses accuracy, and then we
3181 // can end up with overflows in the polynomial.
3183 // For now, we handle this by clamping smaller arguments to -1023. At this
3184 // point we will already return zero, so we don't need to do anything
3185 // extra for the exponent.
3187 if (opt == MathOptimization::Safe)
3189 x = max(x, smallArgLimit);
3196 p = fma(CC6, x, CC5);
3209 /*! \brief SIMD exp(x). Double precision SIMD, single accuracy.
3211 * \copydetails exp(SimdFloat)
3213 template <MathOptimization opt = MathOptimization::Safe>
3214 static inline SimdDouble gmx_simdcall
3215 expSingleAccuracy(SimdDouble x)
3217 const SimdDouble argscale(1.44269504088896341);
3218 // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
3219 const SimdDouble smallArgLimit(-709.0895657128);
3220 const SimdDouble invargscale(-0.69314718055994528623);
3221 const SimdDouble CC4(0.00136324646882712841033936);
3222 const SimdDouble CC3(0.00836596917361021041870117);
3223 const SimdDouble CC2(0.0416710823774337768554688);
3224 const SimdDouble CC1(0.166665524244308471679688);
3225 const SimdDouble CC0(0.499999850988388061523438);
3226 const SimdDouble one(1.0);
3231 // Large negative values are valid arguments to exp2(), so there are two
3232 // things we need to account for:
3233 // 1. When the exponents reaches -127, the (biased) exponent field will be
3234 // zero and we can no longer multiply with it. There are special IEEE
3235 // formats to handle this range, but for now we have to accept that
3236 // we cannot handle those arguments. If input value becomes even more
3237 // negative, it will start to loop and we would end up with invalid
3238 // exponents. Thus, we need to limit or mask this.
3239 // 2. For VERY large negative values, we will have problems that the
3240 // subtraction to get the fractional part loses accuracy, and then we
3241 // can end up with overflows in the polynomial.
3243 // For now, we handle this by clamping smaller arguments to -127. At this
3244 // point we will already return zero, so we don't need to do anything
3245 // extra for the exponent.
3247 if (opt == MathOptimization::Safe)
3249 x = max(x, smallArgLimit);
3254 intpart = round(y); // use same rounding algorithm here
3256 // Extended precision arithmetics not needed since
3257 // we have double precision and only need single accuracy.
3258 x = fma(invargscale, intpart, x);
3260 p = fma(CC4, x, CC3);
3271 /*! \brief SIMD erf(x). Double precision SIMD data, single accuracy.
3273 * \param x The value to calculate erf(x) for.
3276 * This routine achieves very close to single precision, but we do not care about
3277 * the last bit or the subnormal result range.
3279 static inline SimdDouble gmx_simdcall
3280 erfSingleAccuracy(SimdDouble x)
3282 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
3283 const SimdDouble CA6(7.853861353153693e-5);
3284 const SimdDouble CA5(-8.010193625184903e-4);
3285 const SimdDouble CA4(5.188327685732524e-3);
3286 const SimdDouble CA3(-2.685381193529856e-2);
3287 const SimdDouble CA2(1.128358514861418e-1);
3288 const SimdDouble CA1(-3.761262582423300e-1);
3289 const SimdDouble CA0(1.128379165726710);
3290 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
3291 const SimdDouble CB9(-0.0018629930017603923);
3292 const SimdDouble CB8(0.003909821287598495);
3293 const SimdDouble CB7(-0.0052094582210355615);
3294 const SimdDouble CB6(0.005685614362160572);
3295 const SimdDouble CB5(-0.0025367682853477272);
3296 const SimdDouble CB4(-0.010199799682318782);
3297 const SimdDouble CB3(0.04369575504816542);
3298 const SimdDouble CB2(-0.11884063474674492);
3299 const SimdDouble CB1(0.2732120154030589);
3300 const SimdDouble CB0(0.42758357702025784);
3301 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
3302 const SimdDouble CC10(-0.0445555913112064);
3303 const SimdDouble CC9(0.21376355144663348);
3304 const SimdDouble CC8(-0.3473187200259257);
3305 const SimdDouble CC7(0.016690861551248114);
3306 const SimdDouble CC6(0.7560973182491192);
3307 const SimdDouble CC5(-1.2137903600145787);
3308 const SimdDouble CC4(0.8411872321232948);
3309 const SimdDouble CC3(-0.08670413896296343);
3310 const SimdDouble CC2(-0.27124782687240334);
3311 const SimdDouble CC1(-0.0007502488047806069);
3312 const SimdDouble CC0(0.5642114853803148);
3313 const SimdDouble one(1.0);
3314 const SimdDouble two(2.0);
3316 SimdDouble x2, x4, y;
3317 SimdDouble t, t2, w, w2;
3318 SimdDouble pA0, pA1, pB0, pB1, pC0, pC1;
3320 SimdDouble res_erf, res_erfc, res;
3321 SimdDBool mask, msk_erf;
3327 pA0 = fma(CA6, x4, CA4);
3328 pA1 = fma(CA5, x4, CA3);
3329 pA0 = fma(pA0, x4, CA2);
3330 pA1 = fma(pA1, x4, CA1);
3332 pA0 = fma(pA1, x2, pA0);
3333 // Constant term must come last for precision reasons
3340 msk_erf = (SimdDouble(0.75) <= y);
3341 t = maskzInvSingleAccuracy(y, msk_erf);
3346 expmx2 = expSingleAccuracy( -y*y );
3348 pB1 = fma(CB9, w2, CB7);
3349 pB0 = fma(CB8, w2, CB6);
3350 pB1 = fma(pB1, w2, CB5);
3351 pB0 = fma(pB0, w2, CB4);
3352 pB1 = fma(pB1, w2, CB3);
3353 pB0 = fma(pB0, w2, CB2);
3354 pB1 = fma(pB1, w2, CB1);
3355 pB0 = fma(pB0, w2, CB0);
3356 pB0 = fma(pB1, w, pB0);
3358 pC0 = fma(CC10, t2, CC8);
3359 pC1 = fma(CC9, t2, CC7);
3360 pC0 = fma(pC0, t2, CC6);
3361 pC1 = fma(pC1, t2, CC5);
3362 pC0 = fma(pC0, t2, CC4);
3363 pC1 = fma(pC1, t2, CC3);
3364 pC0 = fma(pC0, t2, CC2);
3365 pC1 = fma(pC1, t2, CC1);
3367 pC0 = fma(pC0, t2, CC0);
3368 pC0 = fma(pC1, t, pC0);
3371 // Select pB0 or pC0 for erfc()
3373 res_erfc = blend(pB0, pC0, mask);
3374 res_erfc = res_erfc * expmx2;
3376 // erfc(x<0) = 2-erfc(|x|)
3377 mask = (x < setZero());
3378 res_erfc = blend(res_erfc, two - res_erfc, mask);
3380 // Select erf() or erfc()
3381 mask = (y < SimdDouble(0.75));
3382 res = blend(one - res_erfc, res_erf, mask);
3387 /*! \brief SIMD erfc(x). Double precision SIMD data, single accuracy.
3389 * \param x The value to calculate erfc(x) for.
3392 * This routine achieves singleprecision (bar the last bit) over most of the
3393 * input range, but for large arguments where the result is getting close
3394 * to the minimum representable numbers we accept slightly larger errors
3395 * (think results that are in the ballpark of 10^-30) since that is not
3398 static inline SimdDouble gmx_simdcall
3399 erfcSingleAccuracy(SimdDouble x)
3401 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
3402 const SimdDouble CA6(7.853861353153693e-5);
3403 const SimdDouble CA5(-8.010193625184903e-4);
3404 const SimdDouble CA4(5.188327685732524e-3);
3405 const SimdDouble CA3(-2.685381193529856e-2);
3406 const SimdDouble CA2(1.128358514861418e-1);
3407 const SimdDouble CA1(-3.761262582423300e-1);
3408 const SimdDouble CA0(1.128379165726710);
3409 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
3410 const SimdDouble CB9(-0.0018629930017603923);
3411 const SimdDouble CB8(0.003909821287598495);
3412 const SimdDouble CB7(-0.0052094582210355615);
3413 const SimdDouble CB6(0.005685614362160572);
3414 const SimdDouble CB5(-0.0025367682853477272);
3415 const SimdDouble CB4(-0.010199799682318782);
3416 const SimdDouble CB3(0.04369575504816542);
3417 const SimdDouble CB2(-0.11884063474674492);
3418 const SimdDouble CB1(0.2732120154030589);
3419 const SimdDouble CB0(0.42758357702025784);
3420 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
3421 const SimdDouble CC10(-0.0445555913112064);
3422 const SimdDouble CC9(0.21376355144663348);
3423 const SimdDouble CC8(-0.3473187200259257);
3424 const SimdDouble CC7(0.016690861551248114);
3425 const SimdDouble CC6(0.7560973182491192);
3426 const SimdDouble CC5(-1.2137903600145787);
3427 const SimdDouble CC4(0.8411872321232948);
3428 const SimdDouble CC3(-0.08670413896296343);
3429 const SimdDouble CC2(-0.27124782687240334);
3430 const SimdDouble CC1(-0.0007502488047806069);
3431 const SimdDouble CC0(0.5642114853803148);
3432 const SimdDouble one(1.0);
3433 const SimdDouble two(2.0);
3435 SimdDouble x2, x4, y;
3436 SimdDouble t, t2, w, w2;
3437 SimdDouble pA0, pA1, pB0, pB1, pC0, pC1;
3439 SimdDouble res_erf, res_erfc, res;
3440 SimdDBool mask, msk_erf;
3446 pA0 = fma(CA6, x4, CA4);
3447 pA1 = fma(CA5, x4, CA3);
3448 pA0 = fma(pA0, x4, CA2);
3449 pA1 = fma(pA1, x4, CA1);
3451 pA0 = fma(pA0, x4, pA1);
3452 // Constant term must come last for precision reasons
3459 msk_erf = (SimdDouble(0.75) <= y);
3460 t = maskzInvSingleAccuracy(y, msk_erf);
3465 expmx2 = expSingleAccuracy( -y*y );
3467 pB1 = fma(CB9, w2, CB7);
3468 pB0 = fma(CB8, w2, CB6);
3469 pB1 = fma(pB1, w2, CB5);
3470 pB0 = fma(pB0, w2, CB4);
3471 pB1 = fma(pB1, w2, CB3);
3472 pB0 = fma(pB0, w2, CB2);
3473 pB1 = fma(pB1, w2, CB1);
3474 pB0 = fma(pB0, w2, CB0);
3475 pB0 = fma(pB1, w, pB0);
3477 pC0 = fma(CC10, t2, CC8);
3478 pC1 = fma(CC9, t2, CC7);
3479 pC0 = fma(pC0, t2, CC6);
3480 pC1 = fma(pC1, t2, CC5);
3481 pC0 = fma(pC0, t2, CC4);
3482 pC1 = fma(pC1, t2, CC3);
3483 pC0 = fma(pC0, t2, CC2);
3484 pC1 = fma(pC1, t2, CC1);
3486 pC0 = fma(pC0, t2, CC0);
3487 pC0 = fma(pC1, t, pC0);
3490 // Select pB0 or pC0 for erfc()
3492 res_erfc = blend(pB0, pC0, mask);
3493 res_erfc = res_erfc * expmx2;
3495 // erfc(x<0) = 2-erfc(|x|)
3496 mask = (x < setZero());
3497 res_erfc = blend(res_erfc, two - res_erfc, mask);
3499 // Select erf() or erfc()
3500 mask = (y < SimdDouble(0.75));
3501 res = blend(res_erfc, one - res_erf, mask);
3506 /*! \brief SIMD sin \& cos. Double precision SIMD data, single accuracy.
3508 * \param x The argument to evaluate sin/cos for
3509 * \param[out] sinval Sin(x)
3510 * \param[out] cosval Cos(x)
3512 static inline void gmx_simdcall
3513 sinCosSingleAccuracy(SimdDouble x, SimdDouble *sinval, SimdDouble *cosval)
3515 // Constants to subtract Pi/4*x from y while minimizing precision loss
3516 const SimdDouble argred0(2*0.78539816290140151978);
3517 const SimdDouble argred1(2*4.9604678871439933374e-10);
3518 const SimdDouble argred2(2*1.1258708853173288931e-18);
3519 const SimdDouble two_over_pi(2.0/M_PI);
3520 const SimdDouble const_sin2(-1.9515295891e-4);
3521 const SimdDouble const_sin1( 8.3321608736e-3);
3522 const SimdDouble const_sin0(-1.6666654611e-1);
3523 const SimdDouble const_cos2( 2.443315711809948e-5);
3524 const SimdDouble const_cos1(-1.388731625493765e-3);
3525 const SimdDouble const_cos0( 4.166664568298827e-2);
3527 const SimdDouble half(0.5);
3528 const SimdDouble one(1.0);
3529 SimdDouble ssign, csign;
3530 SimdDouble x2, y, z, psin, pcos, sss, ccc;
3533 #if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
3534 const SimdDInt32 ione(1);
3535 const SimdDInt32 itwo(2);
3538 z = x * two_over_pi;
3542 mask = cvtIB2B((iy & ione) == setZero());
3543 ssign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B((iy & itwo) == itwo));
3544 csign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
3546 const SimdDouble quarter(0.25);
3547 const SimdDouble minusquarter(-0.25);
3549 SimdDBool m1, m2, m3;
3551 /* The most obvious way to find the arguments quadrant in the unit circle
3552 * to calculate the sign is to use integer arithmetic, but that is not
3553 * present in all SIMD implementations. As an alternative, we have devised a
3554 * pure floating-point algorithm that uses truncation for argument reduction
3555 * so that we get a new value 0<=q<1 over the unit circle, and then
3556 * do floating-point comparisons with fractions. This is likely to be
3557 * slightly slower (~10%) due to the longer latencies of floating-point, so
3558 * we only use it when integer SIMD arithmetic is not present.
3562 // It is critical that half-way cases are rounded down
3563 z = fma(x, two_over_pi, half);
3567 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
3568 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
3569 * This removes the 2*Pi periodicity without using any integer arithmetic.
3570 * First check if y had the value 2 or 3, set csign if true.
3573 /* If we have logical operations we can work directly on the signbit, which
3574 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
3575 * Thus, if you are altering defines to debug alternative code paths, the
3576 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
3577 * active or inactive - you will get errors if only one is used.
3579 # if GMX_SIMD_HAVE_LOGICAL
3580 ssign = ssign & SimdDouble(GMX_DOUBLE_NEGZERO);
3581 csign = andNot(q, SimdDouble(GMX_DOUBLE_NEGZERO));
3582 ssign = ssign ^ csign;
3584 ssign = copysign(SimdDouble(1.0), ssign);
3585 csign = copysign(SimdDouble(1.0), q);
3587 ssign = ssign * csign; // swap ssign if csign was set.
3589 // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
3590 m1 = (q < minusquarter);
3591 m2 = (setZero() <= q);
3595 // where mask is FALSE, swap sign.
3596 csign = csign * blend(SimdDouble(-1.0), one, mask);
3598 x = fnma(y, argred0, x);
3599 x = fnma(y, argred1, x);
3600 x = fnma(y, argred2, x);
3603 psin = fma(const_sin2, x2, const_sin1);
3604 psin = fma(psin, x2, const_sin0);
3605 psin = fma(psin, x * x2, x);
3606 pcos = fma(const_cos2, x2, const_cos1);
3607 pcos = fma(pcos, x2, const_cos0);
3608 pcos = fms(pcos, x2, half);
3609 pcos = fma(pcos, x2, one);
3611 sss = blend(pcos, psin, mask);
3612 ccc = blend(psin, pcos, mask);
3613 // See comment for GMX_SIMD_HAVE_LOGICAL section above.
3614 #if GMX_SIMD_HAVE_LOGICAL
3615 *sinval = sss ^ ssign;
3616 *cosval = ccc ^ csign;
3618 *sinval = sss * ssign;
3619 *cosval = ccc * csign;
3623 /*! \brief SIMD sin(x). Double precision SIMD data, single accuracy.
3625 * \param x The argument to evaluate sin for
3628 * \attention Do NOT call both sin & cos if you need both results, since each of them
3629 * will then call \ref sincos and waste a factor 2 in performance.
3631 static inline SimdDouble gmx_simdcall
3632 sinSingleAccuracy(SimdDouble x)
3635 sinCosSingleAccuracy(x, &s, &c);
3639 /*! \brief SIMD cos(x). Double precision SIMD data, single accuracy.
3641 * \param x The argument to evaluate cos for
3644 * \attention Do NOT call both sin & cos if you need both results, since each of them
3645 * will then call \ref sincos and waste a factor 2 in performance.
3647 static inline SimdDouble gmx_simdcall
3648 cosSingleAccuracy(SimdDouble x)
3651 sinCosSingleAccuracy(x, &s, &c);
3655 /*! \brief SIMD tan(x). Double precision SIMD data, single accuracy.
3657 * \param x The argument to evaluate tan for
3660 static inline SimdDouble gmx_simdcall
3661 tanSingleAccuracy(SimdDouble x)
3663 const SimdDouble argred0(2*0.78539816290140151978);
3664 const SimdDouble argred1(2*4.9604678871439933374e-10);
3665 const SimdDouble argred2(2*1.1258708853173288931e-18);
3666 const SimdDouble two_over_pi(2.0/M_PI);
3667 const SimdDouble CT6(0.009498288995810566122993911);
3668 const SimdDouble CT5(0.002895755790837379295226923);
3669 const SimdDouble CT4(0.02460087336161924491836265);
3670 const SimdDouble CT3(0.05334912882656359828045988);
3671 const SimdDouble CT2(0.1333989091464957704418495);
3672 const SimdDouble CT1(0.3333307599244198227797507);
3674 SimdDouble x2, p, y, z;
3677 #if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
3681 z = x * two_over_pi;
3684 mask = cvtIB2B((iy & ione) == ione);
3686 x = fnma(y, argred0, x);
3687 x = fnma(y, argred1, x);
3688 x = fnma(y, argred2, x);
3689 x = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), mask) ^ x;
3691 const SimdDouble quarter(0.25);
3692 const SimdDouble half(0.5);
3693 const SimdDouble threequarter(0.75);
3695 SimdDBool m1, m2, m3;
3698 z = fma(w, two_over_pi, half);
3702 m1 = (quarter <= q);
3704 m3 = (threequarter <= q);
3707 w = fnma(y, argred0, w);
3708 w = fnma(y, argred1, w);
3709 w = fnma(y, argred2, w);
3711 w = blend(w, -w, mask);
3712 x = w * copysign( SimdDouble(1.0), x );
3715 p = fma(CT6, x2, CT5);
3716 p = fma(p, x2, CT4);
3717 p = fma(p, x2, CT3);
3718 p = fma(p, x2, CT2);
3719 p = fma(p, x2, CT1);
3720 p = fma(x2, p * x, x);
3722 p = blend( p, maskzInvSingleAccuracy(p, mask), mask);
3726 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
3728 * \param x The argument to evaluate asin for
3731 static inline SimdDouble gmx_simdcall
3732 asinSingleAccuracy(SimdDouble x)
3734 const SimdDouble limitlow(1e-4);
3735 const SimdDouble half(0.5);
3736 const SimdDouble one(1.0);
3737 const SimdDouble halfpi(M_PI/2.0);
3738 const SimdDouble CC5(4.2163199048E-2);
3739 const SimdDouble CC4(2.4181311049E-2);
3740 const SimdDouble CC3(4.5470025998E-2);
3741 const SimdDouble CC2(7.4953002686E-2);
3742 const SimdDouble CC1(1.6666752422E-1);
3744 SimdDouble z, z1, z2, q, q1, q2;
3746 SimdDBool mask, mask2;
3749 mask = (half < xabs);
3750 z1 = half * (one - xabs);
3751 mask2 = (xabs < one);
3752 q1 = z1 * maskzInvsqrtSingleAccuracy(z1, mask2);
3755 z = blend(z2, z1, mask);
3756 q = blend(q2, q1, mask);
3759 pA = fma(CC5, z2, CC3);
3760 pB = fma(CC4, z2, CC2);
3761 pA = fma(pA, z2, CC1);
3763 z = fma(pB, z2, pA);
3767 z = blend(z, q2, mask);
3769 mask = (limitlow < xabs);
3770 z = blend( xabs, z, mask );
3776 /*! \brief SIMD acos(x). Double precision SIMD data, single accuracy.
3778 * \param x The argument to evaluate acos for
3781 static inline SimdDouble gmx_simdcall
3782 acosSingleAccuracy(SimdDouble x)
3784 const SimdDouble one(1.0);
3785 const SimdDouble half(0.5);
3786 const SimdDouble pi(M_PI);
3787 const SimdDouble halfpi(M_PI/2.0);
3789 SimdDouble z, z1, z2, z3;
3790 SimdDBool mask1, mask2, mask3;
3793 mask1 = (half < xabs);
3794 mask2 = (setZero() < x);
3796 z = half * (one - xabs);
3797 mask3 = (xabs < one);
3798 z = z * maskzInvsqrtSingleAccuracy(z, mask3);
3799 z = blend(x, z, mask1);
3800 z = asinSingleAccuracy(z);
3805 z = blend(z1, z2, mask2);
3806 z = blend(z3, z, mask1);
3811 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
3813 * \param x The argument to evaluate atan for
3814 * \result Atan(x), same argument/value range as standard math library.
3816 static inline SimdDouble gmx_simdcall
3817 atanSingleAccuracy(SimdDouble x)
3819 const SimdDouble halfpi(M_PI/2);
3820 const SimdDouble CA17(0.002823638962581753730774);
3821 const SimdDouble CA15(-0.01595690287649631500244);
3822 const SimdDouble CA13(0.04250498861074447631836);
3823 const SimdDouble CA11(-0.07489009201526641845703);
3824 const SimdDouble CA9(0.1063479334115982055664);
3825 const SimdDouble CA7(-0.1420273631811141967773);
3826 const SimdDouble CA5(0.1999269574880599975585);
3827 const SimdDouble CA3(-0.3333310186862945556640);
3828 SimdDouble x2, x3, x4, pA, pB;
3829 SimdDBool mask, mask2;
3831 mask = (x < setZero());
3833 mask2 = (SimdDouble(1.0) < x);
3834 x = blend(x, maskzInvSingleAccuracy(x, mask2), mask2);
3839 pA = fma(CA17, x4, CA13);
3840 pB = fma(CA15, x4, CA11);
3841 pA = fma(pA, x4, CA9);
3842 pB = fma(pB, x4, CA7);
3843 pA = fma(pA, x4, CA5);
3844 pB = fma(pB, x4, CA3);
3845 pA = fma(pA, x2, pB);
3846 pA = fma(pA, x3, x);
3848 pA = blend(pA, halfpi - pA, mask2);
3849 pA = blend(pA, -pA, mask);
3854 /*! \brief SIMD atan2(y,x). Double precision SIMD data, single accuracy.
3856 * \param y Y component of vector, any quartile
3857 * \param x X component of vector, any quartile
3858 * \result Atan(y,x), same argument/value range as standard math library.
3860 * \note This routine should provide correct results for all finite
3861 * non-zero or positive-zero arguments. However, negative zero arguments will
3862 * be treated as positive zero, which means the return value will deviate from
3863 * the standard math library atan2(y,x) for those cases. That should not be
3864 * of any concern in Gromacs, and in particular it will not affect calculations
3865 * of angles from vectors.
3867 static inline SimdDouble gmx_simdcall
3868 atan2SingleAccuracy(SimdDouble y, SimdDouble x)
3870 const SimdDouble pi(M_PI);
3871 const SimdDouble halfpi(M_PI/2.0);
3872 SimdDouble xinv, p, aoffset;
3873 SimdDBool mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
3875 mask_xnz = x != setZero();
3876 mask_ynz = y != setZero();
3877 mask_xlt0 = (x < setZero());
3878 mask_ylt0 = (y < setZero());
3880 aoffset = selectByNotMask(halfpi, mask_xnz);
3881 aoffset = selectByMask(aoffset, mask_ynz);
3883 aoffset = blend(aoffset, pi, mask_xlt0);
3884 aoffset = blend(aoffset, -aoffset, mask_ylt0);
3886 xinv = maskzInvSingleAccuracy(x, mask_xnz);
3888 p = atanSingleAccuracy(p);
3894 /*! \brief Analytical PME force correction, double SIMD data, single accuracy.
3896 * \param z2 \f$(r \beta)^2\f$ - see below for details.
3897 * \result Correction factor to coulomb force - see below for details.
3899 * This routine is meant to enable analytical evaluation of the
3900 * direct-space PME electrostatic force to avoid tables.
3902 * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
3903 * are some problems evaluating that:
3905 * First, the error function is difficult (read: expensive) to
3906 * approxmiate accurately for intermediate to large arguments, and
3907 * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
3908 * Second, we now try to avoid calculating potentials in Gromacs but
3909 * use forces directly.
3911 * We can simply things slight by noting that the PME part is really
3912 * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
3914 * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
3916 * The first term we already have from the inverse square root, so
3917 * that we can leave out of this routine.
3919 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
3920 * the argument \f$beta r\f$ will be in the range 0.15 to ~4. Use your
3921 * favorite plotting program to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is
3924 * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
3925 * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
3926 * then only use even powers. This is another minor optimization, since
3927 * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
3928 * the vector between the two atoms to get the vectorial force. The
3929 * fastest flops are the ones we can avoid calculating!
3931 * So, here's how it should be used:
3933 * 1. Calculate \f$r^2\f$.
3934 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
3935 * 3. Evaluate this routine with \f$z^2\f$ as the argument.
3936 * 4. The return value is the expression:
3939 * \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
3942 * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
3945 * \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
3948 * or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
3951 * \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
3954 * With a bit of math exercise you should be able to confirm that
3958 * \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
3961 * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
3962 * and you have your force (divided by \f$r\f$). A final multiplication
3963 * with the vector connecting the two particles and you have your
3964 * vectorial force to add to the particles.
3966 * This approximation achieves an accuracy slightly lower than 1e-6; when
3967 * added to \f$1/r\f$ the error will be insignificant.
3970 static inline SimdDouble gmx_simdcall
3971 pmeForceCorrectionSingleAccuracy(SimdDouble z2)
3973 const SimdDouble FN6(-1.7357322914161492954e-8);
3974 const SimdDouble FN5(1.4703624142580877519e-6);
3975 const SimdDouble FN4(-0.000053401640219807709149);
3976 const SimdDouble FN3(0.0010054721316683106153);
3977 const SimdDouble FN2(-0.019278317264888380590);
3978 const SimdDouble FN1(0.069670166153766424023);
3979 const SimdDouble FN0(-0.75225204789749321333);
3981 const SimdDouble FD4(0.0011193462567257629232);
3982 const SimdDouble FD3(0.014866955030185295499);
3983 const SimdDouble FD2(0.11583842382862377919);
3984 const SimdDouble FD1(0.50736591960530292870);
3985 const SimdDouble FD0(1.0);
3988 SimdDouble polyFN0, polyFN1, polyFD0, polyFD1;
3992 polyFD0 = fma(FD4, z4, FD2);
3993 polyFD1 = fma(FD3, z4, FD1);
3994 polyFD0 = fma(polyFD0, z4, FD0);
3995 polyFD0 = fma(polyFD1, z2, polyFD0);
3997 polyFD0 = invSingleAccuracy(polyFD0);
3999 polyFN0 = fma(FN6, z4, FN4);
4000 polyFN1 = fma(FN5, z4, FN3);
4001 polyFN0 = fma(polyFN0, z4, FN2);
4002 polyFN1 = fma(polyFN1, z4, FN1);
4003 polyFN0 = fma(polyFN0, z4, FN0);
4004 polyFN0 = fma(polyFN1, z2, polyFN0);
4006 return polyFN0 * polyFD0;
4011 /*! \brief Analytical PME potential correction, double SIMD data, single accuracy.
4013 * \param z2 \f$(r \beta)^2\f$ - see below for details.
4014 * \result Correction factor to coulomb potential - see below for details.
4016 * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
4017 * as the input argument.
4019 * Here's how it should be used:
4021 * 1. Calculate \f$r^2\f$.
4022 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
4023 * 3. Evaluate this routine with z^2 as the argument.
4024 * 4. The return value is the expression:
4027 * \frac{\mbox{erf}(z)}{z}
4030 * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
4033 * \frac{\mbox{erf}(r \beta)}{r}
4036 * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
4037 * and you have your potential.
4039 * This approximation achieves an accuracy slightly lower than 1e-6; when
4040 * added to \f$1/r\f$ the error will be insignificant.
4042 static inline SimdDouble gmx_simdcall
4043 pmePotentialCorrectionSingleAccuracy(SimdDouble z2)
4045 const SimdDouble VN6(1.9296833005951166339e-8);
4046 const SimdDouble VN5(-1.4213390571557850962e-6);
4047 const SimdDouble VN4(0.000041603292906656984871);
4048 const SimdDouble VN3(-0.00013134036773265025626);
4049 const SimdDouble VN2(0.038657983986041781264);
4050 const SimdDouble VN1(0.11285044772717598220);
4051 const SimdDouble VN0(1.1283802385263030286);
4053 const SimdDouble VD3(0.0066752224023576045451);
4054 const SimdDouble VD2(0.078647795836373922256);
4055 const SimdDouble VD1(0.43336185284710920150);
4056 const SimdDouble VD0(1.0);
4059 SimdDouble polyVN0, polyVN1, polyVD0, polyVD1;
4063 polyVD1 = fma(VD3, z4, VD1);
4064 polyVD0 = fma(VD2, z4, VD0);
4065 polyVD0 = fma(polyVD1, z2, polyVD0);
4067 polyVD0 = invSingleAccuracy(polyVD0);
4069 polyVN0 = fma(VN6, z4, VN4);
4070 polyVN1 = fma(VN5, z4, VN3);
4071 polyVN0 = fma(polyVN0, z4, VN2);
4072 polyVN1 = fma(polyVN1, z4, VN1);
4073 polyVN0 = fma(polyVN0, z4, VN0);
4074 polyVN0 = fma(polyVN1, z2, polyVN0);
4076 return polyVN0 * polyVD0;
4082 /*! \name SIMD4 math functions
4084 * \note Only a subset of the math functions are implemented for SIMD4.
4089 #if GMX_SIMD4_HAVE_FLOAT
4091 /*************************************************************************
4092 * SINGLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4093 *************************************************************************/
4095 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 float.
4097 * This is a low-level routine that should only be used by SIMD math routine
4098 * that evaluates the inverse square root.
4100 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
4101 * \param x The reference (starting) value x for which we want 1/sqrt(x).
4102 * \return An improved approximation with roughly twice as many bits of accuracy.
4104 static inline Simd4Float gmx_simdcall
4105 rsqrtIter(Simd4Float lu, Simd4Float x)
4107 Simd4Float tmp1 = x*lu;
4108 Simd4Float tmp2 = Simd4Float(-0.5f)*lu;
4109 tmp1 = fma(tmp1, lu, Simd4Float(-3.0f));
4113 /*! \brief Calculate 1/sqrt(x) for SIMD4 float.
4115 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4116 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4117 * For the single precision implementation this is obviously always
4118 * true for positive values, but for double precision it adds an
4119 * extra restriction since the first lookup step might have to be
4120 * performed in single precision on some architectures. Note that the
4121 * responsibility for checking falls on you - this routine does not
4123 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4125 static inline Simd4Float gmx_simdcall
4126 invsqrt(Simd4Float x)
4128 Simd4Float lu = rsqrt(x);
4129 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4130 lu = rsqrtIter(lu, x);
4132 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4133 lu = rsqrtIter(lu, x);
4135 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4136 lu = rsqrtIter(lu, x);
4142 #endif // GMX_SIMD4_HAVE_FLOAT
4146 #if GMX_SIMD4_HAVE_DOUBLE
4147 /*************************************************************************
4148 * DOUBLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4149 *************************************************************************/
4151 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 double.
4153 * This is a low-level routine that should only be used by SIMD math routine
4154 * that evaluates the inverse square root.
4156 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
4157 * \param x The reference (starting) value x for which we want 1/sqrt(x).
4158 * \return An improved approximation with roughly twice as many bits of accuracy.
4160 static inline Simd4Double gmx_simdcall
4161 rsqrtIter(Simd4Double lu, Simd4Double x)
4163 Simd4Double tmp1 = x*lu;
4164 Simd4Double tmp2 = Simd4Double(-0.5f)*lu;
4165 tmp1 = fma(tmp1, lu, Simd4Double(-3.0f));
4169 /*! \brief Calculate 1/sqrt(x) for SIMD4 double.
4171 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4172 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4173 * For the single precision implementation this is obviously always
4174 * true for positive values, but for double precision it adds an
4175 * extra restriction since the first lookup step might have to be
4176 * performed in single precision on some architectures. Note that the
4177 * responsibility for checking falls on you - this routine does not
4179 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4181 static inline Simd4Double gmx_simdcall
4182 invsqrt(Simd4Double x)
4184 Simd4Double lu = rsqrt(x);
4185 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4186 lu = rsqrtIter(lu, x);
4188 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4189 lu = rsqrtIter(lu, x);
4191 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4192 lu = rsqrtIter(lu, x);
4194 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4195 lu = rsqrtIter(lu, x);
4201 /**********************************************************************
4202 * SIMD4 MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
4203 **********************************************************************/
4205 /*! \brief Calculate 1/sqrt(x) for SIMD4 double, but in single accuracy.
4207 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4208 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4209 * For the single precision implementation this is obviously always
4210 * true for positive values, but for double precision it adds an
4211 * extra restriction since the first lookup step might have to be
4212 * performed in single precision on some architectures. Note that the
4213 * responsibility for checking falls on you - this routine does not
4215 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4217 static inline Simd4Double gmx_simdcall
4218 invsqrtSingleAccuracy(Simd4Double x)
4220 Simd4Double lu = rsqrt(x);
4221 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4222 lu = rsqrtIter(lu, x);
4224 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4225 lu = rsqrtIter(lu, x);
4227 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4228 lu = rsqrtIter(lu, x);
4235 #endif // GMX_SIMD4_HAVE_DOUBLE
4239 #if GMX_SIMD_HAVE_FLOAT
4240 /*! \brief Calculate 1/sqrt(x) for SIMD float, only targeting single accuracy.
4242 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4243 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4244 * For the single precision implementation this is obviously always
4245 * true for positive values, but for double precision it adds an
4246 * extra restriction since the first lookup step might have to be
4247 * performed in single precision on some architectures. Note that the
4248 * responsibility for checking falls on you - this routine does not
4250 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4252 static inline SimdFloat gmx_simdcall
4253 invsqrtSingleAccuracy(SimdFloat x)
4258 /*! \brief Calculate 1/sqrt(x) for masked SIMD floats, only targeting single accuracy.
4260 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
4261 * Illegal values in the masked-out elements will not lead to
4262 * floating-point exceptions.
4264 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4265 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4266 * For the single precision implementation this is obviously always
4267 * true for positive values, but for double precision it adds an
4268 * extra restriction since the first lookup step might have to be
4269 * performed in single precision on some architectures. Note that the
4270 * responsibility for checking falls on you - this routine does not
4273 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
4274 * entry was not masked, and 0.0 for masked-out entries.
4276 static inline SimdFloat
4277 maskzInvsqrtSingleAccuracy(SimdFloat x, SimdFBool m)
4279 return maskzInvsqrt(x, m);
4282 /*! \brief Calculate 1/sqrt(x) for two SIMD floats, only targeting single accuracy.
4284 * \param x0 First set of arguments, x0 must be in single range (see below).
4285 * \param x1 Second set of arguments, x1 must be in single range (see below).
4286 * \param[out] out0 Result 1/sqrt(x0)
4287 * \param[out] out1 Result 1/sqrt(x1)
4289 * In particular for double precision we can sometimes calculate square root
4290 * pairs slightly faster by using single precision until the very last step.
4292 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
4293 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4294 * For the single precision implementation this is obviously always
4295 * true for positive values, but for double precision it adds an
4296 * extra restriction since the first lookup step might have to be
4297 * performed in single precision on some architectures. Note that the
4298 * responsibility for checking falls on you - this routine does not
4301 static inline void gmx_simdcall
4302 invsqrtPairSingleAccuracy(SimdFloat x0, SimdFloat x1,
4303 SimdFloat *out0, SimdFloat *out1)
4305 return invsqrtPair(x0, x1, out0, out1);
4308 /*! \brief Calculate 1/x for SIMD float, only targeting single accuracy.
4310 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
4311 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4312 * For the single precision implementation this is obviously always
4313 * true for positive values, but for double precision it adds an
4314 * extra restriction since the first lookup step might have to be
4315 * performed in single precision on some architectures. Note that the
4316 * responsibility for checking falls on you - this routine does not
4318 * \return 1/x. Result is undefined if your argument was invalid.
4320 static inline SimdFloat gmx_simdcall
4321 invSingleAccuracy(SimdFloat x)
4327 /*! \brief Calculate 1/x for masked SIMD floats, only targeting single accuracy.
4329 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
4330 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4331 * For the single precision implementation this is obviously always
4332 * true for positive values, but for double precision it adds an
4333 * extra restriction since the first lookup step might have to be
4334 * performed in single precision on some architectures. Note that the
4335 * responsibility for checking falls on you - this routine does not
4338 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
4340 static inline SimdFloat
4341 maskzInvSingleAccuracy(SimdFloat x, SimdFBool m)
4343 return maskzInv(x, m);
4346 /*! \brief Calculate sqrt(x) for SIMD float, always targeting single accuracy.
4348 * \copydetails sqrt(SimdFloat)
4350 template <MathOptimization opt = MathOptimization::Safe>
4351 static inline SimdFloat gmx_simdcall
4352 sqrtSingleAccuracy(SimdFloat x)
4354 return sqrt<opt>(x);
4357 /*! \brief SIMD float log(x), only targeting single accuracy. This is the natural logarithm.
4359 * \param x Argument, should be >0.
4360 * \result The natural logarithm of x. Undefined if argument is invalid.
4362 static inline SimdFloat gmx_simdcall
4363 logSingleAccuracy(SimdFloat x)
4368 /*! \brief SIMD float 2^x, only targeting single accuracy.
4370 * \copydetails exp2(SimdFloat)
4372 template <MathOptimization opt = MathOptimization::Safe>
4373 static inline SimdFloat gmx_simdcall
4374 exp2SingleAccuracy(SimdFloat x)
4376 return exp2<opt>(x);
4379 /*! \brief SIMD float e^x, only targeting single accuracy.
4381 * \copydetails exp(SimdFloat)
4383 template <MathOptimization opt = MathOptimization::Safe>
4384 static inline SimdFloat gmx_simdcall
4385 expSingleAccuracy(SimdFloat x)
4391 /*! \brief SIMD float erf(x), only targeting single accuracy.
4393 * \param x The value to calculate erf(x) for.
4396 * This routine achieves very close to single precision, but we do not care about
4397 * the last bit or the subnormal result range.
4399 static inline SimdFloat gmx_simdcall
4400 erfSingleAccuracy(SimdFloat x)
4405 /*! \brief SIMD float erfc(x), only targeting single accuracy.
4407 * \param x The value to calculate erfc(x) for.
4410 * This routine achieves singleprecision (bar the last bit) over most of the
4411 * input range, but for large arguments where the result is getting close
4412 * to the minimum representable numbers we accept slightly larger errors
4413 * (think results that are in the ballpark of 10^-30) since that is not
4416 static inline SimdFloat gmx_simdcall
4417 erfcSingleAccuracy(SimdFloat x)
4422 /*! \brief SIMD float sin \& cos, only targeting single accuracy.
4424 * \param x The argument to evaluate sin/cos for
4425 * \param[out] sinval Sin(x)
4426 * \param[out] cosval Cos(x)
4428 static inline void gmx_simdcall
4429 sinCosSingleAccuracy(SimdFloat x, SimdFloat *sinval, SimdFloat *cosval)
4431 sincos(x, sinval, cosval);
4434 /*! \brief SIMD float sin(x), only targeting single accuracy.
4436 * \param x The argument to evaluate sin for
4439 * \attention Do NOT call both sin & cos if you need both results, since each of them
4440 * will then call \ref sincos and waste a factor 2 in performance.
4442 static inline SimdFloat gmx_simdcall
4443 sinSingleAccuracy(SimdFloat x)
4448 /*! \brief SIMD float cos(x), only targeting single accuracy.
4450 * \param x The argument to evaluate cos for
4453 * \attention Do NOT call both sin & cos if you need both results, since each of them
4454 * will then call \ref sincos and waste a factor 2 in performance.
4456 static inline SimdFloat gmx_simdcall
4457 cosSingleAccuracy(SimdFloat x)
4462 /*! \brief SIMD float tan(x), only targeting single accuracy.
4464 * \param x The argument to evaluate tan for
4467 static inline SimdFloat gmx_simdcall
4468 tanSingleAccuracy(SimdFloat x)
4473 /*! \brief SIMD float asin(x), only targeting single accuracy.
4475 * \param x The argument to evaluate asin for
4478 static inline SimdFloat gmx_simdcall
4479 asinSingleAccuracy(SimdFloat x)
4484 /*! \brief SIMD float acos(x), only targeting single accuracy.
4486 * \param x The argument to evaluate acos for
4489 static inline SimdFloat gmx_simdcall
4490 acosSingleAccuracy(SimdFloat x)
4495 /*! \brief SIMD float atan(x), only targeting single accuracy.
4497 * \param x The argument to evaluate atan for
4498 * \result Atan(x), same argument/value range as standard math library.
4500 static inline SimdFloat gmx_simdcall
4501 atanSingleAccuracy(SimdFloat x)
4506 /*! \brief SIMD float atan2(y,x), only targeting single accuracy.
4508 * \param y Y component of vector, any quartile
4509 * \param x X component of vector, any quartile
4510 * \result Atan(y,x), same argument/value range as standard math library.
4512 * \note This routine should provide correct results for all finite
4513 * non-zero or positive-zero arguments. However, negative zero arguments will
4514 * be treated as positive zero, which means the return value will deviate from
4515 * the standard math library atan2(y,x) for those cases. That should not be
4516 * of any concern in Gromacs, and in particular it will not affect calculations
4517 * of angles from vectors.
4519 static inline SimdFloat gmx_simdcall
4520 atan2SingleAccuracy(SimdFloat y, SimdFloat x)
4525 /*! \brief SIMD Analytic PME force correction, only targeting single accuracy.
4527 * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
4528 * \result Correction factor to coulomb force.
4530 static inline SimdFloat gmx_simdcall
4531 pmeForceCorrectionSingleAccuracy(SimdFloat z2)
4533 return pmeForceCorrection(z2);
4536 /*! \brief SIMD Analytic PME potential correction, only targeting single accuracy.
4538 * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
4539 * \result Correction factor to coulomb force.
4541 static inline SimdFloat gmx_simdcall
4542 pmePotentialCorrectionSingleAccuracy(SimdFloat z2)
4544 return pmePotentialCorrection(z2);
4546 #endif // GMX_SIMD_HAVE_FLOAT
4548 #if GMX_SIMD4_HAVE_FLOAT
4549 /*! \brief Calculate 1/sqrt(x) for SIMD4 float, only targeting single accuracy.
4551 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4552 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4553 * For the single precision implementation this is obviously always
4554 * true for positive values, but for double precision it adds an
4555 * extra restriction since the first lookup step might have to be
4556 * performed in single precision on some architectures. Note that the
4557 * responsibility for checking falls on you - this routine does not
4559 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4561 static inline Simd4Float gmx_simdcall
4562 invsqrtSingleAccuracy(Simd4Float x)
4566 #endif // GMX_SIMD4_HAVE_FLOAT
4568 /*! \} end of addtogroup module_simd */
4569 /*! \endcond end of condition libabl */
4575 #endif // GMX_SIMD_SIMD_MATH_H