2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #ifndef GMX_SIMD_SIMD_MATH_H
36 #define GMX_SIMD_SIMD_MATH_H
38 /*! \libinternal \file
40 * \brief Math functions for SIMD datatypes.
42 * \attention This file is generic for all SIMD architectures, so you cannot
43 * assume that any of the optional SIMD features (as defined in simd.h) are
44 * present. In particular, this means you cannot assume support for integers,
45 * logical operations (neither on floating-point nor integer values), shifts,
46 * and the architecture might only have SIMD for either float or double.
47 * Second, to keep this file clean and general, any additions to this file
48 * must work for all possible SIMD architectures in both single and double
49 * precision (if they support it), and you cannot make any assumptions about
52 * \author Erik Lindahl <erik.lindahl@scilifelab.se>
55 * \ingroup module_simd
62 #include "gromacs/math/utilities.h"
63 #include "gromacs/simd/simd.h"
64 #include "gromacs/utility/basedefinitions.h"
65 #include "gromacs/utility/real.h"
73 /*! \addtogroup module_simd */
76 /*! \name Implementation accuracy settings
82 #if GMX_SIMD_HAVE_FLOAT
84 /*! \name Single precision SIMD math functions
86 * \note In most cases you should use the real-precision functions instead.
90 /****************************************
91 * SINGLE PRECISION SIMD MATH FUNCTIONS *
92 ****************************************/
94 #if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_FLOAT
95 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
97 * \param x Values to set sign for
98 * \param y Values used to set sign
99 * \return Magnitude of x, sign of y
101 static inline SimdFloat gmx_simdcall
102 copysign(SimdFloat x, SimdFloat y)
104 #if GMX_SIMD_HAVE_LOGICAL
105 return abs(x) | ( SimdFloat(GMX_FLOAT_NEGZERO) & y );
107 return blend(abs(x), -abs(x), y < setZero());
112 #if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_FLOAT
113 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD float.
115 * This is a low-level routine that should only be used by SIMD math routine
116 * that evaluates the inverse square root.
118 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
119 * \param x The reference (starting) value x for which we want 1/sqrt(x).
120 * \return An improved approximation with roughly twice as many bits of accuracy.
122 static inline SimdFloat gmx_simdcall
123 rsqrtIter(SimdFloat lu, SimdFloat x)
125 SimdFloat tmp1 = x*lu;
126 SimdFloat tmp2 = SimdFloat(-0.5f)*lu;
127 tmp1 = fma(tmp1, lu, SimdFloat(-3.0f));
132 /*! \brief Calculate 1/sqrt(x) for SIMD float.
134 * \param x Argument that must be >0. This routine does not check arguments.
135 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
137 static inline SimdFloat gmx_simdcall
140 SimdFloat lu = rsqrt(x);
141 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
142 lu = rsqrtIter(lu, x);
144 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
145 lu = rsqrtIter(lu, x);
147 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
148 lu = rsqrtIter(lu, x);
153 /*! \brief Calculate 1/sqrt(x) for two SIMD floats.
155 * \param x0 First set of arguments, x0 must be positive - no argument checking.
156 * \param x1 Second set of arguments, x1 must be positive - no argument checking.
157 * \param[out] out0 Result 1/sqrt(x0)
158 * \param[out] out1 Result 1/sqrt(x1)
160 * In particular for double precision we can sometimes calculate square root
161 * pairs slightly faster by using single precision until the very last step.
163 static inline void gmx_simdcall
164 invsqrtPair(SimdFloat x0, SimdFloat x1,
165 SimdFloat *out0, SimdFloat *out1)
171 #if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_FLOAT
172 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD float.
174 * This is a low-level routine that should only be used by SIMD math routine
175 * that evaluates the reciprocal.
177 * \param lu Approximation of 1/x, typically obtained from lookup.
178 * \param x The reference (starting) value x for which we want 1/x.
179 * \return An improved approximation with roughly twice as many bits of accuracy.
181 static inline SimdFloat gmx_simdcall
182 rcpIter(SimdFloat lu, SimdFloat x)
184 return lu*fnma(lu, x, SimdFloat(2.0f));
188 /*! \brief Calculate 1/x for SIMD float.
190 * \param x Argument that must be nonzero. This routine does not check arguments.
191 * \return 1/x. Result is undefined if your argument was invalid.
193 static inline SimdFloat gmx_simdcall
196 SimdFloat lu = rcp(x);
197 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
200 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
203 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
209 /*! \brief Division for SIMD floats
211 * \param nom Nominator
212 * \param denom Denominator
216 * \note This function does not use any masking to avoid problems with
217 * zero values in the denominator.
219 static inline SimdFloat gmx_simdcall
220 operator/(SimdFloat nom, SimdFloat denom)
222 return nom*inv(denom);
225 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD float.
227 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
228 * Illegal values in the masked-out elements will not lead to
229 * floating-point exceptions.
231 * \param x Argument that must be >0 for masked-in entries
233 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
234 * entry was not masked, and 0.0 for masked-out entries.
236 static inline SimdFloat
237 maskzInvsqrt(SimdFloat x, SimdFBool m)
239 SimdFloat lu = maskzRsqrt(x, m);
240 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
241 lu = rsqrtIter(lu, x);
243 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
244 lu = rsqrtIter(lu, x);
246 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
247 lu = rsqrtIter(lu, x);
252 /*! \brief Calculate 1/x for SIMD float, masked version.
254 * \param x Argument that must be nonzero for non-masked entries.
256 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
258 static inline SimdFloat gmx_simdcall
259 maskzInv(SimdFloat x, SimdFBool m)
261 SimdFloat lu = maskzRcp(x, m);
262 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
265 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
268 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
274 /*! \brief Calculate sqrt(x) correctly for SIMD floats, including argument 0.0.
276 * \param x Argument that must be >=0.
277 * \return sqrt(x). If x=0, the result will correctly be set to 0.
278 * The result is undefined if the input value is negative.
280 static inline SimdFloat gmx_simdcall
283 SimdFloat res = maskzInvsqrt(x, setZero() < x);
287 #if !GMX_SIMD_HAVE_NATIVE_LOG_FLOAT
288 /*! \brief SIMD float log(x). This is the natural logarithm.
290 * \param x Argument, should be >0.
291 * \result The natural logarithm of x. Undefined if argument is invalid.
293 static inline SimdFloat gmx_simdcall
296 const SimdFloat one(1.0f);
297 const SimdFloat two(2.0f);
298 const SimdFloat invsqrt2(1.0f/std::sqrt(2.0f));
299 const SimdFloat corr(0.693147180559945286226764f);
300 const SimdFloat CL9(0.2371599674224853515625f);
301 const SimdFloat CL7(0.285279005765914916992188f);
302 const SimdFloat CL5(0.400005519390106201171875f);
303 const SimdFloat CL3(0.666666567325592041015625f);
304 const SimdFloat CL1(2.0f);
305 SimdFloat fExp, x2, p;
313 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
314 fExp = fExp - selectByMask(one, m);
315 x = x * blend(one, two, m);
317 x = (x-one) * inv( x+one );
320 p = fma(CL9, x2, CL7);
324 p = fma(p, x, corr*fExp);
330 #if !GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT
331 /*! \brief SIMD float 2^x.
334 * \result 2^x. Undefined if input argument caused overflow.
336 static inline SimdFloat gmx_simdcall
339 // Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127.
340 const SimdFloat arglimit(126.0f);
341 const SimdFloat CC6(0.0001534581200287996416911311f);
342 const SimdFloat CC5(0.001339993121934088894618990f);
343 const SimdFloat CC4(0.009618488957115180159497841f);
344 const SimdFloat CC3(0.05550328776964726865751735f);
345 const SimdFloat CC2(0.2402264689063408646490722f);
346 const SimdFloat CC1(0.6931472057372680777553816f);
347 const SimdFloat one(1.0f);
354 fexppart = ldexp(one, cvtR2I(x));
356 m = abs(x) <= arglimit;
357 fexppart = selectByMask(fexppart, m);
360 p = fma(CC6, x, CC5);
371 #if !GMX_SIMD_HAVE_NATIVE_EXP_FLOAT
372 /*! \brief SIMD float exp(x).
374 * In addition to scaling the argument for 2^x this routine correctly does
375 * extended precision arithmetics to improve accuracy.
378 * \result exp(x). Undefined if input argument caused overflow,
379 * which can happen if abs(x) \> 7e13.
381 static inline SimdFloat gmx_simdcall
384 const SimdFloat argscale(1.44269504088896341f);
385 // Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127.
386 const SimdFloat arglimit(126.0f);
387 const SimdFloat invargscale0(-0.693145751953125f);
388 const SimdFloat invargscale1(-1.428606765330187045e-06f);
389 const SimdFloat CC4(0.00136324646882712841033936f);
390 const SimdFloat CC3(0.00836596917361021041870117f);
391 const SimdFloat CC2(0.0416710823774337768554688f);
392 const SimdFloat CC1(0.166665524244308471679688f);
393 const SimdFloat CC0(0.499999850988388061523438f);
394 const SimdFloat one(1.0f);
401 fexppart = ldexp(one, cvtR2I(y));
403 m = (abs(y) <= arglimit);
404 fexppart = selectByMask(fexppart, m);
406 // Extended precision arithmetics
407 x = fma(invargscale0, intpart, x);
408 x = fma(invargscale1, intpart, x);
410 p = fma(CC4, x, CC3);
415 x = fma(p, fexppart, fexppart);
420 /*! \brief SIMD float erf(x).
422 * \param x The value to calculate erf(x) for.
425 * This routine achieves very close to full precision, but we do not care about
426 * the last bit or the subnormal result range.
428 static inline SimdFloat gmx_simdcall
431 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
432 const SimdFloat CA6(7.853861353153693e-5f);
433 const SimdFloat CA5(-8.010193625184903e-4f);
434 const SimdFloat CA4(5.188327685732524e-3f);
435 const SimdFloat CA3(-2.685381193529856e-2f);
436 const SimdFloat CA2(1.128358514861418e-1f);
437 const SimdFloat CA1(-3.761262582423300e-1f);
438 const SimdFloat CA0(1.128379165726710f);
439 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
440 const SimdFloat CB9(-0.0018629930017603923f);
441 const SimdFloat CB8(0.003909821287598495f);
442 const SimdFloat CB7(-0.0052094582210355615f);
443 const SimdFloat CB6(0.005685614362160572f);
444 const SimdFloat CB5(-0.0025367682853477272f);
445 const SimdFloat CB4(-0.010199799682318782f);
446 const SimdFloat CB3(0.04369575504816542f);
447 const SimdFloat CB2(-0.11884063474674492f);
448 const SimdFloat CB1(0.2732120154030589f);
449 const SimdFloat CB0(0.42758357702025784f);
450 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
451 const SimdFloat CC10(-0.0445555913112064f);
452 const SimdFloat CC9(0.21376355144663348f);
453 const SimdFloat CC8(-0.3473187200259257f);
454 const SimdFloat CC7(0.016690861551248114f);
455 const SimdFloat CC6(0.7560973182491192f);
456 const SimdFloat CC5(-1.2137903600145787f);
457 const SimdFloat CC4(0.8411872321232948f);
458 const SimdFloat CC3(-0.08670413896296343f);
459 const SimdFloat CC2(-0.27124782687240334f);
460 const SimdFloat CC1(-0.0007502488047806069f);
461 const SimdFloat CC0(0.5642114853803148f);
462 const SimdFloat one(1.0f);
463 const SimdFloat two(2.0f);
466 SimdFloat t, t2, w, w2;
467 SimdFloat pA0, pA1, pB0, pB1, pC0, pC1;
469 SimdFloat res_erf, res_erfc, res;
470 SimdFBool m, maskErf;
476 pA0 = fma(CA6, x4, CA4);
477 pA1 = fma(CA5, x4, CA3);
478 pA0 = fma(pA0, x4, CA2);
479 pA1 = fma(pA1, x4, CA1);
481 pA0 = fma(pA1, x2, pA0);
482 // Constant term must come last for precision reasons
489 maskErf = SimdFloat(0.75f) <= y;
490 t = maskzInv(y, maskErf);
495 // No need for a floating-point sieve here (as in erfc), since erf()
496 // will never return values that are extremely small for large args.
497 expmx2 = exp( -y*y );
499 pB1 = fma(CB9, w2, CB7);
500 pB0 = fma(CB8, w2, CB6);
501 pB1 = fma(pB1, w2, CB5);
502 pB0 = fma(pB0, w2, CB4);
503 pB1 = fma(pB1, w2, CB3);
504 pB0 = fma(pB0, w2, CB2);
505 pB1 = fma(pB1, w2, CB1);
506 pB0 = fma(pB0, w2, CB0);
507 pB0 = fma(pB1, w, pB0);
509 pC0 = fma(CC10, t2, CC8);
510 pC1 = fma(CC9, t2, CC7);
511 pC0 = fma(pC0, t2, CC6);
512 pC1 = fma(pC1, t2, CC5);
513 pC0 = fma(pC0, t2, CC4);
514 pC1 = fma(pC1, t2, CC3);
515 pC0 = fma(pC0, t2, CC2);
516 pC1 = fma(pC1, t2, CC1);
518 pC0 = fma(pC0, t2, CC0);
519 pC0 = fma(pC1, t, pC0);
522 // Select pB0 or pC0 for erfc()
524 res_erfc = blend(pB0, pC0, m);
525 res_erfc = res_erfc * expmx2;
527 // erfc(x<0) = 2-erfc(|x|)
529 res_erfc = blend(res_erfc, two-res_erfc, m);
531 // Select erf() or erfc()
532 res = blend(res_erf, one-res_erfc, maskErf);
537 /*! \brief SIMD float erfc(x).
539 * \param x The value to calculate erfc(x) for.
542 * This routine achieves full precision (bar the last bit) over most of the
543 * input range, but for large arguments where the result is getting close
544 * to the minimum representable numbers we accept slightly larger errors
545 * (think results that are in the ballpark of 10^-30 for single precision)
546 * since that is not relevant for MD.
548 static inline SimdFloat gmx_simdcall
551 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
552 const SimdFloat CA6(7.853861353153693e-5f);
553 const SimdFloat CA5(-8.010193625184903e-4f);
554 const SimdFloat CA4(5.188327685732524e-3f);
555 const SimdFloat CA3(-2.685381193529856e-2f);
556 const SimdFloat CA2(1.128358514861418e-1f);
557 const SimdFloat CA1(-3.761262582423300e-1f);
558 const SimdFloat CA0(1.128379165726710f);
559 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
560 const SimdFloat CB9(-0.0018629930017603923f);
561 const SimdFloat CB8(0.003909821287598495f);
562 const SimdFloat CB7(-0.0052094582210355615f);
563 const SimdFloat CB6(0.005685614362160572f);
564 const SimdFloat CB5(-0.0025367682853477272f);
565 const SimdFloat CB4(-0.010199799682318782f);
566 const SimdFloat CB3(0.04369575504816542f);
567 const SimdFloat CB2(-0.11884063474674492f);
568 const SimdFloat CB1(0.2732120154030589f);
569 const SimdFloat CB0(0.42758357702025784f);
570 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
571 const SimdFloat CC10(-0.0445555913112064f);
572 const SimdFloat CC9(0.21376355144663348f);
573 const SimdFloat CC8(-0.3473187200259257f);
574 const SimdFloat CC7(0.016690861551248114f);
575 const SimdFloat CC6(0.7560973182491192f);
576 const SimdFloat CC5(-1.2137903600145787f);
577 const SimdFloat CC4(0.8411872321232948f);
578 const SimdFloat CC3(-0.08670413896296343f);
579 const SimdFloat CC2(-0.27124782687240334f);
580 const SimdFloat CC1(-0.0007502488047806069f);
581 const SimdFloat CC0(0.5642114853803148f);
582 // Coefficients for expansion of exp(x) in [0,0.1]
583 // CD0 and CD1 are both 1.0, so no need to declare them separately
584 const SimdFloat CD2(0.5000066608081202f);
585 const SimdFloat CD3(0.1664795422874624f);
586 const SimdFloat CD4(0.04379839977652482f);
587 const SimdFloat one(1.0f);
588 const SimdFloat two(2.0f);
590 /* We need to use a small trick here, since we cannot assume all SIMD
591 * architectures support integers, and the flag we want (0xfffff000) would
592 * evaluate to NaN (i.e., it cannot be expressed as a floating-point num).
593 * Instead, we represent the flags 0xf0f0f000 and 0x0f0f0000 as valid
594 * fp numbers, and perform a logical or. Since the expression is constant,
595 * we can at least hope it is evaluated at compile-time.
597 #if GMX_SIMD_HAVE_LOGICAL
598 const SimdFloat sieve(SimdFloat(-5.965323564e+29f) | SimdFloat(7.05044434e-30f));
600 const int isieve = 0xFFFFF000;
601 GMX_ALIGNED(float, GMX_SIMD_FLOAT_WIDTH) mem[GMX_SIMD_FLOAT_WIDTH];
610 SimdFloat q, z, t, t2, w, w2;
611 SimdFloat pA0, pA1, pB0, pB1, pC0, pC1;
612 SimdFloat expmx2, corr;
613 SimdFloat res_erf, res_erfc, res;
614 SimdFBool m, msk_erf;
620 pA0 = fma(CA6, x4, CA4);
621 pA1 = fma(CA5, x4, CA3);
622 pA0 = fma(pA0, x4, CA2);
623 pA1 = fma(pA1, x4, CA1);
625 pA0 = fma(pA0, x4, pA1);
626 // Constant term must come last for precision reasons
633 msk_erf = SimdFloat(0.75f) <= y;
634 t = maskzInv(y, msk_erf);
639 * We cannot simply calculate exp(-y2) directly in single precision, since
640 * that will lose a couple of bits of precision due to the multiplication.
641 * Instead, we introduce y=z+w, where the last 12 bits of precision are in w.
642 * Then we get exp(-y2) = exp(-z2)*exp((z-y)*(z+y)).
644 * The only drawback with this is that it requires TWO separate exponential
645 * evaluations, which would be horrible performance-wise. However, the argument
646 * for the second exp() call is always small, so there we simply use a
647 * low-order minimax expansion on [0,0.1].
649 * However, this neat idea requires support for logical ops (and) on
650 * FP numbers, which some vendors decided isn't necessary in their SIMD
651 * instruction sets (Hi, IBM VSX!). In principle we could use some tricks
652 * in double, but we still need memory as a backup when that is not available,
653 * and this case is rare enough that we go directly there...
655 #if GMX_SIMD_HAVE_LOGICAL
659 for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
662 conv.i = conv.i & isieve;
668 corr = fma(CD4, q, CD3);
669 corr = fma(corr, q, CD2);
670 corr = fma(corr, q, one);
671 corr = fma(corr, q, one);
673 expmx2 = exp( -z*z );
674 expmx2 = expmx2 * corr;
676 pB1 = fma(CB9, w2, CB7);
677 pB0 = fma(CB8, w2, CB6);
678 pB1 = fma(pB1, w2, CB5);
679 pB0 = fma(pB0, w2, CB4);
680 pB1 = fma(pB1, w2, CB3);
681 pB0 = fma(pB0, w2, CB2);
682 pB1 = fma(pB1, w2, CB1);
683 pB0 = fma(pB0, w2, CB0);
684 pB0 = fma(pB1, w, pB0);
686 pC0 = fma(CC10, t2, CC8);
687 pC1 = fma(CC9, t2, CC7);
688 pC0 = fma(pC0, t2, CC6);
689 pC1 = fma(pC1, t2, CC5);
690 pC0 = fma(pC0, t2, CC4);
691 pC1 = fma(pC1, t2, CC3);
692 pC0 = fma(pC0, t2, CC2);
693 pC1 = fma(pC1, t2, CC1);
695 pC0 = fma(pC0, t2, CC0);
696 pC0 = fma(pC1, t, pC0);
699 // Select pB0 or pC0 for erfc()
701 res_erfc = blend(pB0, pC0, m);
702 res_erfc = res_erfc * expmx2;
704 // erfc(x<0) = 2-erfc(|x|)
706 res_erfc = blend(res_erfc, two-res_erfc, m);
708 // Select erf() or erfc()
709 res = blend(one-res_erf, res_erfc, msk_erf);
714 /*! \brief SIMD float sin \& cos.
716 * \param x The argument to evaluate sin/cos for
717 * \param[out] sinval Sin(x)
718 * \param[out] cosval Cos(x)
720 * This version achieves close to machine precision, but for very large
721 * magnitudes of the argument we inherently begin to lose accuracy due to the
722 * argument reduction, despite using extended precision arithmetics internally.
724 static inline void gmx_simdcall
725 sincos(SimdFloat x, SimdFloat *sinval, SimdFloat *cosval)
727 // Constants to subtract Pi/4*x from y while minimizing precision loss
728 const SimdFloat argred0(-1.5703125);
729 const SimdFloat argred1(-4.83751296997070312500e-04f);
730 const SimdFloat argred2(-7.54953362047672271729e-08f);
731 const SimdFloat argred3(-2.56334406825708960298e-12f);
732 const SimdFloat two_over_pi(static_cast<float>(2.0f/M_PI));
733 const SimdFloat const_sin2(-1.9515295891e-4f);
734 const SimdFloat const_sin1( 8.3321608736e-3f);
735 const SimdFloat const_sin0(-1.6666654611e-1f);
736 const SimdFloat const_cos2( 2.443315711809948e-5f);
737 const SimdFloat const_cos1(-1.388731625493765e-3f);
738 const SimdFloat const_cos0( 4.166664568298827e-2f);
739 const SimdFloat half(0.5f);
740 const SimdFloat one(1.0f);
741 SimdFloat ssign, csign;
742 SimdFloat x2, y, z, psin, pcos, sss, ccc;
745 #if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
746 const SimdFInt32 ione(1);
747 const SimdFInt32 itwo(2);
754 m = cvtIB2B((iy & ione) == SimdFInt32(0));
755 ssign = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), cvtIB2B((iy & itwo) == itwo));
756 csign = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), cvtIB2B(((iy+ione) & itwo) == itwo));
758 const SimdFloat quarter(0.25f);
759 const SimdFloat minusquarter(-0.25f);
761 SimdFBool m1, m2, m3;
763 /* The most obvious way to find the arguments quadrant in the unit circle
764 * to calculate the sign is to use integer arithmetic, but that is not
765 * present in all SIMD implementations. As an alternative, we have devised a
766 * pure floating-point algorithm that uses truncation for argument reduction
767 * so that we get a new value 0<=q<1 over the unit circle, and then
768 * do floating-point comparisons with fractions. This is likely to be
769 * slightly slower (~10%) due to the longer latencies of floating-point, so
770 * we only use it when integer SIMD arithmetic is not present.
774 // It is critical that half-way cases are rounded down
775 z = fma(x, two_over_pi, half);
779 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
780 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
781 * This removes the 2*Pi periodicity without using any integer arithmetic.
782 * First check if y had the value 2 or 3, set csign if true.
785 /* If we have logical operations we can work directly on the signbit, which
786 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
787 * Thus, if you are altering defines to debug alternative code paths, the
788 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
789 * active or inactive - you will get errors if only one is used.
791 # if GMX_SIMD_HAVE_LOGICAL
792 ssign = ssign & SimdFloat(GMX_FLOAT_NEGZERO);
793 csign = andNot(q, SimdFloat(GMX_FLOAT_NEGZERO));
794 ssign = ssign ^ csign;
796 ssign = copysign(SimdFloat(1.0f), ssign);
797 csign = copysign(SimdFloat(1.0f), q);
799 ssign = ssign * csign; // swap ssign if csign was set.
801 // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
802 m1 = (q < minusquarter);
803 m2 = (setZero() <= q);
807 // where mask is FALSE, swap sign.
808 csign = csign * blend(SimdFloat(-1.0f), one, m);
810 x = fma(y, argred0, x);
811 x = fma(y, argred1, x);
812 x = fma(y, argred2, x);
813 x = fma(y, argred3, x);
816 psin = fma(const_sin2, x2, const_sin1);
817 psin = fma(psin, x2, const_sin0);
818 psin = fma(psin, x * x2, x);
819 pcos = fma(const_cos2, x2, const_cos1);
820 pcos = fma(pcos, x2, const_cos0);
821 pcos = fms(pcos, x2, half);
822 pcos = fma(pcos, x2, one);
824 sss = blend(pcos, psin, m);
825 ccc = blend(psin, pcos, m);
826 // See comment for GMX_SIMD_HAVE_LOGICAL section above.
827 #if GMX_SIMD_HAVE_LOGICAL
828 *sinval = sss ^ ssign;
829 *cosval = ccc ^ csign;
831 *sinval = sss * ssign;
832 *cosval = ccc * csign;
836 /*! \brief SIMD float sin(x).
838 * \param x The argument to evaluate sin for
841 * \attention Do NOT call both sin & cos if you need both results, since each of them
842 * will then call \ref sincos and waste a factor 2 in performance.
844 static inline SimdFloat gmx_simdcall
852 /*! \brief SIMD float cos(x).
854 * \param x The argument to evaluate cos for
857 * \attention Do NOT call both sin & cos if you need both results, since each of them
858 * will then call \ref sincos and waste a factor 2 in performance.
860 static inline SimdFloat gmx_simdcall
868 /*! \brief SIMD float tan(x).
870 * \param x The argument to evaluate tan for
873 static inline SimdFloat gmx_simdcall
876 const SimdFloat argred0(-1.5703125);
877 const SimdFloat argred1(-4.83751296997070312500e-04f);
878 const SimdFloat argred2(-7.54953362047672271729e-08f);
879 const SimdFloat argred3(-2.56334406825708960298e-12f);
880 const SimdFloat two_over_pi(static_cast<float>(2.0f/M_PI));
881 const SimdFloat CT6(0.009498288995810566122993911);
882 const SimdFloat CT5(0.002895755790837379295226923);
883 const SimdFloat CT4(0.02460087336161924491836265);
884 const SimdFloat CT3(0.05334912882656359828045988);
885 const SimdFloat CT2(0.1333989091464957704418495);
886 const SimdFloat CT1(0.3333307599244198227797507);
888 SimdFloat x2, p, y, z;
891 #if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
898 m = cvtIB2B((iy & ione) == ione);
900 x = fma(y, argred0, x);
901 x = fma(y, argred1, x);
902 x = fma(y, argred2, x);
903 x = fma(y, argred3, x);
904 x = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), m) ^ x;
906 const SimdFloat quarter(0.25f);
907 const SimdFloat half(0.5f);
908 const SimdFloat threequarter(0.75f);
910 SimdFBool m1, m2, m3;
913 z = fma(w, two_over_pi, half);
919 m3 = threequarter <= q;
922 w = fma(y, argred0, w);
923 w = fma(y, argred1, w);
924 w = fma(y, argred2, w);
925 w = fma(y, argred3, w);
927 x = w * copysign( SimdFloat(1.0), x );
930 p = fma(CT6, x2, CT5);
935 p = fma(x2 * p, x, x);
937 p = blend( p, maskzInv(p, m), m);
941 /*! \brief SIMD float asin(x).
943 * \param x The argument to evaluate asin for
946 static inline SimdFloat gmx_simdcall
949 const SimdFloat limitlow(1e-4f);
950 const SimdFloat half(0.5f);
951 const SimdFloat one(1.0f);
952 const SimdFloat halfpi(static_cast<float>(M_PI/2.0f));
953 const SimdFloat CC5(4.2163199048E-2f);
954 const SimdFloat CC4(2.4181311049E-2f);
955 const SimdFloat CC3(4.5470025998E-2f);
956 const SimdFloat CC2(7.4953002686E-2f);
957 const SimdFloat CC1(1.6666752422E-1f);
959 SimdFloat z, z1, z2, q, q1, q2;
965 z1 = half * (one-xabs);
967 q1 = z1 * maskzInvsqrt(z1, m2);
970 z = blend(z2, z1, m);
971 q = blend(q2, q1, m);
974 pA = fma(CC5, z2, CC3);
975 pB = fma(CC4, z2, CC2);
976 pA = fma(pA, z2, CC1);
985 z = blend( xabs, z, m );
991 /*! \brief SIMD float acos(x).
993 * \param x The argument to evaluate acos for
996 static inline SimdFloat gmx_simdcall
999 const SimdFloat one(1.0f);
1000 const SimdFloat half(0.5f);
1001 const SimdFloat pi(static_cast<float>(M_PI));
1002 const SimdFloat halfpi(static_cast<float>(M_PI/2.0f));
1004 SimdFloat z, z1, z2, z3;
1005 SimdFBool m1, m2, m3;
1011 z = fnma(half, xabs, half);
1013 z = z * maskzInvsqrt(z, m3);
1014 z = blend(x, z, m1);
1020 z = blend(z1, z2, m2);
1021 z = blend(z3, z, m1);
1026 /*! \brief SIMD float asin(x).
1028 * \param x The argument to evaluate atan for
1029 * \result Atan(x), same argument/value range as standard math library.
1031 static inline SimdFloat gmx_simdcall
1034 const SimdFloat halfpi(static_cast<float>(M_PI/2.0f));
1035 const SimdFloat CA17(0.002823638962581753730774f);
1036 const SimdFloat CA15(-0.01595690287649631500244f);
1037 const SimdFloat CA13(0.04250498861074447631836f);
1038 const SimdFloat CA11(-0.07489009201526641845703f);
1039 const SimdFloat CA9 (0.1063479334115982055664f);
1040 const SimdFloat CA7 (-0.1420273631811141967773f);
1041 const SimdFloat CA5 (0.1999269574880599975585f);
1042 const SimdFloat CA3 (-0.3333310186862945556640f);
1043 const SimdFloat one (1.0f);
1044 SimdFloat x2, x3, x4, pA, pB;
1050 x = blend(x, maskzInv(x, m2), m2);
1055 pA = fma(CA17, x4, CA13);
1056 pB = fma(CA15, x4, CA11);
1057 pA = fma(pA, x4, CA9);
1058 pB = fma(pB, x4, CA7);
1059 pA = fma(pA, x4, CA5);
1060 pB = fma(pB, x4, CA3);
1061 pA = fma(pA, x2, pB);
1062 pA = fma(pA, x3, x);
1064 pA = blend(pA, halfpi-pA, m2);
1065 pA = blend(pA, -pA, m);
1070 /*! \brief SIMD float atan2(y,x).
1072 * \param y Y component of vector, any quartile
1073 * \param x X component of vector, any quartile
1074 * \result Atan(y,x), same argument/value range as standard math library.
1076 * \note This routine should provide correct results for all finite
1077 * non-zero or positive-zero arguments. However, negative zero arguments will
1078 * be treated as positive zero, which means the return value will deviate from
1079 * the standard math library atan2(y,x) for those cases. That should not be
1080 * of any concern in Gromacs, and in particular it will not affect calculations
1081 * of angles from vectors.
1083 static inline SimdFloat gmx_simdcall
1084 atan2(SimdFloat y, SimdFloat x)
1086 const SimdFloat pi(static_cast<float>(M_PI));
1087 const SimdFloat halfpi(static_cast<float>(M_PI/2.0));
1088 SimdFloat xinv, p, aoffset;
1089 SimdFBool mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
1091 mask_xnz = x != setZero();
1092 mask_ynz = y != setZero();
1093 mask_xlt0 = x < setZero();
1094 mask_ylt0 = y < setZero();
1096 aoffset = selectByNotMask(halfpi, mask_xnz);
1097 aoffset = selectByMask(aoffset, mask_ynz);
1099 aoffset = blend(aoffset, pi, mask_xlt0);
1100 aoffset = blend(aoffset, -aoffset, mask_ylt0);
1102 xinv = maskzInv(x, mask_xnz);
1110 /*! \brief Calculate the force correction due to PME analytically in SIMD float.
1112 * \param z2 \f$(r \beta)^2\f$ - see below for details.
1113 * \result Correction factor to coulomb force - see below for details.
1115 * This routine is meant to enable analytical evaluation of the
1116 * direct-space PME electrostatic force to avoid tables.
1118 * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
1119 * are some problems evaluating that:
1121 * First, the error function is difficult (read: expensive) to
1122 * approxmiate accurately for intermediate to large arguments, and
1123 * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
1124 * Second, we now try to avoid calculating potentials in Gromacs but
1125 * use forces directly.
1127 * We can simply things slight by noting that the PME part is really
1128 * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
1130 * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
1132 * The first term we already have from the inverse square root, so
1133 * that we can leave out of this routine.
1135 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1136 * the argument \f$beta r\f$ will be in the range 0.15 to ~4, which is
1137 * the range used for the minimax fit. Use your favorite plotting program
1138 * to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is in this range!
1140 * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
1141 * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
1142 * then only use even powers. This is another minor optimization, since
1143 * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
1144 * the vector between the two atoms to get the vectorial force. The
1145 * fastest flops are the ones we can avoid calculating!
1147 * So, here's how it should be used:
1149 * 1. Calculate \f$r^2\f$.
1150 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
1151 * 3. Evaluate this routine with \f$z^2\f$ as the argument.
1152 * 4. The return value is the expression:
1155 * \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
1158 * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
1161 * \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
1164 * or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
1167 * \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
1170 * With a bit of math exercise you should be able to confirm that
1174 * \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
1177 * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
1178 * and you have your force (divided by \f$r\f$). A final multiplication
1179 * with the vector connecting the two particles and you have your
1180 * vectorial force to add to the particles.
1182 * This approximation achieves an error slightly lower than 1e-6
1183 * in single precision and 1e-11 in double precision
1184 * for arguments smaller than 16 (\f$\beta r \leq 4 \f$);
1185 * when added to \f$1/r\f$ the error will be insignificant.
1186 * For \f$\beta r \geq 7206\f$ the return value can be inf or NaN.
1189 static inline SimdFloat gmx_simdcall
1190 pmeForceCorrection(SimdFloat z2)
1192 const SimdFloat FN6(-1.7357322914161492954e-8f);
1193 const SimdFloat FN5(1.4703624142580877519e-6f);
1194 const SimdFloat FN4(-0.000053401640219807709149f);
1195 const SimdFloat FN3(0.0010054721316683106153f);
1196 const SimdFloat FN2(-0.019278317264888380590f);
1197 const SimdFloat FN1(0.069670166153766424023f);
1198 const SimdFloat FN0(-0.75225204789749321333f);
1200 const SimdFloat FD4(0.0011193462567257629232f);
1201 const SimdFloat FD3(0.014866955030185295499f);
1202 const SimdFloat FD2(0.11583842382862377919f);
1203 const SimdFloat FD1(0.50736591960530292870f);
1204 const SimdFloat FD0(1.0f);
1207 SimdFloat polyFN0, polyFN1, polyFD0, polyFD1;
1211 polyFD0 = fma(FD4, z4, FD2);
1212 polyFD1 = fma(FD3, z4, FD1);
1213 polyFD0 = fma(polyFD0, z4, FD0);
1214 polyFD0 = fma(polyFD1, z2, polyFD0);
1216 polyFD0 = inv(polyFD0);
1218 polyFN0 = fma(FN6, z4, FN4);
1219 polyFN1 = fma(FN5, z4, FN3);
1220 polyFN0 = fma(polyFN0, z4, FN2);
1221 polyFN1 = fma(polyFN1, z4, FN1);
1222 polyFN0 = fma(polyFN0, z4, FN0);
1223 polyFN0 = fma(polyFN1, z2, polyFN0);
1225 return polyFN0 * polyFD0;
1230 /*! \brief Calculate the potential correction due to PME analytically in SIMD float.
1232 * \param z2 \f$(r \beta)^2\f$ - see below for details.
1233 * \result Correction factor to coulomb potential - see below for details.
1235 * See \ref pmeForceCorrection for details about the approximation.
1237 * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
1238 * as the input argument.
1240 * Here's how it should be used:
1242 * 1. Calculate \f$r^2\f$.
1243 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
1244 * 3. Evaluate this routine with z^2 as the argument.
1245 * 4. The return value is the expression:
1248 * \frac{\mbox{erf}(z)}{z}
1251 * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
1254 * \frac{\mbox{erf}(r \beta)}{r}
1257 * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
1258 * and you have your potential.
1260 * This approximation achieves an error slightly lower than 1e-6
1261 * in single precision and 4e-11 in double precision
1262 * for arguments smaller than 16 (\f$ 0.15 \leq \beta r \leq 4 \f$);
1263 * for \f$ \beta r \leq 0.15\f$ the error can be twice as high;
1264 * when added to \f$1/r\f$ the error will be insignificant.
1265 * For \f$\beta r \geq 7142\f$ the return value can be inf or NaN.
1267 static inline SimdFloat gmx_simdcall
1268 pmePotentialCorrection(SimdFloat z2)
1270 const SimdFloat VN6(1.9296833005951166339e-8f);
1271 const SimdFloat VN5(-1.4213390571557850962e-6f);
1272 const SimdFloat VN4(0.000041603292906656984871f);
1273 const SimdFloat VN3(-0.00013134036773265025626f);
1274 const SimdFloat VN2(0.038657983986041781264f);
1275 const SimdFloat VN1(0.11285044772717598220f);
1276 const SimdFloat VN0(1.1283802385263030286f);
1278 const SimdFloat VD3(0.0066752224023576045451f);
1279 const SimdFloat VD2(0.078647795836373922256f);
1280 const SimdFloat VD1(0.43336185284710920150f);
1281 const SimdFloat VD0(1.0f);
1284 SimdFloat polyVN0, polyVN1, polyVD0, polyVD1;
1288 polyVD1 = fma(VD3, z4, VD1);
1289 polyVD0 = fma(VD2, z4, VD0);
1290 polyVD0 = fma(polyVD1, z2, polyVD0);
1292 polyVD0 = inv(polyVD0);
1294 polyVN0 = fma(VN6, z4, VN4);
1295 polyVN1 = fma(VN5, z4, VN3);
1296 polyVN0 = fma(polyVN0, z4, VN2);
1297 polyVN1 = fma(polyVN1, z4, VN1);
1298 polyVN0 = fma(polyVN0, z4, VN0);
1299 polyVN0 = fma(polyVN1, z2, polyVN0);
1301 return polyVN0 * polyVD0;
1307 #if GMX_SIMD_HAVE_DOUBLE
1310 /*! \name Double precision SIMD math functions
1312 * \note In most cases you should use the real-precision functions instead.
1316 /****************************************
1317 * DOUBLE PRECISION SIMD MATH FUNCTIONS *
1318 ****************************************/
1320 #if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_DOUBLE
1321 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
1323 * \param x Values to set sign for
1324 * \param y Values used to set sign
1325 * \return Magnitude of x, sign of y
1327 static inline SimdDouble gmx_simdcall
1328 copysign(SimdDouble x, SimdDouble y)
1330 #if GMX_SIMD_HAVE_LOGICAL
1331 return abs(x) | (SimdDouble(GMX_DOUBLE_NEGZERO) & y);
1333 return blend(abs(x), -abs(x), (y < setZero()));
1338 #if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_DOUBLE
1339 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD double.
1341 * This is a low-level routine that should only be used by SIMD math routine
1342 * that evaluates the inverse square root.
1344 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
1345 * \param x The reference (starting) value x for which we want 1/sqrt(x).
1346 * \return An improved approximation with roughly twice as many bits of accuracy.
1348 static inline SimdDouble gmx_simdcall
1349 rsqrtIter(SimdDouble lu, SimdDouble x)
1351 SimdDouble tmp1 = x*lu;
1352 SimdDouble tmp2 = SimdDouble(-0.5)*lu;
1353 tmp1 = fma(tmp1, lu, SimdDouble(-3.0));
1358 /*! \brief Calculate 1/sqrt(x) for SIMD double.
1360 * \param x Argument that must be >0. This routine does not check arguments.
1361 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
1363 static inline SimdDouble gmx_simdcall
1364 invsqrt(SimdDouble x)
1366 SimdDouble lu = rsqrt(x);
1367 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1368 lu = rsqrtIter(lu, x);
1370 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1371 lu = rsqrtIter(lu, x);
1373 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1374 lu = rsqrtIter(lu, x);
1376 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1377 lu = rsqrtIter(lu, x);
1382 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles.
1384 * \param x0 First set of arguments, x0 must be positive - no argument checking.
1385 * \param x1 Second set of arguments, x1 must be positive - no argument checking.
1386 * \param[out] out0 Result 1/sqrt(x0)
1387 * \param[out] out1 Result 1/sqrt(x1)
1389 * In particular for double precision we can sometimes calculate square root
1390 * pairs slightly faster by using single precision until the very last step.
1392 static inline void gmx_simdcall
1393 invsqrtPair(SimdDouble x0, SimdDouble x1,
1394 SimdDouble *out0, SimdDouble *out1)
1396 #if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH) && (GMX_SIMD_RSQRT_BITS < 22)
1397 SimdFloat xf = cvtDD2F(x0, x1);
1398 SimdFloat luf = rsqrt(xf);
1399 SimdDouble lu0, lu1;
1400 // Intermediate target is single - mantissa+1 bits
1401 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
1402 luf = rsqrtIter(luf, xf);
1404 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1405 luf = rsqrtIter(luf, xf);
1407 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1408 luf = rsqrtIter(luf, xf);
1410 cvtF2DD(luf, &lu0, &lu1);
1411 // Last iteration(s) performed in double - if we had 22 bits, this gets us to 44 (~1e-15)
1412 #if (GMX_SIMD_ACCURACY_BITS_SINGLE < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1413 lu0 = rsqrtIter(lu0, x0);
1414 lu1 = rsqrtIter(lu1, x1);
1416 #if (GMX_SIMD_ACCURACY_BITS_SINGLE*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1417 lu0 = rsqrtIter(lu0, x0);
1418 lu1 = rsqrtIter(lu1, x1);
1423 *out0 = invsqrt(x0);
1424 *out1 = invsqrt(x1);
1428 #if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_DOUBLE
1429 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD double.
1431 * This is a low-level routine that should only be used by SIMD math routine
1432 * that evaluates the reciprocal.
1434 * \param lu Approximation of 1/x, typically obtained from lookup.
1435 * \param x The reference (starting) value x for which we want 1/x.
1436 * \return An improved approximation with roughly twice as many bits of accuracy.
1438 static inline SimdDouble gmx_simdcall
1439 rcpIter(SimdDouble lu, SimdDouble x)
1441 return lu*fnma(lu, x, SimdDouble(2.0));
1445 /*! \brief Calculate 1/x for SIMD double.
1447 * \param x Argument that must be nonzero. This routine does not check arguments.
1448 * \return 1/x. Result is undefined if your argument was invalid.
1450 static inline SimdDouble gmx_simdcall
1453 SimdDouble lu = rcp(x);
1454 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1455 lu = rcpIter(lu, x);
1457 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1458 lu = rcpIter(lu, x);
1460 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1461 lu = rcpIter(lu, x);
1463 #if (GMX_SIMD_RCP_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1464 lu = rcpIter(lu, x);
1469 /*! \brief Division for SIMD doubles
1471 * \param nom Nominator
1472 * \param denom Denominator
1476 * \note This function does not use any masking to avoid problems with
1477 * zero values in the denominator.
1479 static inline SimdDouble gmx_simdcall
1480 operator/(SimdDouble nom, SimdDouble denom)
1482 return nom*inv(denom);
1486 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD double.
1488 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
1489 * Illegal values in the masked-out elements will not lead to
1490 * floating-point exceptions.
1492 * \param x Argument that must be >0 for masked-in entries
1494 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
1495 * entry was not masked, and 0.0 for masked-out entries.
1497 static inline SimdDouble
1498 maskzInvsqrt(SimdDouble x, SimdDBool m)
1500 SimdDouble lu = maskzRsqrt(x, m);
1501 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1502 lu = rsqrtIter(lu, x);
1504 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1505 lu = rsqrtIter(lu, x);
1507 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1508 lu = rsqrtIter(lu, x);
1510 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1511 lu = rsqrtIter(lu, x);
1516 /*! \brief Calculate 1/x for SIMD double, masked version.
1518 * \param x Argument that must be nonzero for non-masked entries.
1520 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
1522 static inline SimdDouble gmx_simdcall
1523 maskzInv(SimdDouble x, SimdDBool m)
1525 SimdDouble lu = maskzRcp(x, m);
1526 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1527 lu = rcpIter(lu, x);
1529 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1530 lu = rcpIter(lu, x);
1532 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1533 lu = rcpIter(lu, x);
1535 #if (GMX_SIMD_RCP_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1536 lu = rcpIter(lu, x);
1542 /*! \brief Calculate sqrt(x) correctly for SIMD doubles, including argument 0.0.
1544 * \param x Argument that must be >=0.
1545 * \return sqrt(x). If x=0, the result will correctly be set to 0.
1546 * if x>0 && x<float_min, the result will incorrectly be set to 0.
1547 * The result is undefined if the input value is negative.
1549 static inline SimdDouble gmx_simdcall
1552 // As we might use a float version of rsqrt, we mask out small values
1553 return x * maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) <= x);
1556 #if !GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
1557 /*! \brief SIMD double log(x). This is the natural logarithm.
1559 * \param x Argument, should be >0.
1560 * \result The natural logarithm of x. Undefined if argument is invalid.
1562 static inline SimdDouble gmx_simdcall
1565 const SimdDouble one(1.0);
1566 const SimdDouble two(2.0);
1567 const SimdDouble invsqrt2(1.0/std::sqrt(2.0));
1568 const SimdDouble corr(0.693147180559945286226764);
1569 const SimdDouble CL15(0.148197055177935105296783);
1570 const SimdDouble CL13(0.153108178020442575739679);
1571 const SimdDouble CL11(0.181837339521549679055568);
1572 const SimdDouble CL9(0.22222194152736701733275);
1573 const SimdDouble CL7(0.285714288030134544449368);
1574 const SimdDouble CL5(0.399999999989941956712869);
1575 const SimdDouble CL3(0.666666666666685503450651);
1576 const SimdDouble CL1(2.0);
1577 SimdDouble fExp, x2, p;
1581 x = frexp(x, &iExp);
1582 fExp = cvtI2R(iExp);
1585 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
1586 fExp = fExp - selectByMask(one, m);
1587 x = x * blend(one, two, m);
1589 x = (x-one) * inv( x+one );
1592 p = fma(CL15, x2, CL13);
1593 p = fma(p, x2, CL11);
1594 p = fma(p, x2, CL9);
1595 p = fma(p, x2, CL7);
1596 p = fma(p, x2, CL5);
1597 p = fma(p, x2, CL3);
1598 p = fma(p, x2, CL1);
1599 p = fma(p, x, corr * fExp);
1605 #if !GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE
1606 /*! \brief SIMD double 2^x.
1608 * \param x Argument.
1609 * \result 2^x. Undefined if input argument caused overflow.
1611 static inline SimdDouble gmx_simdcall
1614 const SimdDouble arglimit(1022.0);
1615 const SimdDouble CE11(4.435280790452730022081181e-10);
1616 const SimdDouble CE10(7.074105630863314448024247e-09);
1617 const SimdDouble CE9(1.017819803432096698472621e-07);
1618 const SimdDouble CE8(1.321543308956718799557863e-06);
1619 const SimdDouble CE7(0.00001525273348995851746990884);
1620 const SimdDouble CE6(0.0001540353046251466849082632);
1621 const SimdDouble CE5(0.001333355814678995257307880);
1622 const SimdDouble CE4(0.009618129107588335039176502);
1623 const SimdDouble CE3(0.05550410866481992147457793);
1624 const SimdDouble CE2(0.2402265069591015620470894);
1625 const SimdDouble CE1(0.6931471805599453304615075);
1626 const SimdDouble one(1.0);
1629 SimdDouble fexppart;
1633 fexppart = ldexp(one, cvtR2I(x));
1635 m = abs(x) <= arglimit;
1636 fexppart = selectByMask(fexppart, m);
1639 p = fma(CE11, x, CE10);
1655 #if !GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE
1656 /*! \brief SIMD double exp(x).
1658 * In addition to scaling the argument for 2^x this routine correctly does
1659 * extended precision arithmetics to improve accuracy.
1661 * \param x Argument.
1662 * \result exp(x). Undefined if input argument caused overflow,
1663 * which can happen if abs(x) \> 7e13.
1665 static inline SimdDouble gmx_simdcall
1668 const SimdDouble argscale(1.44269504088896340735992468100);
1669 const SimdDouble arglimit(1022.0);
1670 const SimdDouble invargscale0(-0.69314718055966295651160180568695068359375);
1671 const SimdDouble invargscale1(-2.8235290563031577122588448175013436025525412068e-13);
1672 const SimdDouble CE12(2.078375306791423699350304e-09);
1673 const SimdDouble CE11(2.518173854179933105218635e-08);
1674 const SimdDouble CE10(2.755842049600488770111608e-07);
1675 const SimdDouble CE9(2.755691815216689746619849e-06);
1676 const SimdDouble CE8(2.480158383706245033920920e-05);
1677 const SimdDouble CE7(0.0001984127043518048611841321);
1678 const SimdDouble CE6(0.001388888889360258341755930);
1679 const SimdDouble CE5(0.008333333332907368102819109);
1680 const SimdDouble CE4(0.04166666666663836745814631);
1681 const SimdDouble CE3(0.1666666666666796929434570);
1682 const SimdDouble CE2(0.5);
1683 const SimdDouble one(1.0);
1684 SimdDouble fexppart;
1690 fexppart = ldexp(one, cvtR2I(y));
1692 m = (abs(y) <= arglimit);
1693 fexppart = selectByMask(fexppart, m);
1695 // Extended precision arithmetics
1696 x = fma(invargscale0, intpart, x);
1697 x = fma(invargscale1, intpart, x);
1699 p = fma(CE12, x, CE11);
1700 p = fma(p, x, CE10);
1709 p = fma(p, x * x, x);
1710 x = fma(p, fexppart, fexppart);
1716 /*! \brief SIMD double erf(x).
1718 * \param x The value to calculate erf(x) for.
1721 * This routine achieves very close to full precision, but we do not care about
1722 * the last bit or the subnormal result range.
1724 static inline SimdDouble gmx_simdcall
1727 // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
1728 const SimdDouble CAP4(-0.431780540597889301512e-4);
1729 const SimdDouble CAP3(-0.00578562306260059236059);
1730 const SimdDouble CAP2(-0.028593586920219752446);
1731 const SimdDouble CAP1(-0.315924962948621698209);
1732 const SimdDouble CAP0(0.14952975608477029151);
1734 const SimdDouble CAQ5(-0.374089300177174709737e-5);
1735 const SimdDouble CAQ4(0.00015126584532155383535);
1736 const SimdDouble CAQ3(0.00536692680669480725423);
1737 const SimdDouble CAQ2(0.0668686825594046122636);
1738 const SimdDouble CAQ1(0.402604990869284362773);
1740 const SimdDouble CAoffset(0.9788494110107421875);
1742 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
1743 const SimdDouble CBP6(2.49650423685462752497647637088e-10);
1744 const SimdDouble CBP5(0.00119770193298159629350136085658);
1745 const SimdDouble CBP4(0.0164944422378370965881008942733);
1746 const SimdDouble CBP3(0.0984581468691775932063932439252);
1747 const SimdDouble CBP2(0.317364595806937763843589437418);
1748 const SimdDouble CBP1(0.554167062641455850932670067075);
1749 const SimdDouble CBP0(0.427583576155807163756925301060);
1750 const SimdDouble CBQ7(0.00212288829699830145976198384930);
1751 const SimdDouble CBQ6(0.0334810979522685300554606393425);
1752 const SimdDouble CBQ5(0.2361713785181450957579508850717);
1753 const SimdDouble CBQ4(0.955364736493055670530981883072);
1754 const SimdDouble CBQ3(2.36815675631420037315349279199);
1755 const SimdDouble CBQ2(3.55261649184083035537184223542);
1756 const SimdDouble CBQ1(2.93501136050160872574376997993);
1759 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
1760 const SimdDouble CCP6(-2.8175401114513378771);
1761 const SimdDouble CCP5(-3.22729451764143718517);
1762 const SimdDouble CCP4(-2.5518551727311523996);
1763 const SimdDouble CCP3(-0.687717681153649930619);
1764 const SimdDouble CCP2(-0.212652252872804219852);
1765 const SimdDouble CCP1(0.0175389834052493308818);
1766 const SimdDouble CCP0(0.00628057170626964891937);
1768 const SimdDouble CCQ6(5.48409182238641741584);
1769 const SimdDouble CCQ5(13.5064170191802889145);
1770 const SimdDouble CCQ4(22.9367376522880577224);
1771 const SimdDouble CCQ3(15.930646027911794143);
1772 const SimdDouble CCQ2(11.0567237927800161565);
1773 const SimdDouble CCQ1(2.79257750980575282228);
1775 const SimdDouble CCoffset(0.5579090118408203125);
1777 const SimdDouble one(1.0);
1778 const SimdDouble two(2.0);
1780 SimdDouble xabs, x2, x4, t, t2, w, w2;
1781 SimdDouble PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
1782 SimdDouble PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
1783 SimdDouble PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
1784 SimdDouble res_erf, res_erfcB, res_erfcC, res_erfc, res;
1786 SimdDBool mask, mask_erf, notmask_erf;
1790 mask_erf = (xabs < one);
1791 notmask_erf = (one <= xabs);
1795 PolyAP0 = fma(CAP4, x4, CAP2);
1796 PolyAP1 = fma(CAP3, x4, CAP1);
1797 PolyAP0 = fma(PolyAP0, x4, CAP0);
1798 PolyAP0 = fma(PolyAP1, x2, PolyAP0);
1800 PolyAQ1 = fma(CAQ5, x4, CAQ3);
1801 PolyAQ0 = fma(CAQ4, x4, CAQ2);
1802 PolyAQ1 = fma(PolyAQ1, x4, CAQ1);
1803 PolyAQ0 = fma(PolyAQ0, x4, one);
1804 PolyAQ0 = fma(PolyAQ1, x2, PolyAQ0);
1806 res_erf = PolyAP0 * maskzInv(PolyAQ0, mask_erf);
1807 res_erf = CAoffset + res_erf;
1808 res_erf = x * res_erf;
1810 // Calculate erfc() in range [1,4.5]
1814 PolyBP0 = fma(CBP6, t2, CBP4);
1815 PolyBP1 = fma(CBP5, t2, CBP3);
1816 PolyBP0 = fma(PolyBP0, t2, CBP2);
1817 PolyBP1 = fma(PolyBP1, t2, CBP1);
1818 PolyBP0 = fma(PolyBP0, t2, CBP0);
1819 PolyBP0 = fma(PolyBP1, t, PolyBP0);
1821 PolyBQ1 = fma(CBQ7, t2, CBQ5);
1822 PolyBQ0 = fma(CBQ6, t2, CBQ4);
1823 PolyBQ1 = fma(PolyBQ1, t2, CBQ3);
1824 PolyBQ0 = fma(PolyBQ0, t2, CBQ2);
1825 PolyBQ1 = fma(PolyBQ1, t2, CBQ1);
1826 PolyBQ0 = fma(PolyBQ0, t2, one);
1827 PolyBQ0 = fma(PolyBQ1, t, PolyBQ0);
1829 // The denominator polynomial can be zero outside the range
1830 res_erfcB = PolyBP0 * maskzInv(PolyBQ0, notmask_erf);
1832 res_erfcB = res_erfcB * xabs;
1834 // Calculate erfc() in range [4.5,inf]
1835 w = maskzInv(xabs, notmask_erf);
1838 PolyCP0 = fma(CCP6, w2, CCP4);
1839 PolyCP1 = fma(CCP5, w2, CCP3);
1840 PolyCP0 = fma(PolyCP0, w2, CCP2);
1841 PolyCP1 = fma(PolyCP1, w2, CCP1);
1842 PolyCP0 = fma(PolyCP0, w2, CCP0);
1843 PolyCP0 = fma(PolyCP1, w, PolyCP0);
1845 PolyCQ0 = fma(CCQ6, w2, CCQ4);
1846 PolyCQ1 = fma(CCQ5, w2, CCQ3);
1847 PolyCQ0 = fma(PolyCQ0, w2, CCQ2);
1848 PolyCQ1 = fma(PolyCQ1, w2, CCQ1);
1849 PolyCQ0 = fma(PolyCQ0, w2, one);
1850 PolyCQ0 = fma(PolyCQ1, w, PolyCQ0);
1852 expmx2 = exp( -x2 );
1854 // The denominator polynomial can be zero outside the range
1855 res_erfcC = PolyCP0 * maskzInv(PolyCQ0, notmask_erf);
1856 res_erfcC = res_erfcC + CCoffset;
1857 res_erfcC = res_erfcC * w;
1859 mask = (SimdDouble(4.5) < xabs);
1860 res_erfc = blend(res_erfcB, res_erfcC, mask);
1862 res_erfc = res_erfc * expmx2;
1864 // erfc(x<0) = 2-erfc(|x|)
1865 mask = (x < setZero());
1866 res_erfc = blend(res_erfc, two - res_erfc, mask);
1868 // Select erf() or erfc()
1869 res = blend(one - res_erfc, res_erf, mask_erf);
1874 /*! \brief SIMD double erfc(x).
1876 * \param x The value to calculate erfc(x) for.
1879 * This routine achieves full precision (bar the last bit) over most of the
1880 * input range, but for large arguments where the result is getting close
1881 * to the minimum representable numbers we accept slightly larger errors
1882 * (think results that are in the ballpark of 10^-200 for double)
1883 * since that is not relevant for MD.
1885 static inline SimdDouble gmx_simdcall
1888 // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
1889 const SimdDouble CAP4(-0.431780540597889301512e-4);
1890 const SimdDouble CAP3(-0.00578562306260059236059);
1891 const SimdDouble CAP2(-0.028593586920219752446);
1892 const SimdDouble CAP1(-0.315924962948621698209);
1893 const SimdDouble CAP0(0.14952975608477029151);
1895 const SimdDouble CAQ5(-0.374089300177174709737e-5);
1896 const SimdDouble CAQ4(0.00015126584532155383535);
1897 const SimdDouble CAQ3(0.00536692680669480725423);
1898 const SimdDouble CAQ2(0.0668686825594046122636);
1899 const SimdDouble CAQ1(0.402604990869284362773);
1901 const SimdDouble CAoffset(0.9788494110107421875);
1903 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
1904 const SimdDouble CBP6(2.49650423685462752497647637088e-10);
1905 const SimdDouble CBP5(0.00119770193298159629350136085658);
1906 const SimdDouble CBP4(0.0164944422378370965881008942733);
1907 const SimdDouble CBP3(0.0984581468691775932063932439252);
1908 const SimdDouble CBP2(0.317364595806937763843589437418);
1909 const SimdDouble CBP1(0.554167062641455850932670067075);
1910 const SimdDouble CBP0(0.427583576155807163756925301060);
1911 const SimdDouble CBQ7(0.00212288829699830145976198384930);
1912 const SimdDouble CBQ6(0.0334810979522685300554606393425);
1913 const SimdDouble CBQ5(0.2361713785181450957579508850717);
1914 const SimdDouble CBQ4(0.955364736493055670530981883072);
1915 const SimdDouble CBQ3(2.36815675631420037315349279199);
1916 const SimdDouble CBQ2(3.55261649184083035537184223542);
1917 const SimdDouble CBQ1(2.93501136050160872574376997993);
1920 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
1921 const SimdDouble CCP6(-2.8175401114513378771);
1922 const SimdDouble CCP5(-3.22729451764143718517);
1923 const SimdDouble CCP4(-2.5518551727311523996);
1924 const SimdDouble CCP3(-0.687717681153649930619);
1925 const SimdDouble CCP2(-0.212652252872804219852);
1926 const SimdDouble CCP1(0.0175389834052493308818);
1927 const SimdDouble CCP0(0.00628057170626964891937);
1929 const SimdDouble CCQ6(5.48409182238641741584);
1930 const SimdDouble CCQ5(13.5064170191802889145);
1931 const SimdDouble CCQ4(22.9367376522880577224);
1932 const SimdDouble CCQ3(15.930646027911794143);
1933 const SimdDouble CCQ2(11.0567237927800161565);
1934 const SimdDouble CCQ1(2.79257750980575282228);
1936 const SimdDouble CCoffset(0.5579090118408203125);
1938 const SimdDouble one(1.0);
1939 const SimdDouble two(2.0);
1941 SimdDouble xabs, x2, x4, t, t2, w, w2;
1942 SimdDouble PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
1943 SimdDouble PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
1944 SimdDouble PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
1945 SimdDouble res_erf, res_erfcB, res_erfcC, res_erfc, res;
1947 SimdDBool mask, mask_erf, notmask_erf;
1951 mask_erf = (xabs < one);
1952 notmask_erf = (one <= xabs);
1956 PolyAP0 = fma(CAP4, x4, CAP2);
1957 PolyAP1 = fma(CAP3, x4, CAP1);
1958 PolyAP0 = fma(PolyAP0, x4, CAP0);
1959 PolyAP0 = fma(PolyAP1, x2, PolyAP0);
1960 PolyAQ1 = fma(CAQ5, x4, CAQ3);
1961 PolyAQ0 = fma(CAQ4, x4, CAQ2);
1962 PolyAQ1 = fma(PolyAQ1, x4, CAQ1);
1963 PolyAQ0 = fma(PolyAQ0, x4, one);
1964 PolyAQ0 = fma(PolyAQ1, x2, PolyAQ0);
1966 res_erf = PolyAP0 * maskzInv(PolyAQ0, mask_erf);
1967 res_erf = CAoffset + res_erf;
1968 res_erf = x * res_erf;
1970 // Calculate erfc() in range [1,4.5]
1974 PolyBP0 = fma(CBP6, t2, CBP4);
1975 PolyBP1 = fma(CBP5, t2, CBP3);
1976 PolyBP0 = fma(PolyBP0, t2, CBP2);
1977 PolyBP1 = fma(PolyBP1, t2, CBP1);
1978 PolyBP0 = fma(PolyBP0, t2, CBP0);
1979 PolyBP0 = fma(PolyBP1, t, PolyBP0);
1981 PolyBQ1 = fma(CBQ7, t2, CBQ5);
1982 PolyBQ0 = fma(CBQ6, t2, CBQ4);
1983 PolyBQ1 = fma(PolyBQ1, t2, CBQ3);
1984 PolyBQ0 = fma(PolyBQ0, t2, CBQ2);
1985 PolyBQ1 = fma(PolyBQ1, t2, CBQ1);
1986 PolyBQ0 = fma(PolyBQ0, t2, one);
1987 PolyBQ0 = fma(PolyBQ1, t, PolyBQ0);
1989 // The denominator polynomial can be zero outside the range
1990 res_erfcB = PolyBP0 * maskzInv(PolyBQ0, notmask_erf);
1992 res_erfcB = res_erfcB * xabs;
1994 // Calculate erfc() in range [4.5,inf]
1995 w = maskzInv(xabs, xabs != setZero());
1998 PolyCP0 = fma(CCP6, w2, CCP4);
1999 PolyCP1 = fma(CCP5, w2, CCP3);
2000 PolyCP0 = fma(PolyCP0, w2, CCP2);
2001 PolyCP1 = fma(PolyCP1, w2, CCP1);
2002 PolyCP0 = fma(PolyCP0, w2, CCP0);
2003 PolyCP0 = fma(PolyCP1, w, PolyCP0);
2005 PolyCQ0 = fma(CCQ6, w2, CCQ4);
2006 PolyCQ1 = fma(CCQ5, w2, CCQ3);
2007 PolyCQ0 = fma(PolyCQ0, w2, CCQ2);
2008 PolyCQ1 = fma(PolyCQ1, w2, CCQ1);
2009 PolyCQ0 = fma(PolyCQ0, w2, one);
2010 PolyCQ0 = fma(PolyCQ1, w, PolyCQ0);
2012 expmx2 = exp( -x2 );
2014 // The denominator polynomial can be zero outside the range
2015 res_erfcC = PolyCP0 * maskzInv(PolyCQ0, notmask_erf);
2016 res_erfcC = res_erfcC + CCoffset;
2017 res_erfcC = res_erfcC * w;
2019 mask = (SimdDouble(4.5) < xabs);
2020 res_erfc = blend(res_erfcB, res_erfcC, mask);
2022 res_erfc = res_erfc * expmx2;
2024 // erfc(x<0) = 2-erfc(|x|)
2025 mask = (x < setZero());
2026 res_erfc = blend(res_erfc, two - res_erfc, mask);
2028 // Select erf() or erfc()
2029 res = blend(res_erfc, one - res_erf, mask_erf);
2034 /*! \brief SIMD double sin \& cos.
2036 * \param x The argument to evaluate sin/cos for
2037 * \param[out] sinval Sin(x)
2038 * \param[out] cosval Cos(x)
2040 * This version achieves close to machine precision, but for very large
2041 * magnitudes of the argument we inherently begin to lose accuracy due to the
2042 * argument reduction, despite using extended precision arithmetics internally.
2044 static inline void gmx_simdcall
2045 sincos(SimdDouble x, SimdDouble *sinval, SimdDouble *cosval)
2047 // Constants to subtract Pi/4*x from y while minimizing precision loss
2048 const SimdDouble argred0(-2*0.78539816290140151978);
2049 const SimdDouble argred1(-2*4.9604678871439933374e-10);
2050 const SimdDouble argred2(-2*1.1258708853173288931e-18);
2051 const SimdDouble argred3(-2*1.7607799325916000908e-27);
2052 const SimdDouble two_over_pi(2.0/M_PI);
2053 const SimdDouble const_sin5( 1.58938307283228937328511e-10);
2054 const SimdDouble const_sin4(-2.50506943502539773349318e-08);
2055 const SimdDouble const_sin3( 2.75573131776846360512547e-06);
2056 const SimdDouble const_sin2(-0.000198412698278911770864914);
2057 const SimdDouble const_sin1( 0.0083333333333191845961746);
2058 const SimdDouble const_sin0(-0.166666666666666130709393);
2060 const SimdDouble const_cos7(-1.13615350239097429531523e-11);
2061 const SimdDouble const_cos6( 2.08757471207040055479366e-09);
2062 const SimdDouble const_cos5(-2.75573144028847567498567e-07);
2063 const SimdDouble const_cos4( 2.48015872890001867311915e-05);
2064 const SimdDouble const_cos3(-0.00138888888888714019282329);
2065 const SimdDouble const_cos2( 0.0416666666666665519592062);
2066 const SimdDouble half(0.5);
2067 const SimdDouble one(1.0);
2068 SimdDouble ssign, csign;
2069 SimdDouble x2, y, z, psin, pcos, sss, ccc;
2071 #if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2072 const SimdDInt32 ione(1);
2073 const SimdDInt32 itwo(2);
2076 z = x * two_over_pi;
2080 mask = cvtIB2B((iy & ione) == setZero());
2081 ssign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B((iy & itwo) == itwo));
2082 csign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
2084 const SimdDouble quarter(0.25);
2085 const SimdDouble minusquarter(-0.25);
2087 SimdDBool m1, m2, m3;
2089 /* The most obvious way to find the arguments quadrant in the unit circle
2090 * to calculate the sign is to use integer arithmetic, but that is not
2091 * present in all SIMD implementations. As an alternative, we have devised a
2092 * pure floating-point algorithm that uses truncation for argument reduction
2093 * so that we get a new value 0<=q<1 over the unit circle, and then
2094 * do floating-point comparisons with fractions. This is likely to be
2095 * slightly slower (~10%) due to the longer latencies of floating-point, so
2096 * we only use it when integer SIMD arithmetic is not present.
2100 // It is critical that half-way cases are rounded down
2101 z = fma(x, two_over_pi, half);
2105 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
2106 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
2107 * This removes the 2*Pi periodicity without using any integer arithmetic.
2108 * First check if y had the value 2 or 3, set csign if true.
2111 /* If we have logical operations we can work directly on the signbit, which
2112 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
2113 * Thus, if you are altering defines to debug alternative code paths, the
2114 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
2115 * active or inactive - you will get errors if only one is used.
2117 # if GMX_SIMD_HAVE_LOGICAL
2118 ssign = ssign & SimdDouble(GMX_DOUBLE_NEGZERO);
2119 csign = andNot(q, SimdDouble(GMX_DOUBLE_NEGZERO));
2120 ssign = ssign ^ csign;
2122 ssign = copysign(SimdDouble(1.0), ssign);
2123 csign = copysign(SimdDouble(1.0), q);
2125 ssign = ssign * csign; // swap ssign if csign was set.
2127 // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
2128 m1 = (q < minusquarter);
2129 m2 = (setZero() <= q);
2133 // where mask is FALSE, swap sign.
2134 csign = csign * blend(SimdDouble(-1.0), one, mask);
2136 x = fma(y, argred0, x);
2137 x = fma(y, argred1, x);
2138 x = fma(y, argred2, x);
2139 x = fma(y, argred3, x);
2142 psin = fma(const_sin5, x2, const_sin4);
2143 psin = fma(psin, x2, const_sin3);
2144 psin = fma(psin, x2, const_sin2);
2145 psin = fma(psin, x2, const_sin1);
2146 psin = fma(psin, x2, const_sin0);
2147 psin = fma(psin, x2 * x, x);
2149 pcos = fma(const_cos7, x2, const_cos6);
2150 pcos = fma(pcos, x2, const_cos5);
2151 pcos = fma(pcos, x2, const_cos4);
2152 pcos = fma(pcos, x2, const_cos3);
2153 pcos = fma(pcos, x2, const_cos2);
2154 pcos = fms(pcos, x2, half);
2155 pcos = fma(pcos, x2, one);
2157 sss = blend(pcos, psin, mask);
2158 ccc = blend(psin, pcos, mask);
2159 // See comment for GMX_SIMD_HAVE_LOGICAL section above.
2160 #if GMX_SIMD_HAVE_LOGICAL
2161 *sinval = sss ^ ssign;
2162 *cosval = ccc ^ csign;
2164 *sinval = sss * ssign;
2165 *cosval = ccc * csign;
2169 /*! \brief SIMD double sin(x).
2171 * \param x The argument to evaluate sin for
2174 * \attention Do NOT call both sin & cos if you need both results, since each of them
2175 * will then call \ref sincos and waste a factor 2 in performance.
2177 static inline SimdDouble gmx_simdcall
2185 /*! \brief SIMD double cos(x).
2187 * \param x The argument to evaluate cos for
2190 * \attention Do NOT call both sin & cos if you need both results, since each of them
2191 * will then call \ref sincos and waste a factor 2 in performance.
2193 static inline SimdDouble gmx_simdcall
2201 /*! \brief SIMD double tan(x).
2203 * \param x The argument to evaluate tan for
2206 static inline SimdDouble gmx_simdcall
2209 const SimdDouble argred0(-2*0.78539816290140151978);
2210 const SimdDouble argred1(-2*4.9604678871439933374e-10);
2211 const SimdDouble argred2(-2*1.1258708853173288931e-18);
2212 const SimdDouble argred3(-2*1.7607799325916000908e-27);
2213 const SimdDouble two_over_pi(2.0/M_PI);
2214 const SimdDouble CT15(1.01419718511083373224408e-05);
2215 const SimdDouble CT14(-2.59519791585924697698614e-05);
2216 const SimdDouble CT13(5.23388081915899855325186e-05);
2217 const SimdDouble CT12(-3.05033014433946488225616e-05);
2218 const SimdDouble CT11(7.14707504084242744267497e-05);
2219 const SimdDouble CT10(8.09674518280159187045078e-05);
2220 const SimdDouble CT9(0.000244884931879331847054404);
2221 const SimdDouble CT8(0.000588505168743587154904506);
2222 const SimdDouble CT7(0.00145612788922812427978848);
2223 const SimdDouble CT6(0.00359208743836906619142924);
2224 const SimdDouble CT5(0.00886323944362401618113356);
2225 const SimdDouble CT4(0.0218694882853846389592078);
2226 const SimdDouble CT3(0.0539682539781298417636002);
2227 const SimdDouble CT2(0.133333333333125941821962);
2228 const SimdDouble CT1(0.333333333333334980164153);
2230 SimdDouble x2, p, y, z;
2233 #if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2237 z = x * two_over_pi;
2240 m = cvtIB2B((iy & ione) == ione);
2242 x = fma(y, argred0, x);
2243 x = fma(y, argred1, x);
2244 x = fma(y, argred2, x);
2245 x = fma(y, argred3, x);
2246 x = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), m) ^ x;
2248 const SimdDouble quarter(0.25);
2249 const SimdDouble half(0.5);
2250 const SimdDouble threequarter(0.75);
2252 SimdDBool m1, m2, m3;
2255 z = fma(w, two_over_pi, half);
2259 m1 = (quarter <= q);
2261 m3 = (threequarter <= q);
2264 w = fma(y, argred0, w);
2265 w = fma(y, argred1, w);
2266 w = fma(y, argred2, w);
2267 w = fma(y, argred3, w);
2269 w = blend(w, -w, m);
2270 x = w * copysign( SimdDouble(1.0), x );
2273 p = fma(CT15, x2, CT14);
2274 p = fma(p, x2, CT13);
2275 p = fma(p, x2, CT12);
2276 p = fma(p, x2, CT11);
2277 p = fma(p, x2, CT10);
2278 p = fma(p, x2, CT9);
2279 p = fma(p, x2, CT8);
2280 p = fma(p, x2, CT7);
2281 p = fma(p, x2, CT6);
2282 p = fma(p, x2, CT5);
2283 p = fma(p, x2, CT4);
2284 p = fma(p, x2, CT3);
2285 p = fma(p, x2, CT2);
2286 p = fma(p, x2, CT1);
2287 p = fma(x2, p * x, x);
2289 p = blend( p, maskzInv(p, m), m);
2293 /*! \brief SIMD double asin(x).
2295 * \param x The argument to evaluate asin for
2298 static inline SimdDouble gmx_simdcall
2301 // Same algorithm as cephes library
2302 const SimdDouble limit1(0.625);
2303 const SimdDouble limit2(1e-8);
2304 const SimdDouble one(1.0);
2305 const SimdDouble quarterpi(M_PI/4.0);
2306 const SimdDouble morebits(6.123233995736765886130e-17);
2308 const SimdDouble P5(4.253011369004428248960e-3);
2309 const SimdDouble P4(-6.019598008014123785661e-1);
2310 const SimdDouble P3(5.444622390564711410273e0);
2311 const SimdDouble P2(-1.626247967210700244449e1);
2312 const SimdDouble P1(1.956261983317594739197e1);
2313 const SimdDouble P0(-8.198089802484824371615e0);
2315 const SimdDouble Q4(-1.474091372988853791896e1);
2316 const SimdDouble Q3(7.049610280856842141659e1);
2317 const SimdDouble Q2(-1.471791292232726029859e2);
2318 const SimdDouble Q1(1.395105614657485689735e2);
2319 const SimdDouble Q0(-4.918853881490881290097e1);
2321 const SimdDouble R4(2.967721961301243206100e-3);
2322 const SimdDouble R3(-5.634242780008963776856e-1);
2323 const SimdDouble R2(6.968710824104713396794e0);
2324 const SimdDouble R1(-2.556901049652824852289e1);
2325 const SimdDouble R0(2.853665548261061424989e1);
2327 const SimdDouble S3(-2.194779531642920639778e1);
2328 const SimdDouble S2(1.470656354026814941758e2);
2329 const SimdDouble S1(-3.838770957603691357202e2);
2330 const SimdDouble S0(3.424398657913078477438e2);
2333 SimdDouble zz, ww, z, q, w, zz2, ww2;
2338 SimdDouble nom, denom;
2339 SimdDBool mask, mask2;
2343 mask = (limit1 < xabs);
2351 RA = fma(R4, zz2, R2);
2352 RB = fma(R3, zz2, R1);
2353 RA = fma(RA, zz2, R0);
2354 RA = fma(RB, zz, RA);
2357 SB = fma(S3, zz2, S1);
2359 SA = fma(SA, zz2, S0);
2360 SA = fma(SB, zz, SA);
2363 PA = fma(P5, ww2, P3);
2364 PB = fma(P4, ww2, P2);
2365 PA = fma(PA, ww2, P1);
2366 PB = fma(PB, ww2, P0);
2367 PA = fma(PA, ww, PB);
2370 QB = fma(Q4, ww2, Q2);
2372 QA = fma(QA, ww2, Q1);
2373 QB = fma(QB, ww2, Q0);
2374 QA = fma(QA, ww, QB);
2379 nom = blend( PA, RA, mask );
2380 denom = blend( QA, SA, mask );
2382 mask2 = (limit2 < xabs);
2383 q = nom * maskzInv(denom, mask2);
2388 zz = fms(zz, q, morebits);
2395 z = blend( w, z, mask );
2397 z = blend( xabs, z, mask2 );
2404 /*! \brief SIMD double acos(x).
2406 * \param x The argument to evaluate acos for
2409 static inline SimdDouble gmx_simdcall
2412 const SimdDouble one(1.0);
2413 const SimdDouble half(0.5);
2414 const SimdDouble quarterpi0(7.85398163397448309616e-1);
2415 const SimdDouble quarterpi1(6.123233995736765886130e-17);
2418 SimdDouble z, z1, z2;
2421 z1 = half * (one - x);
2423 z = blend( x, z1, mask1 );
2429 z2 = quarterpi0 - z;
2430 z2 = z2 + quarterpi1;
2431 z2 = z2 + quarterpi0;
2433 z = blend(z2, z1, mask1);
2438 /*! \brief SIMD double asin(x).
2440 * \param x The argument to evaluate atan for
2441 * \result Atan(x), same argument/value range as standard math library.
2443 static inline SimdDouble gmx_simdcall
2446 // Same algorithm as cephes library
2447 const SimdDouble limit1(0.66);
2448 const SimdDouble limit2(2.41421356237309504880);
2449 const SimdDouble quarterpi(M_PI/4.0);
2450 const SimdDouble halfpi(M_PI/2.0);
2451 const SimdDouble mone(-1.0);
2452 const SimdDouble morebits1(0.5*6.123233995736765886130E-17);
2453 const SimdDouble morebits2(6.123233995736765886130E-17);
2455 const SimdDouble P4(-8.750608600031904122785E-1);
2456 const SimdDouble P3(-1.615753718733365076637E1);
2457 const SimdDouble P2(-7.500855792314704667340E1);
2458 const SimdDouble P1(-1.228866684490136173410E2);
2459 const SimdDouble P0(-6.485021904942025371773E1);
2461 const SimdDouble Q4(2.485846490142306297962E1);
2462 const SimdDouble Q3(1.650270098316988542046E2);
2463 const SimdDouble Q2(4.328810604912902668951E2);
2464 const SimdDouble Q1(4.853903996359136964868E2);
2465 const SimdDouble Q0(1.945506571482613964425E2);
2467 SimdDouble y, xabs, t1, t2;
2469 SimdDouble P_A, P_B, Q_A, Q_B;
2470 SimdDBool mask1, mask2;
2474 mask1 = (limit1 < xabs);
2475 mask2 = (limit2 < xabs);
2477 t1 = (xabs + mone) * maskzInv(xabs - mone, mask1);
2478 t2 = mone * maskzInv(xabs, mask2);
2480 y = selectByMask(quarterpi, mask1);
2481 y = blend(y, halfpi, mask2);
2482 xabs = blend(xabs, t1, mask1);
2483 xabs = blend(xabs, t2, mask2);
2488 P_A = fma(P4, z2, P2);
2489 P_B = fma(P3, z2, P1);
2490 P_A = fma(P_A, z2, P0);
2491 P_A = fma(P_B, z, P_A);
2494 Q_B = fma(Q4, z2, Q2);
2496 Q_A = fma(Q_A, z2, Q1);
2497 Q_B = fma(Q_B, z2, Q0);
2498 Q_A = fma(Q_A, z, Q_B);
2502 z = fma(z, xabs, xabs);
2504 t1 = selectByMask(morebits1, mask1);
2505 t1 = blend(t1, morebits2, mask2);
2515 /*! \brief SIMD double atan2(y,x).
2517 * \param y Y component of vector, any quartile
2518 * \param x X component of vector, any quartile
2519 * \result Atan(y,x), same argument/value range as standard math library.
2521 * \note This routine should provide correct results for all finite
2522 * non-zero or positive-zero arguments. However, negative zero arguments will
2523 * be treated as positive zero, which means the return value will deviate from
2524 * the standard math library atan2(y,x) for those cases. That should not be
2525 * of any concern in Gromacs, and in particular it will not affect calculations
2526 * of angles from vectors.
2528 static inline SimdDouble gmx_simdcall
2529 atan2(SimdDouble y, SimdDouble x)
2531 const SimdDouble pi(M_PI);
2532 const SimdDouble halfpi(M_PI/2.0);
2533 SimdDouble xinv, p, aoffset;
2534 SimdDBool mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
2536 mask_xnz = x != setZero();
2537 mask_ynz = y != setZero();
2538 mask_xlt0 = (x < setZero());
2539 mask_ylt0 = (y < setZero());
2541 aoffset = selectByNotMask(halfpi, mask_xnz);
2542 aoffset = selectByMask(aoffset, mask_ynz);
2544 aoffset = blend(aoffset, pi, mask_xlt0);
2545 aoffset = blend(aoffset, -aoffset, mask_ylt0);
2547 xinv = maskzInv(x, mask_xnz);
2556 /*! \brief Calculate the force correction due to PME analytically in SIMD double.
2558 * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
2559 * interaction distance and beta the ewald splitting parameters.
2560 * \result Correction factor to coulomb force.
2562 * This routine is meant to enable analytical evaluation of the
2563 * direct-space PME electrostatic force to avoid tables. For details, see the
2564 * single precision function.
2566 static inline SimdDouble gmx_simdcall
2567 pmeForceCorrection(SimdDouble z2)
2569 const SimdDouble FN10(-8.0072854618360083154e-14);
2570 const SimdDouble FN9(1.1859116242260148027e-11);
2571 const SimdDouble FN8(-8.1490406329798423616e-10);
2572 const SimdDouble FN7(3.4404793543907847655e-8);
2573 const SimdDouble FN6(-9.9471420832602741006e-7);
2574 const SimdDouble FN5(0.000020740315999115847456);
2575 const SimdDouble FN4(-0.00031991745139313364005);
2576 const SimdDouble FN3(0.0035074449373659008203);
2577 const SimdDouble FN2(-0.031750380176100813405);
2578 const SimdDouble FN1(0.13884101728898463426);
2579 const SimdDouble FN0(-0.75225277815249618847);
2581 const SimdDouble FD5(0.000016009278224355026701);
2582 const SimdDouble FD4(0.00051055686934806966046);
2583 const SimdDouble FD3(0.0081803507497974289008);
2584 const SimdDouble FD2(0.077181146026670287235);
2585 const SimdDouble FD1(0.41543303143712535988);
2586 const SimdDouble FD0(1.0);
2589 SimdDouble polyFN0, polyFN1, polyFD0, polyFD1;
2593 polyFD1 = fma(FD5, z4, FD3);
2594 polyFD1 = fma(polyFD1, z4, FD1);
2595 polyFD1 = polyFD1 * z2;
2596 polyFD0 = fma(FD4, z4, FD2);
2597 polyFD0 = fma(polyFD0, z4, FD0);
2598 polyFD0 = polyFD0 + polyFD1;
2600 polyFD0 = inv(polyFD0);
2602 polyFN0 = fma(FN10, z4, FN8);
2603 polyFN0 = fma(polyFN0, z4, FN6);
2604 polyFN0 = fma(polyFN0, z4, FN4);
2605 polyFN0 = fma(polyFN0, z4, FN2);
2606 polyFN0 = fma(polyFN0, z4, FN0);
2607 polyFN1 = fma(FN9, z4, FN7);
2608 polyFN1 = fma(polyFN1, z4, FN5);
2609 polyFN1 = fma(polyFN1, z4, FN3);
2610 polyFN1 = fma(polyFN1, z4, FN1);
2611 polyFN0 = fma(polyFN1, z2, polyFN0);
2614 return polyFN0 * polyFD0;
2619 /*! \brief Calculate the potential correction due to PME analytically in SIMD double.
2621 * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
2622 * interaction distance and beta the ewald splitting parameters.
2623 * \result Correction factor to coulomb force.
2625 * This routine is meant to enable analytical evaluation of the
2626 * direct-space PME electrostatic potential to avoid tables. For details, see the
2627 * single precision function.
2629 static inline SimdDouble gmx_simdcall
2630 pmePotentialCorrection(SimdDouble z2)
2632 const SimdDouble VN9(-9.3723776169321855475e-13);
2633 const SimdDouble VN8(1.2280156762674215741e-10);
2634 const SimdDouble VN7(-7.3562157912251309487e-9);
2635 const SimdDouble VN6(2.6215886208032517509e-7);
2636 const SimdDouble VN5(-4.9532491651265819499e-6);
2637 const SimdDouble VN4(0.00025907400778966060389);
2638 const SimdDouble VN3(0.0010585044856156469792);
2639 const SimdDouble VN2(0.045247661136833092885);
2640 const SimdDouble VN1(0.11643931522926034421);
2641 const SimdDouble VN0(1.1283791671726767970);
2643 const SimdDouble VD5(0.000021784709867336150342);
2644 const SimdDouble VD4(0.00064293662010911388448);
2645 const SimdDouble VD3(0.0096311444822588683504);
2646 const SimdDouble VD2(0.085608012351550627051);
2647 const SimdDouble VD1(0.43652499166614811084);
2648 const SimdDouble VD0(1.0);
2651 SimdDouble polyVN0, polyVN1, polyVD0, polyVD1;
2655 polyVD1 = fma(VD5, z4, VD3);
2656 polyVD0 = fma(VD4, z4, VD2);
2657 polyVD1 = fma(polyVD1, z4, VD1);
2658 polyVD0 = fma(polyVD0, z4, VD0);
2659 polyVD0 = fma(polyVD1, z2, polyVD0);
2661 polyVD0 = inv(polyVD0);
2663 polyVN1 = fma(VN9, z4, VN7);
2664 polyVN0 = fma(VN8, z4, VN6);
2665 polyVN1 = fma(polyVN1, z4, VN5);
2666 polyVN0 = fma(polyVN0, z4, VN4);
2667 polyVN1 = fma(polyVN1, z4, VN3);
2668 polyVN0 = fma(polyVN0, z4, VN2);
2669 polyVN1 = fma(polyVN1, z4, VN1);
2670 polyVN0 = fma(polyVN0, z4, VN0);
2671 polyVN0 = fma(polyVN1, z2, polyVN0);
2673 return polyVN0 * polyVD0;
2679 /*! \name SIMD math functions for double prec. data, single prec. accuracy
2681 * \note In some cases we do not need full double accuracy of individual
2682 * SIMD math functions, although the data is stored in double precision
2683 * SIMD registers. This might be the case for special algorithms, or
2684 * if the architecture does not support single precision.
2685 * Since the full double precision evaluation of math functions
2686 * typically require much more expensive polynomial approximations
2687 * these functions implement the algorithms used in the single precision
2688 * SIMD math functions, but they operate on double precision
2694 /*********************************************************************
2695 * SIMD MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
2696 *********************************************************************/
2698 /*! \brief Calculate 1/sqrt(x) for SIMD double, but in single accuracy.
2700 * \param x Argument that must be >0. This routine does not check arguments.
2701 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
2703 static inline SimdDouble gmx_simdcall
2704 invsqrtSingleAccuracy(SimdDouble x)
2706 SimdDouble lu = rsqrt(x);
2707 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2708 lu = rsqrtIter(lu, x);
2710 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2711 lu = rsqrtIter(lu, x);
2713 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2714 lu = rsqrtIter(lu, x);
2719 /*! \brief 1/sqrt(x) for masked-in entries of SIMD double, but in single accuracy.
2721 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
2722 * Illegal values in the masked-out elements will not lead to
2723 * floating-point exceptions.
2725 * \param x Argument that must be >0 for masked-in entries
2727 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
2728 * entry was not masked, and 0.0 for masked-out entries.
2730 static inline SimdDouble
2731 maskzInvsqrtSingleAccuracy(SimdDouble x, SimdDBool m)
2733 SimdDouble lu = maskzRsqrt(x, m);
2734 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2735 lu = rsqrtIter(lu, x);
2737 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2738 lu = rsqrtIter(lu, x);
2740 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2741 lu = rsqrtIter(lu, x);
2746 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles, but single accuracy.
2748 * \param x0 First set of arguments, x0 must be positive - no argument checking.
2749 * \param x1 Second set of arguments, x1 must be positive - no argument checking.
2750 * \param[out] out0 Result 1/sqrt(x0)
2751 * \param[out] out1 Result 1/sqrt(x1)
2753 * In particular for double precision we can sometimes calculate square root
2754 * pairs slightly faster by using single precision until the very last step.
2756 static inline void gmx_simdcall
2757 invsqrtPairSingleAccuracy(SimdDouble x0, SimdDouble x1,
2758 SimdDouble *out0, SimdDouble *out1)
2760 #if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH) && (GMX_SIMD_RSQRT_BITS < 22)
2761 SimdFloat xf = cvtDD2F(x0, x1);
2762 SimdFloat luf = rsqrt(xf);
2763 SimdDouble lu0, lu1;
2764 // Intermediate target is single - mantissa+1 bits
2765 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2766 luf = rsqrtIter(luf, xf);
2768 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2769 luf = rsqrtIter(luf, xf);
2771 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2772 luf = rsqrtIter(luf, xf);
2774 cvtF2DD(luf, &lu0, &lu1);
2775 // We now have single-precision accuracy values in lu0/lu1
2779 *out0 = invsqrtSingleAccuracy(x0);
2780 *out1 = invsqrtSingleAccuracy(x1);
2784 /*! \brief Calculate 1/x for SIMD double, but in single accuracy.
2786 * \param x Argument that must be nonzero. This routine does not check arguments.
2787 * \return 1/x. Result is undefined if your argument was invalid.
2789 static inline SimdDouble gmx_simdcall
2790 invSingleAccuracy(SimdDouble x)
2792 SimdDouble lu = rcp(x);
2793 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2794 lu = rcpIter(lu, x);
2796 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2797 lu = rcpIter(lu, x);
2799 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2800 lu = rcpIter(lu, x);
2805 /*! \brief 1/x for masked entries of SIMD double, single accuracy.
2807 * \param x Argument that must be nonzero for non-masked entries.
2809 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
2811 static inline SimdDouble gmx_simdcall
2812 maskzInvSingleAccuracy(SimdDouble x, SimdDBool m)
2814 SimdDouble lu = maskzRcp(x, m);
2815 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
2816 lu = rcpIter(lu, x);
2818 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2819 lu = rcpIter(lu, x);
2821 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
2822 lu = rcpIter(lu, x);
2828 /*! \brief Calculate sqrt(x) (correct for 0.0) for SIMD double, single accuracy.
2830 * \param x Argument that must be >=0.
2831 * \return sqrt(x). If x<float_min, the result will correctly be set to 0.
2832 * The result is undefined if the input value is negative.
2834 static inline SimdDouble gmx_simdcall
2835 sqrtSingleAccuracy(SimdDouble x)
2837 return x * maskzInvsqrtSingleAccuracy(x, SimdDouble(GMX_FLOAT_MIN) <= x);
2840 /*! \brief SIMD log(x). Double precision SIMD data, single accuracy.
2842 * \param x Argument, should be >0.
2843 * \result The natural logarithm of x. Undefined if argument is invalid.
2845 static inline SimdDouble gmx_simdcall
2846 logSingleAccuracy(SimdDouble x)
2848 const SimdDouble one(1.0);
2849 const SimdDouble two(2.0);
2850 const SimdDouble sqrt2(std::sqrt(2.0));
2851 const SimdDouble corr(0.693147180559945286226764);
2852 const SimdDouble CL9(0.2371599674224853515625);
2853 const SimdDouble CL7(0.285279005765914916992188);
2854 const SimdDouble CL5(0.400005519390106201171875);
2855 const SimdDouble CL3(0.666666567325592041015625);
2856 const SimdDouble CL1(2.0);
2857 SimdDouble fexp, x2, p;
2861 x = frexp(x, &iexp);
2862 fexp = cvtI2R(iexp);
2865 // Adjust to non-IEEE format for x<sqrt(2): exponent -= 1, mantissa *= 2.0
2866 fexp = fexp - selectByMask(one, mask);
2867 x = x * blend(one, two, mask);
2869 x = (x - one) * invSingleAccuracy( x + one );
2872 p = fma(CL9, x2, CL7);
2873 p = fma(p, x2, CL5);
2874 p = fma(p, x2, CL3);
2875 p = fma(p, x2, CL1);
2876 p = fma(p, x, corr * fexp);
2881 /*! \brief SIMD 2^x. Double precision SIMD data, single accuracy.
2883 * \param x Argument.
2884 * \result 2^x. Undefined if input argument caused overflow.
2886 static inline SimdDouble gmx_simdcall
2887 exp2SingleAccuracy(SimdDouble x)
2889 // Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127.
2890 const SimdDouble arglimit = SimdDouble(126.0);
2891 const SimdDouble CC6(0.0001534581200287996416911311);
2892 const SimdDouble CC5(0.001339993121934088894618990);
2893 const SimdDouble CC4(0.009618488957115180159497841);
2894 const SimdDouble CC3(0.05550328776964726865751735);
2895 const SimdDouble CC2(0.2402264689063408646490722);
2896 const SimdDouble CC1(0.6931472057372680777553816);
2897 const SimdDouble one(1.0);
2901 SimdDBool valuemask;
2906 valuemask = (abs(x) <= arglimit);
2909 p = fma(CC6, x, CC5);
2916 x = selectByMask(x, valuemask);
2921 /*! \brief SIMD exp(x). Double precision SIMD data, single accuracy.
2923 * \param x Argument.
2924 * \result exp(x). Undefined if input argument caused overflow.
2926 static inline SimdDouble gmx_simdcall
2927 expSingleAccuracy(SimdDouble x)
2929 const SimdDouble argscale(1.44269504088896341);
2930 // Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127
2931 const SimdDouble arglimit(126.0);
2932 const SimdDouble invargscale(-0.69314718055994528623);
2933 const SimdDouble CC4(0.00136324646882712841033936);
2934 const SimdDouble CC3(0.00836596917361021041870117);
2935 const SimdDouble CC2(0.0416710823774337768554688);
2936 const SimdDouble CC1(0.166665524244308471679688);
2937 const SimdDouble CC0(0.499999850988388061523438);
2938 const SimdDouble one(1.0);
2941 SimdDBool valuemask;
2946 intpart = round(y); // use same rounding algorithm here
2947 valuemask = (abs(y) <= arglimit);
2949 // Extended precision arithmetics not needed since
2950 // we have double precision and only need single accuracy.
2951 x = fma(invargscale, intpart, x);
2953 p = fma(CC4, x, CC3);
2960 x = selectByMask(x, valuemask);
2964 /*! \brief SIMD erf(x). Double precision SIMD data, single accuracy.
2966 * \param x The value to calculate erf(x) for.
2969 * This routine achieves very close to single precision, but we do not care about
2970 * the last bit or the subnormal result range.
2972 static inline SimdDouble gmx_simdcall
2973 erfSingleAccuracy(SimdDouble x)
2975 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
2976 const SimdDouble CA6(7.853861353153693e-5);
2977 const SimdDouble CA5(-8.010193625184903e-4);
2978 const SimdDouble CA4(5.188327685732524e-3);
2979 const SimdDouble CA3(-2.685381193529856e-2);
2980 const SimdDouble CA2(1.128358514861418e-1);
2981 const SimdDouble CA1(-3.761262582423300e-1);
2982 const SimdDouble CA0(1.128379165726710);
2983 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
2984 const SimdDouble CB9(-0.0018629930017603923);
2985 const SimdDouble CB8(0.003909821287598495);
2986 const SimdDouble CB7(-0.0052094582210355615);
2987 const SimdDouble CB6(0.005685614362160572);
2988 const SimdDouble CB5(-0.0025367682853477272);
2989 const SimdDouble CB4(-0.010199799682318782);
2990 const SimdDouble CB3(0.04369575504816542);
2991 const SimdDouble CB2(-0.11884063474674492);
2992 const SimdDouble CB1(0.2732120154030589);
2993 const SimdDouble CB0(0.42758357702025784);
2994 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
2995 const SimdDouble CC10(-0.0445555913112064);
2996 const SimdDouble CC9(0.21376355144663348);
2997 const SimdDouble CC8(-0.3473187200259257);
2998 const SimdDouble CC7(0.016690861551248114);
2999 const SimdDouble CC6(0.7560973182491192);
3000 const SimdDouble CC5(-1.2137903600145787);
3001 const SimdDouble CC4(0.8411872321232948);
3002 const SimdDouble CC3(-0.08670413896296343);
3003 const SimdDouble CC2(-0.27124782687240334);
3004 const SimdDouble CC1(-0.0007502488047806069);
3005 const SimdDouble CC0(0.5642114853803148);
3006 const SimdDouble one(1.0);
3007 const SimdDouble two(2.0);
3009 SimdDouble x2, x4, y;
3010 SimdDouble t, t2, w, w2;
3011 SimdDouble pA0, pA1, pB0, pB1, pC0, pC1;
3013 SimdDouble res_erf, res_erfc, res;
3014 SimdDBool mask, msk_erf;
3020 pA0 = fma(CA6, x4, CA4);
3021 pA1 = fma(CA5, x4, CA3);
3022 pA0 = fma(pA0, x4, CA2);
3023 pA1 = fma(pA1, x4, CA1);
3025 pA0 = fma(pA1, x2, pA0);
3026 // Constant term must come last for precision reasons
3033 msk_erf = (SimdDouble(0.75) <= y);
3034 t = maskzInvSingleAccuracy(y, msk_erf);
3039 expmx2 = expSingleAccuracy( -y*y );
3041 pB1 = fma(CB9, w2, CB7);
3042 pB0 = fma(CB8, w2, CB6);
3043 pB1 = fma(pB1, w2, CB5);
3044 pB0 = fma(pB0, w2, CB4);
3045 pB1 = fma(pB1, w2, CB3);
3046 pB0 = fma(pB0, w2, CB2);
3047 pB1 = fma(pB1, w2, CB1);
3048 pB0 = fma(pB0, w2, CB0);
3049 pB0 = fma(pB1, w, pB0);
3051 pC0 = fma(CC10, t2, CC8);
3052 pC1 = fma(CC9, t2, CC7);
3053 pC0 = fma(pC0, t2, CC6);
3054 pC1 = fma(pC1, t2, CC5);
3055 pC0 = fma(pC0, t2, CC4);
3056 pC1 = fma(pC1, t2, CC3);
3057 pC0 = fma(pC0, t2, CC2);
3058 pC1 = fma(pC1, t2, CC1);
3060 pC0 = fma(pC0, t2, CC0);
3061 pC0 = fma(pC1, t, pC0);
3064 // Select pB0 or pC0 for erfc()
3066 res_erfc = blend(pB0, pC0, mask);
3067 res_erfc = res_erfc * expmx2;
3069 // erfc(x<0) = 2-erfc(|x|)
3070 mask = (x < setZero());
3071 res_erfc = blend(res_erfc, two - res_erfc, mask);
3073 // Select erf() or erfc()
3074 mask = (y < SimdDouble(0.75));
3075 res = blend(one - res_erfc, res_erf, mask);
3080 /*! \brief SIMD erfc(x). Double precision SIMD data, single accuracy.
3082 * \param x The value to calculate erfc(x) for.
3085 * This routine achieves singleprecision (bar the last bit) over most of the
3086 * input range, but for large arguments where the result is getting close
3087 * to the minimum representable numbers we accept slightly larger errors
3088 * (think results that are in the ballpark of 10^-30) since that is not
3091 static inline SimdDouble gmx_simdcall
3092 erfcSingleAccuracy(SimdDouble x)
3094 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
3095 const SimdDouble CA6(7.853861353153693e-5);
3096 const SimdDouble CA5(-8.010193625184903e-4);
3097 const SimdDouble CA4(5.188327685732524e-3);
3098 const SimdDouble CA3(-2.685381193529856e-2);
3099 const SimdDouble CA2(1.128358514861418e-1);
3100 const SimdDouble CA1(-3.761262582423300e-1);
3101 const SimdDouble CA0(1.128379165726710);
3102 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
3103 const SimdDouble CB9(-0.0018629930017603923);
3104 const SimdDouble CB8(0.003909821287598495);
3105 const SimdDouble CB7(-0.0052094582210355615);
3106 const SimdDouble CB6(0.005685614362160572);
3107 const SimdDouble CB5(-0.0025367682853477272);
3108 const SimdDouble CB4(-0.010199799682318782);
3109 const SimdDouble CB3(0.04369575504816542);
3110 const SimdDouble CB2(-0.11884063474674492);
3111 const SimdDouble CB1(0.2732120154030589);
3112 const SimdDouble CB0(0.42758357702025784);
3113 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
3114 const SimdDouble CC10(-0.0445555913112064);
3115 const SimdDouble CC9(0.21376355144663348);
3116 const SimdDouble CC8(-0.3473187200259257);
3117 const SimdDouble CC7(0.016690861551248114);
3118 const SimdDouble CC6(0.7560973182491192);
3119 const SimdDouble CC5(-1.2137903600145787);
3120 const SimdDouble CC4(0.8411872321232948);
3121 const SimdDouble CC3(-0.08670413896296343);
3122 const SimdDouble CC2(-0.27124782687240334);
3123 const SimdDouble CC1(-0.0007502488047806069);
3124 const SimdDouble CC0(0.5642114853803148);
3125 const SimdDouble one(1.0);
3126 const SimdDouble two(2.0);
3128 SimdDouble x2, x4, y;
3129 SimdDouble t, t2, w, w2;
3130 SimdDouble pA0, pA1, pB0, pB1, pC0, pC1;
3132 SimdDouble res_erf, res_erfc, res;
3133 SimdDBool mask, msk_erf;
3139 pA0 = fma(CA6, x4, CA4);
3140 pA1 = fma(CA5, x4, CA3);
3141 pA0 = fma(pA0, x4, CA2);
3142 pA1 = fma(pA1, x4, CA1);
3144 pA0 = fma(pA0, x4, pA1);
3145 // Constant term must come last for precision reasons
3152 msk_erf = (SimdDouble(0.75) <= y);
3153 t = maskzInvSingleAccuracy(y, msk_erf);
3158 expmx2 = expSingleAccuracy( -y*y );
3160 pB1 = fma(CB9, w2, CB7);
3161 pB0 = fma(CB8, w2, CB6);
3162 pB1 = fma(pB1, w2, CB5);
3163 pB0 = fma(pB0, w2, CB4);
3164 pB1 = fma(pB1, w2, CB3);
3165 pB0 = fma(pB0, w2, CB2);
3166 pB1 = fma(pB1, w2, CB1);
3167 pB0 = fma(pB0, w2, CB0);
3168 pB0 = fma(pB1, w, pB0);
3170 pC0 = fma(CC10, t2, CC8);
3171 pC1 = fma(CC9, t2, CC7);
3172 pC0 = fma(pC0, t2, CC6);
3173 pC1 = fma(pC1, t2, CC5);
3174 pC0 = fma(pC0, t2, CC4);
3175 pC1 = fma(pC1, t2, CC3);
3176 pC0 = fma(pC0, t2, CC2);
3177 pC1 = fma(pC1, t2, CC1);
3179 pC0 = fma(pC0, t2, CC0);
3180 pC0 = fma(pC1, t, pC0);
3183 // Select pB0 or pC0 for erfc()
3185 res_erfc = blend(pB0, pC0, mask);
3186 res_erfc = res_erfc * expmx2;
3188 // erfc(x<0) = 2-erfc(|x|)
3189 mask = (x < setZero());
3190 res_erfc = blend(res_erfc, two - res_erfc, mask);
3192 // Select erf() or erfc()
3193 mask = (y < SimdDouble(0.75));
3194 res = blend(res_erfc, one - res_erf, mask);
3199 /*! \brief SIMD sin \& cos. Double precision SIMD data, single accuracy.
3201 * \param x The argument to evaluate sin/cos for
3202 * \param[out] sinval Sin(x)
3203 * \param[out] cosval Cos(x)
3205 static inline void gmx_simdcall
3206 sinCosSingleAccuracy(SimdDouble x, SimdDouble *sinval, SimdDouble *cosval)
3208 // Constants to subtract Pi/4*x from y while minimizing precision loss
3209 const SimdDouble argred0(2*0.78539816290140151978);
3210 const SimdDouble argred1(2*4.9604678871439933374e-10);
3211 const SimdDouble argred2(2*1.1258708853173288931e-18);
3212 const SimdDouble two_over_pi(2.0/M_PI);
3213 const SimdDouble const_sin2(-1.9515295891e-4);
3214 const SimdDouble const_sin1( 8.3321608736e-3);
3215 const SimdDouble const_sin0(-1.6666654611e-1);
3216 const SimdDouble const_cos2( 2.443315711809948e-5);
3217 const SimdDouble const_cos1(-1.388731625493765e-3);
3218 const SimdDouble const_cos0( 4.166664568298827e-2);
3220 const SimdDouble half(0.5);
3221 const SimdDouble one(1.0);
3222 SimdDouble ssign, csign;
3223 SimdDouble x2, y, z, psin, pcos, sss, ccc;
3226 #if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
3227 const SimdDInt32 ione(1);
3228 const SimdDInt32 itwo(2);
3231 z = x * two_over_pi;
3235 mask = cvtIB2B((iy & ione) == setZero());
3236 ssign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B((iy & itwo) == itwo));
3237 csign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
3239 const SimdDouble quarter(0.25);
3240 const SimdDouble minusquarter(-0.25);
3242 SimdDBool m1, m2, m3;
3244 /* The most obvious way to find the arguments quadrant in the unit circle
3245 * to calculate the sign is to use integer arithmetic, but that is not
3246 * present in all SIMD implementations. As an alternative, we have devised a
3247 * pure floating-point algorithm that uses truncation for argument reduction
3248 * so that we get a new value 0<=q<1 over the unit circle, and then
3249 * do floating-point comparisons with fractions. This is likely to be
3250 * slightly slower (~10%) due to the longer latencies of floating-point, so
3251 * we only use it when integer SIMD arithmetic is not present.
3255 // It is critical that half-way cases are rounded down
3256 z = fma(x, two_over_pi, half);
3260 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
3261 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
3262 * This removes the 2*Pi periodicity without using any integer arithmetic.
3263 * First check if y had the value 2 or 3, set csign if true.
3266 /* If we have logical operations we can work directly on the signbit, which
3267 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
3268 * Thus, if you are altering defines to debug alternative code paths, the
3269 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
3270 * active or inactive - you will get errors if only one is used.
3272 # if GMX_SIMD_HAVE_LOGICAL
3273 ssign = ssign & SimdDouble(GMX_DOUBLE_NEGZERO);
3274 csign = andNot(q, SimdDouble(GMX_DOUBLE_NEGZERO));
3275 ssign = ssign ^ csign;
3277 ssign = copysign(SimdDouble(1.0), ssign);
3278 csign = copysign(SimdDouble(1.0), q);
3280 ssign = ssign * csign; // swap ssign if csign was set.
3282 // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
3283 m1 = (q < minusquarter);
3284 m2 = (setZero() <= q);
3288 // where mask is FALSE, swap sign.
3289 csign = csign * blend(SimdDouble(-1.0), one, mask);
3291 x = fnma(y, argred0, x);
3292 x = fnma(y, argred1, x);
3293 x = fnma(y, argred2, x);
3296 psin = fma(const_sin2, x2, const_sin1);
3297 psin = fma(psin, x2, const_sin0);
3298 psin = fma(psin, x * x2, x);
3299 pcos = fma(const_cos2, x2, const_cos1);
3300 pcos = fma(pcos, x2, const_cos0);
3301 pcos = fms(pcos, x2, half);
3302 pcos = fma(pcos, x2, one);
3304 sss = blend(pcos, psin, mask);
3305 ccc = blend(psin, pcos, mask);
3306 // See comment for GMX_SIMD_HAVE_LOGICAL section above.
3307 #if GMX_SIMD_HAVE_LOGICAL
3308 *sinval = sss ^ ssign;
3309 *cosval = ccc ^ csign;
3311 *sinval = sss * ssign;
3312 *cosval = ccc * csign;
3316 /*! \brief SIMD sin(x). Double precision SIMD data, single accuracy.
3318 * \param x The argument to evaluate sin for
3321 * \attention Do NOT call both sin & cos if you need both results, since each of them
3322 * will then call \ref sincos and waste a factor 2 in performance.
3324 static inline SimdDouble gmx_simdcall
3325 sinSingleAccuracy(SimdDouble x)
3328 sinCosSingleAccuracy(x, &s, &c);
3332 /*! \brief SIMD cos(x). Double precision SIMD data, single accuracy.
3334 * \param x The argument to evaluate cos for
3337 * \attention Do NOT call both sin & cos if you need both results, since each of them
3338 * will then call \ref sincos and waste a factor 2 in performance.
3340 static inline SimdDouble gmx_simdcall
3341 cosSingleAccuracy(SimdDouble x)
3344 sinCosSingleAccuracy(x, &s, &c);
3348 /*! \brief SIMD tan(x). Double precision SIMD data, single accuracy.
3350 * \param x The argument to evaluate tan for
3353 static inline SimdDouble gmx_simdcall
3354 tanSingleAccuracy(SimdDouble x)
3356 const SimdDouble argred0(2*0.78539816290140151978);
3357 const SimdDouble argred1(2*4.9604678871439933374e-10);
3358 const SimdDouble argred2(2*1.1258708853173288931e-18);
3359 const SimdDouble two_over_pi(2.0/M_PI);
3360 const SimdDouble CT6(0.009498288995810566122993911);
3361 const SimdDouble CT5(0.002895755790837379295226923);
3362 const SimdDouble CT4(0.02460087336161924491836265);
3363 const SimdDouble CT3(0.05334912882656359828045988);
3364 const SimdDouble CT2(0.1333989091464957704418495);
3365 const SimdDouble CT1(0.3333307599244198227797507);
3367 SimdDouble x2, p, y, z;
3370 #if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
3374 z = x * two_over_pi;
3377 mask = cvtIB2B((iy & ione) == ione);
3379 x = fnma(y, argred0, x);
3380 x = fnma(y, argred1, x);
3381 x = fnma(y, argred2, x);
3382 x = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), mask) ^ x;
3384 const SimdDouble quarter(0.25);
3385 const SimdDouble half(0.5);
3386 const SimdDouble threequarter(0.75);
3388 SimdDBool m1, m2, m3;
3391 z = fma(w, two_over_pi, half);
3395 m1 = (quarter <= q);
3397 m3 = (threequarter <= q);
3400 w = fnma(y, argred0, w);
3401 w = fnma(y, argred1, w);
3402 w = fnma(y, argred2, w);
3404 w = blend(w, -w, mask);
3405 x = w * copysign( SimdDouble(1.0), x );
3408 p = fma(CT6, x2, CT5);
3409 p = fma(p, x2, CT4);
3410 p = fma(p, x2, CT3);
3411 p = fma(p, x2, CT2);
3412 p = fma(p, x2, CT1);
3413 p = fma(x2, p * x, x);
3415 p = blend( p, maskzInvSingleAccuracy(p, mask), mask);
3419 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
3421 * \param x The argument to evaluate asin for
3424 static inline SimdDouble gmx_simdcall
3425 asinSingleAccuracy(SimdDouble x)
3427 const SimdDouble limitlow(1e-4);
3428 const SimdDouble half(0.5);
3429 const SimdDouble one(1.0);
3430 const SimdDouble halfpi(M_PI/2.0);
3431 const SimdDouble CC5(4.2163199048E-2);
3432 const SimdDouble CC4(2.4181311049E-2);
3433 const SimdDouble CC3(4.5470025998E-2);
3434 const SimdDouble CC2(7.4953002686E-2);
3435 const SimdDouble CC1(1.6666752422E-1);
3437 SimdDouble z, z1, z2, q, q1, q2;
3439 SimdDBool mask, mask2;
3442 mask = (half < xabs);
3443 z1 = half * (one - xabs);
3444 mask2 = (xabs < one);
3445 q1 = z1 * maskzInvsqrtSingleAccuracy(z1, mask2);
3448 z = blend(z2, z1, mask);
3449 q = blend(q2, q1, mask);
3452 pA = fma(CC5, z2, CC3);
3453 pB = fma(CC4, z2, CC2);
3454 pA = fma(pA, z2, CC1);
3456 z = fma(pB, z2, pA);
3460 z = blend(z, q2, mask);
3462 mask = (limitlow < xabs);
3463 z = blend( xabs, z, mask );
3469 /*! \brief SIMD acos(x). Double precision SIMD data, single accuracy.
3471 * \param x The argument to evaluate acos for
3474 static inline SimdDouble gmx_simdcall
3475 acosSingleAccuracy(SimdDouble x)
3477 const SimdDouble one(1.0);
3478 const SimdDouble half(0.5);
3479 const SimdDouble pi(M_PI);
3480 const SimdDouble halfpi(M_PI/2.0);
3482 SimdDouble z, z1, z2, z3;
3483 SimdDBool mask1, mask2, mask3;
3486 mask1 = (half < xabs);
3487 mask2 = (setZero() < x);
3489 z = half * (one - xabs);
3490 mask3 = (xabs < one);
3491 z = z * maskzInvsqrtSingleAccuracy(z, mask3);
3492 z = blend(x, z, mask1);
3493 z = asinSingleAccuracy(z);
3498 z = blend(z1, z2, mask2);
3499 z = blend(z3, z, mask1);
3504 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
3506 * \param x The argument to evaluate atan for
3507 * \result Atan(x), same argument/value range as standard math library.
3509 static inline SimdDouble gmx_simdcall
3510 atanSingleAccuracy(SimdDouble x)
3512 const SimdDouble halfpi(M_PI/2);
3513 const SimdDouble CA17(0.002823638962581753730774);
3514 const SimdDouble CA15(-0.01595690287649631500244);
3515 const SimdDouble CA13(0.04250498861074447631836);
3516 const SimdDouble CA11(-0.07489009201526641845703);
3517 const SimdDouble CA9(0.1063479334115982055664);
3518 const SimdDouble CA7(-0.1420273631811141967773);
3519 const SimdDouble CA5(0.1999269574880599975585);
3520 const SimdDouble CA3(-0.3333310186862945556640);
3521 SimdDouble x2, x3, x4, pA, pB;
3522 SimdDBool mask, mask2;
3524 mask = (x < setZero());
3526 mask2 = (SimdDouble(1.0) < x);
3527 x = blend(x, maskzInvSingleAccuracy(x, mask2), mask2);
3532 pA = fma(CA17, x4, CA13);
3533 pB = fma(CA15, x4, CA11);
3534 pA = fma(pA, x4, CA9);
3535 pB = fma(pB, x4, CA7);
3536 pA = fma(pA, x4, CA5);
3537 pB = fma(pB, x4, CA3);
3538 pA = fma(pA, x2, pB);
3539 pA = fma(pA, x3, x);
3541 pA = blend(pA, halfpi - pA, mask2);
3542 pA = blend(pA, -pA, mask);
3547 /*! \brief SIMD atan2(y,x). Double precision SIMD data, single accuracy.
3549 * \param y Y component of vector, any quartile
3550 * \param x X component of vector, any quartile
3551 * \result Atan(y,x), same argument/value range as standard math library.
3553 * \note This routine should provide correct results for all finite
3554 * non-zero or positive-zero arguments. However, negative zero arguments will
3555 * be treated as positive zero, which means the return value will deviate from
3556 * the standard math library atan2(y,x) for those cases. That should not be
3557 * of any concern in Gromacs, and in particular it will not affect calculations
3558 * of angles from vectors.
3560 static inline SimdDouble gmx_simdcall
3561 atan2SingleAccuracy(SimdDouble y, SimdDouble x)
3563 const SimdDouble pi(M_PI);
3564 const SimdDouble halfpi(M_PI/2.0);
3565 SimdDouble xinv, p, aoffset;
3566 SimdDBool mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
3568 mask_xnz = x != setZero();
3569 mask_ynz = y != setZero();
3570 mask_xlt0 = (x < setZero());
3571 mask_ylt0 = (y < setZero());
3573 aoffset = selectByNotMask(halfpi, mask_xnz);
3574 aoffset = selectByMask(aoffset, mask_ynz);
3576 aoffset = blend(aoffset, pi, mask_xlt0);
3577 aoffset = blend(aoffset, -aoffset, mask_ylt0);
3579 xinv = maskzInvSingleAccuracy(x, mask_xnz);
3581 p = atanSingleAccuracy(p);
3587 /*! \brief Analytical PME force correction, double SIMD data, single accuracy.
3589 * \param z2 \f$(r \beta)^2\f$ - see below for details.
3590 * \result Correction factor to coulomb force - see below for details.
3592 * This routine is meant to enable analytical evaluation of the
3593 * direct-space PME electrostatic force to avoid tables.
3595 * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
3596 * are some problems evaluating that:
3598 * First, the error function is difficult (read: expensive) to
3599 * approxmiate accurately for intermediate to large arguments, and
3600 * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
3601 * Second, we now try to avoid calculating potentials in Gromacs but
3602 * use forces directly.
3604 * We can simply things slight by noting that the PME part is really
3605 * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
3607 * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
3609 * The first term we already have from the inverse square root, so
3610 * that we can leave out of this routine.
3612 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
3613 * the argument \f$beta r\f$ will be in the range 0.15 to ~4. Use your
3614 * favorite plotting program to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is
3617 * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
3618 * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
3619 * then only use even powers. This is another minor optimization, since
3620 * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
3621 * the vector between the two atoms to get the vectorial force. The
3622 * fastest flops are the ones we can avoid calculating!
3624 * So, here's how it should be used:
3626 * 1. Calculate \f$r^2\f$.
3627 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
3628 * 3. Evaluate this routine with \f$z^2\f$ as the argument.
3629 * 4. The return value is the expression:
3632 * \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
3635 * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
3638 * \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
3641 * or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
3644 * \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
3647 * With a bit of math exercise you should be able to confirm that
3651 * \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
3654 * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
3655 * and you have your force (divided by \f$r\f$). A final multiplication
3656 * with the vector connecting the two particles and you have your
3657 * vectorial force to add to the particles.
3659 * This approximation achieves an accuracy slightly lower than 1e-6; when
3660 * added to \f$1/r\f$ the error will be insignificant.
3663 static SimdDouble gmx_simdcall
3664 pmeForceCorrectionSingleAccuracy(SimdDouble z2)
3666 const SimdDouble FN6(-1.7357322914161492954e-8);
3667 const SimdDouble FN5(1.4703624142580877519e-6);
3668 const SimdDouble FN4(-0.000053401640219807709149);
3669 const SimdDouble FN3(0.0010054721316683106153);
3670 const SimdDouble FN2(-0.019278317264888380590);
3671 const SimdDouble FN1(0.069670166153766424023);
3672 const SimdDouble FN0(-0.75225204789749321333);
3674 const SimdDouble FD4(0.0011193462567257629232);
3675 const SimdDouble FD3(0.014866955030185295499);
3676 const SimdDouble FD2(0.11583842382862377919);
3677 const SimdDouble FD1(0.50736591960530292870);
3678 const SimdDouble FD0(1.0);
3681 SimdDouble polyFN0, polyFN1, polyFD0, polyFD1;
3685 polyFD0 = fma(FD4, z4, FD2);
3686 polyFD1 = fma(FD3, z4, FD1);
3687 polyFD0 = fma(polyFD0, z4, FD0);
3688 polyFD0 = fma(polyFD1, z2, polyFD0);
3690 polyFD0 = invSingleAccuracy(polyFD0);
3692 polyFN0 = fma(FN6, z4, FN4);
3693 polyFN1 = fma(FN5, z4, FN3);
3694 polyFN0 = fma(polyFN0, z4, FN2);
3695 polyFN1 = fma(polyFN1, z4, FN1);
3696 polyFN0 = fma(polyFN0, z4, FN0);
3697 polyFN0 = fma(polyFN1, z2, polyFN0);
3699 return polyFN0 * polyFD0;
3704 /*! \brief Analytical PME potential correction, double SIMD data, single accuracy.
3706 * \param z2 \f$(r \beta)^2\f$ - see below for details.
3707 * \result Correction factor to coulomb potential - see below for details.
3709 * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
3710 * as the input argument.
3712 * Here's how it should be used:
3714 * 1. Calculate \f$r^2\f$.
3715 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
3716 * 3. Evaluate this routine with z^2 as the argument.
3717 * 4. The return value is the expression:
3720 * \frac{\mbox{erf}(z)}{z}
3723 * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
3726 * \frac{\mbox{erf}(r \beta)}{r}
3729 * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
3730 * and you have your potential.
3732 * This approximation achieves an accuracy slightly lower than 1e-6; when
3733 * added to \f$1/r\f$ the error will be insignificant.
3735 static SimdDouble gmx_simdcall
3736 pmePotentialCorrectionSingleAccuracy(SimdDouble z2)
3738 const SimdDouble VN6(1.9296833005951166339e-8);
3739 const SimdDouble VN5(-1.4213390571557850962e-6);
3740 const SimdDouble VN4(0.000041603292906656984871);
3741 const SimdDouble VN3(-0.00013134036773265025626);
3742 const SimdDouble VN2(0.038657983986041781264);
3743 const SimdDouble VN1(0.11285044772717598220);
3744 const SimdDouble VN0(1.1283802385263030286);
3746 const SimdDouble VD3(0.0066752224023576045451);
3747 const SimdDouble VD2(0.078647795836373922256);
3748 const SimdDouble VD1(0.43336185284710920150);
3749 const SimdDouble VD0(1.0);
3752 SimdDouble polyVN0, polyVN1, polyVD0, polyVD1;
3756 polyVD1 = fma(VD3, z4, VD1);
3757 polyVD0 = fma(VD2, z4, VD0);
3758 polyVD0 = fma(polyVD1, z2, polyVD0);
3760 polyVD0 = invSingleAccuracy(polyVD0);
3762 polyVN0 = fma(VN6, z4, VN4);
3763 polyVN1 = fma(VN5, z4, VN3);
3764 polyVN0 = fma(polyVN0, z4, VN2);
3765 polyVN1 = fma(polyVN1, z4, VN1);
3766 polyVN0 = fma(polyVN0, z4, VN0);
3767 polyVN0 = fma(polyVN1, z2, polyVN0);
3769 return polyVN0 * polyVD0;
3775 /*! \name SIMD4 math functions
3777 * \note Only a subset of the math functions are implemented for SIMD4.
3782 #if GMX_SIMD4_HAVE_FLOAT
3784 /*************************************************************************
3785 * SINGLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
3786 *************************************************************************/
3788 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 float.
3790 * This is a low-level routine that should only be used by SIMD math routine
3791 * that evaluates the inverse square root.
3793 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
3794 * \param x The reference (starting) value x for which we want 1/sqrt(x).
3795 * \return An improved approximation with roughly twice as many bits of accuracy.
3797 static inline Simd4Float gmx_simdcall
3798 rsqrtIter(Simd4Float lu, Simd4Float x)
3800 Simd4Float tmp1 = x*lu;
3801 Simd4Float tmp2 = Simd4Float(-0.5f)*lu;
3802 tmp1 = fma(tmp1, lu, Simd4Float(-3.0f));
3806 /*! \brief Calculate 1/sqrt(x) for SIMD4 float.
3808 * \param x Argument that must be >0. This routine does not check arguments.
3809 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
3811 static inline Simd4Float gmx_simdcall
3812 invsqrt(Simd4Float x)
3814 Simd4Float lu = rsqrt(x);
3815 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3816 lu = rsqrtIter(lu, x);
3818 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3819 lu = rsqrtIter(lu, x);
3821 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3822 lu = rsqrtIter(lu, x);
3828 #endif // GMX_SIMD4_HAVE_FLOAT
3832 #if GMX_SIMD4_HAVE_DOUBLE
3833 /*************************************************************************
3834 * DOUBLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
3835 *************************************************************************/
3837 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 double.
3839 * This is a low-level routine that should only be used by SIMD math routine
3840 * that evaluates the inverse square root.
3842 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
3843 * \param x The reference (starting) value x for which we want 1/sqrt(x).
3844 * \return An improved approximation with roughly twice as many bits of accuracy.
3846 static inline Simd4Double gmx_simdcall
3847 rsqrtIter(Simd4Double lu, Simd4Double x)
3849 Simd4Double tmp1 = x*lu;
3850 Simd4Double tmp2 = Simd4Double(-0.5f)*lu;
3851 tmp1 = fma(tmp1, lu, Simd4Double(-3.0f));
3855 /*! \brief Calculate 1/sqrt(x) for SIMD4 double.
3857 * \param x Argument that must be >0. This routine does not check arguments.
3858 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
3860 static inline Simd4Double gmx_simdcall
3861 invsqrt(Simd4Double x)
3863 Simd4Double lu = rsqrt(x);
3864 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
3865 lu = rsqrtIter(lu, x);
3867 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
3868 lu = rsqrtIter(lu, x);
3870 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
3871 lu = rsqrtIter(lu, x);
3873 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
3874 lu = rsqrtIter(lu, x);
3880 /**********************************************************************
3881 * SIMD4 MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
3882 **********************************************************************/
3884 /*! \brief Calculate 1/sqrt(x) for SIMD4 double, but in single accuracy.
3886 * \param x Argument that must be >0. This routine does not check arguments.
3887 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
3889 static inline Simd4Double gmx_simdcall
3890 invsqrtSingleAccuracy(Simd4Double x)
3892 Simd4Double lu = rsqrt(x);
3893 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3894 lu = rsqrtIter(lu, x);
3896 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3897 lu = rsqrtIter(lu, x);
3899 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3900 lu = rsqrtIter(lu, x);
3907 #endif // GMX_SIMD4_HAVE_DOUBLE
3911 #if GMX_SIMD_HAVE_FLOAT
3912 /*! \brief Calculate 1/sqrt(x) for SIMD float, only targeting single accuracy.
3914 * \param x Argument that must be >0. This routine does not check arguments.
3915 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
3917 static inline SimdFloat gmx_simdcall
3918 invsqrtSingleAccuracy(SimdFloat x)
3923 /*! \brief Calculate 1/sqrt(x) for masked SIMD floats, only targeting single accuracy.
3925 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
3926 * Illegal values in the masked-out elements will not lead to
3927 * floating-point exceptions.
3929 * \param x Argument that must be >0 for masked-in entries
3931 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
3932 * entry was not masked, and 0.0 for masked-out entries.
3934 static inline SimdFloat
3935 maskzInvsqrtSingleAccuracy(SimdFloat x, SimdFBool m)
3937 return maskzInvsqrt(x, m);
3940 /*! \brief Calculate 1/sqrt(x) for two SIMD floats, only targeting single accuracy.
3942 * \param x0 First set of arguments, x0 must be positive - no argument checking.
3943 * \param x1 Second set of arguments, x1 must be positive - no argument checking.
3944 * \param[out] out0 Result 1/sqrt(x0)
3945 * \param[out] out1 Result 1/sqrt(x1)
3947 * In particular for double precision we can sometimes calculate square root
3948 * pairs slightly faster by using single precision until the very last step.
3950 static inline void gmx_simdcall
3951 invsqrtPairSingleAccuracy(SimdFloat x0, SimdFloat x1,
3952 SimdFloat *out0, SimdFloat *out1)
3954 return invsqrtPair(x0, x1, out0, out1);
3957 /*! \brief Calculate 1/x for SIMD float, only targeting single accuracy.
3959 * \param x Argument that must be nonzero. This routine does not check arguments.
3960 * \return 1/x. Result is undefined if your argument was invalid.
3962 static inline SimdFloat gmx_simdcall
3963 invSingleAccuracy(SimdFloat x)
3969 /*! \brief Calculate 1/x for masked SIMD floats, only targeting single accuracy.
3971 * \param x Argument that must be nonzero for non-masked entries.
3973 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
3975 static inline SimdFloat
3976 maskzInvSingleAccuracy(SimdFloat x, SimdFBool m)
3978 return maskzInv(x, m);
3981 /*! \brief Calculate sqrt(x) for SIMD float, only targeting single accuracy.
3983 * \param x Argument that must be >=0.
3984 * \return sqrt(x). If x=0, the result will correctly be set to 0.
3985 * The result is undefined if the input value is negative.
3987 static inline SimdFloat gmx_simdcall
3988 sqrtSingleAccuracy(SimdFloat x)
3993 /*! \brief SIMD float log(x), only targeting single accuracy. This is the natural logarithm.
3995 * \param x Argument, should be >0.
3996 * \result The natural logarithm of x. Undefined if argument is invalid.
3998 static inline SimdFloat gmx_simdcall
3999 logSingleAccuracy(SimdFloat x)
4004 /*! \brief SIMD float 2^x, only targeting single accuracy.
4006 * \param x Argument.
4007 * \result 2^x. Undefined if input argument caused overflow.
4009 static inline SimdFloat gmx_simdcall
4010 exp2SingleAccuracy(SimdFloat x)
4015 /*! \brief SIMD float e^x, only targeting single accuracy.
4017 * \param x Argument.
4018 * \result exp(x). Undefined if input argument caused overflow.
4020 static inline SimdFloat gmx_simdcall
4021 expSingleAccuracy(SimdFloat x)
4026 /*! \brief SIMD float erf(x), only targeting single accuracy.
4028 * \param x The value to calculate erf(x) for.
4031 * This routine achieves very close to single precision, but we do not care about
4032 * the last bit or the subnormal result range.
4034 static inline SimdFloat gmx_simdcall
4035 erfSingleAccuracy(SimdFloat x)
4040 /*! \brief SIMD float erfc(x), only targeting single accuracy.
4042 * \param x The value to calculate erfc(x) for.
4045 * This routine achieves singleprecision (bar the last bit) over most of the
4046 * input range, but for large arguments where the result is getting close
4047 * to the minimum representable numbers we accept slightly larger errors
4048 * (think results that are in the ballpark of 10^-30) since that is not
4051 static inline SimdFloat gmx_simdcall
4052 erfcSingleAccuracy(SimdFloat x)
4057 /*! \brief SIMD float sin \& cos, only targeting single accuracy.
4059 * \param x The argument to evaluate sin/cos for
4060 * \param[out] sinval Sin(x)
4061 * \param[out] cosval Cos(x)
4063 static inline void gmx_simdcall
4064 sinCosSingleAccuracy(SimdFloat x, SimdFloat *sinval, SimdFloat *cosval)
4066 sincos(x, sinval, cosval);
4069 /*! \brief SIMD float sin(x), only targeting single accuracy.
4071 * \param x The argument to evaluate sin for
4074 * \attention Do NOT call both sin & cos if you need both results, since each of them
4075 * will then call \ref sincos and waste a factor 2 in performance.
4077 static inline SimdFloat gmx_simdcall
4078 sinSingleAccuracy(SimdFloat x)
4083 /*! \brief SIMD float cos(x), only targeting single accuracy.
4085 * \param x The argument to evaluate cos for
4088 * \attention Do NOT call both sin & cos if you need both results, since each of them
4089 * will then call \ref sincos and waste a factor 2 in performance.
4091 static inline SimdFloat gmx_simdcall
4092 cosSingleAccuracy(SimdFloat x)
4097 /*! \brief SIMD float tan(x), only targeting single accuracy.
4099 * \param x The argument to evaluate tan for
4102 static inline SimdFloat gmx_simdcall
4103 tanSingleAccuracy(SimdFloat x)
4108 /*! \brief SIMD float asin(x), only targeting single accuracy.
4110 * \param x The argument to evaluate asin for
4113 static inline SimdFloat gmx_simdcall
4114 asinSingleAccuracy(SimdFloat x)
4119 /*! \brief SIMD float acos(x), only targeting single accuracy.
4121 * \param x The argument to evaluate acos for
4124 static inline SimdFloat gmx_simdcall
4125 acosSingleAccuracy(SimdFloat x)
4130 /*! \brief SIMD float atan(x), only targeting single accuracy.
4132 * \param x The argument to evaluate atan for
4133 * \result Atan(x), same argument/value range as standard math library.
4135 static inline SimdFloat gmx_simdcall
4136 atanSingleAccuracy(SimdFloat x)
4141 /*! \brief SIMD float atan2(y,x), only targeting single accuracy.
4143 * \param y Y component of vector, any quartile
4144 * \param x X component of vector, any quartile
4145 * \result Atan(y,x), same argument/value range as standard math library.
4147 * \note This routine should provide correct results for all finite
4148 * non-zero or positive-zero arguments. However, negative zero arguments will
4149 * be treated as positive zero, which means the return value will deviate from
4150 * the standard math library atan2(y,x) for those cases. That should not be
4151 * of any concern in Gromacs, and in particular it will not affect calculations
4152 * of angles from vectors.
4154 static inline SimdFloat gmx_simdcall
4155 atan2SingleAccuracy(SimdFloat y, SimdFloat x)
4160 /*! \brief SIMD Analytic PME force correction, only targeting single accuracy.
4162 * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
4163 * \result Correction factor to coulomb force.
4165 static inline SimdFloat gmx_simdcall
4166 pmeForceCorrectionSingleAccuracy(SimdFloat z2)
4168 return pmeForceCorrection(z2);
4171 /*! \brief SIMD Analytic PME potential correction, only targeting single accuracy.
4173 * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
4174 * \result Correction factor to coulomb force.
4176 static inline SimdFloat gmx_simdcall
4177 pmePotentialCorrectionSingleAccuracy(SimdFloat z2)
4179 return pmePotentialCorrection(z2);
4181 #endif // GMX_SIMD_HAVE_FLOAT
4183 #if GMX_SIMD4_HAVE_FLOAT
4184 /*! \brief Calculate 1/sqrt(x) for SIMD4 float, only targeting single accuracy.
4186 * \param x Argument that must be >0. This routine does not check arguments.
4187 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4189 static inline Simd4Float gmx_simdcall
4190 invsqrtSingleAccuracy(Simd4Float x)
4194 #endif // GMX_SIMD4_HAVE_FLOAT
4196 /*! \} end of addtogroup module_simd */
4197 /*! \endcond end of condition libabl */
4203 #endif // GMX_SIMD_SIMD_MATH_H