2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #ifndef GMX_SIMD_SIMD_MATH_H
36 #define GMX_SIMD_SIMD_MATH_H
38 /*! \libinternal \file
40 * \brief Math functions for SIMD datatypes.
42 * \attention This file is generic for all SIMD architectures, so you cannot
43 * assume that any of the optional SIMD features (as defined in simd.h) are
44 * present. In particular, this means you cannot assume support for integers,
45 * logical operations (neither on floating-point nor integer values), shifts,
46 * and the architecture might only have SIMD for either float or double.
47 * Second, to keep this file clean and general, any additions to this file
48 * must work for all possible SIMD architectures in both single and double
49 * precision (if they support it), and you cannot make any assumptions about
52 * \author Erik Lindahl <erik.lindahl@scilifelab.se>
55 * \ingroup module_simd
64 #include "gromacs/math/utilities.h"
65 #include "gromacs/simd/simd.h"
66 #include "gromacs/utility/basedefinitions.h"
67 #include "gromacs/utility/real.h"
75 /*! \addtogroup module_simd */
78 /*! \name Implementation accuracy settings
84 # if GMX_SIMD_HAVE_FLOAT
86 /*! \name Single precision SIMD math functions
88 * \note In most cases you should use the real-precision functions instead.
92 /****************************************
93 * SINGLE PRECISION SIMD MATH FUNCTIONS *
94 ****************************************/
96 # if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_FLOAT
97 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
99 * \param x Values to set sign for
100 * \param y Values used to set sign
101 * \return Magnitude of x, sign of y
103 static inline SimdFloat gmx_simdcall copysign(SimdFloat x, SimdFloat y)
105 # if GMX_SIMD_HAVE_LOGICAL
106 return abs(x) | (SimdFloat(GMX_FLOAT_NEGZERO) & y);
108 return blend(abs(x), -abs(x), y < setZero());
113 # if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_FLOAT
114 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD float.
116 * This is a low-level routine that should only be used by SIMD math routine
117 * that evaluates the inverse square root.
119 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
120 * \param x The reference (starting) value x for which we want 1/sqrt(x).
121 * \return An improved approximation with roughly twice as many bits of accuracy.
123 static inline SimdFloat gmx_simdcall rsqrtIter(SimdFloat lu, SimdFloat x)
125 SimdFloat tmp1 = x * lu;
126 SimdFloat tmp2 = SimdFloat(-0.5F) * lu;
127 tmp1 = fma(tmp1, lu, SimdFloat(-3.0F));
132 /*! \brief Calculate 1/sqrt(x) for SIMD float.
134 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
135 * GMX_FLOAT_MAX, i.e. within the range of single precision.
136 * For the single precision implementation this is obviously always
137 * true for positive values, but for double precision it adds an
138 * extra restriction since the first lookup step might have to be
139 * performed in single precision on some architectures. Note that the
140 * responsibility for checking falls on you - this routine does not
143 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
145 static inline SimdFloat gmx_simdcall invsqrt(SimdFloat x)
147 SimdFloat lu = rsqrt(x);
148 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
149 lu = rsqrtIter(lu, x);
151 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
152 lu = rsqrtIter(lu, x);
154 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
155 lu = rsqrtIter(lu, x);
160 /*! \brief Calculate 1/sqrt(x) for two SIMD floats.
162 * \param x0 First set of arguments, x0 must be in single range (see below).
163 * \param x1 Second set of arguments, x1 must be in single range (see below).
164 * \param[out] out0 Result 1/sqrt(x0)
165 * \param[out] out1 Result 1/sqrt(x1)
167 * In particular for double precision we can sometimes calculate square root
168 * pairs slightly faster by using single precision until the very last step.
170 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
171 * GMX_FLOAT_MAX, i.e. within the range of single precision.
172 * For the single precision implementation this is obviously always
173 * true for positive values, but for double precision it adds an
174 * extra restriction since the first lookup step might have to be
175 * performed in single precision on some architectures. Note that the
176 * responsibility for checking falls on you - this routine does not
179 static inline void gmx_simdcall invsqrtPair(SimdFloat x0, SimdFloat x1, SimdFloat* out0, SimdFloat* out1)
185 # if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_FLOAT
186 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD float.
188 * This is a low-level routine that should only be used by SIMD math routine
189 * that evaluates the reciprocal.
191 * \param lu Approximation of 1/x, typically obtained from lookup.
192 * \param x The reference (starting) value x for which we want 1/x.
193 * \return An improved approximation with roughly twice as many bits of accuracy.
195 static inline SimdFloat gmx_simdcall rcpIter(SimdFloat lu, SimdFloat x)
197 return lu * fnma(lu, x, SimdFloat(2.0F));
201 /*! \brief Calculate 1/x for SIMD float.
203 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
204 * GMX_FLOAT_MAX, i.e. within the range of single precision.
205 * For the single precision implementation this is obviously always
206 * true for positive values, but for double precision it adds an
207 * extra restriction since the first lookup step might have to be
208 * performed in single precision on some architectures. Note that the
209 * responsibility for checking falls on you - this routine does not
212 * \return 1/x. Result is undefined if your argument was invalid.
214 static inline SimdFloat gmx_simdcall inv(SimdFloat x)
216 SimdFloat lu = rcp(x);
217 # if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
220 # if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
223 # if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
229 /*! \brief Division for SIMD floats
231 * \param nom Nominator
232 * \param denom Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
233 * For single precision this is equivalent to a nonzero argument,
234 * but in double precision it adds an extra restriction since
235 * the first lookup step might have to be performed in single
236 * precision on some architectures. Note that the responsibility
237 * for checking falls on you - this routine does not check arguments.
241 * \note This function does not use any masking to avoid problems with
242 * zero values in the denominator.
244 static inline SimdFloat gmx_simdcall operator/(SimdFloat nom, SimdFloat denom)
246 return nom * inv(denom);
249 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD float.
251 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
252 * Illegal values in the masked-out elements will not lead to
253 * floating-point exceptions.
255 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
256 * GMX_FLOAT_MAX for masked-in entries.
257 * See \ref invsqrt for the discussion about argument restrictions.
259 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
260 * entry was not masked, and 0.0 for masked-out entries.
262 static inline SimdFloat maskzInvsqrt(SimdFloat x, SimdFBool m)
264 SimdFloat lu = maskzRsqrt(x, m);
265 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
266 lu = rsqrtIter(lu, x);
268 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
269 lu = rsqrtIter(lu, x);
271 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
272 lu = rsqrtIter(lu, x);
277 /*! \brief Calculate 1/x for SIMD float, masked version.
279 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
280 * GMX_FLOAT_MAX for masked-in entries.
281 * See \ref invsqrt for the discussion about argument restrictions.
283 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
285 static inline SimdFloat gmx_simdcall maskzInv(SimdFloat x, SimdFBool m)
287 SimdFloat lu = maskzRcp(x, m);
288 # if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
291 # if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
294 # if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
300 /*! \brief Calculate sqrt(x) for SIMD floats
302 * \tparam opt By default, this function checks if the input value is 0.0
303 * and masks this to return the correct result. If you are certain
304 * your argument will never be zero, and you know you need to save
305 * every single cycle you can, you can alternatively call the
306 * function as sqrt<MathOptimization::Unsafe>(x).
308 * \param x Argument that must be in range 0 <=x <= GMX_FLOAT_MAX, since the
309 * lookup step often has to be implemented in single precision.
310 * Arguments smaller than GMX_FLOAT_MIN will always lead to a zero
311 * result, even in double precision. If you are using the unsafe
312 * math optimization parameter, the argument must be in the range
313 * GMX_FLOAT_MIN <= x <= GMX_FLOAT_MAX.
315 * \return sqrt(x). The result is undefined if the input value does not fall
316 * in the allowed range specified for the argument.
318 template<MathOptimization opt = MathOptimization::Safe>
319 static inline SimdFloat gmx_simdcall sqrt(SimdFloat x)
321 if (opt == MathOptimization::Safe)
323 SimdFloat res = maskzInvsqrt(x, setZero() < x);
328 return x * invsqrt(x);
332 /*! \brief Cube root for SIMD floats
334 * \param x Argument to calculate cube root of. Can be negative or zero,
335 * but NaN or Inf values are not supported. Denormal values will
337 * \return Cube root of x.
339 static inline SimdFloat gmx_simdcall cbrt(SimdFloat x)
341 const SimdFloat signBit(GMX_FLOAT_NEGZERO);
342 const SimdFloat minFloat(std::numeric_limits<float>::min());
343 // Bias is 128-1 = 127, which is not divisible by 3. Since the largest-magnitude
344 // negative exponent from frexp() is -126, we can subtract one more unit to get 126
345 // as offset, which is divisible by 3 (result 42). To avoid clang warnings about fragile integer
346 // division mixed with FP, we let the divided value (42) be the original constant.
347 const std::int32_t offsetDiv3(42);
348 const SimdFloat c2(-0.191502161678719066F);
349 const SimdFloat c1(0.697570460207922770F);
350 const SimdFloat c0(0.492659620528969547F);
351 const SimdFloat one(1.0F);
352 const SimdFloat two(2.0F);
353 const SimdFloat three(3.0F);
354 const SimdFloat oneThird(1.0F / 3.0F);
355 const SimdFloat cbrt2(1.2599210498948731648F);
356 const SimdFloat sqrCbrt2(1.5874010519681994748F);
358 // To calculate cbrt(x) we first take the absolute value of x but save the sign,
359 // since cbrt(-x) = -cbrt(x). Then we only need to consider positive values for
361 // A number x is represented in IEEE754 as fraction*2^e. We rewrite this as
362 // x=fraction*2^(3*n)*2^m, where e=3*n+m, and m is a remainder.
363 // The cube root can the be evaluated by calculating the cube root of the fraction
364 // limited to the mantissa range, multiplied by 2^mod (which is either 1, +/-2^(1/3) or
365 // +/-2^(2/3), and then we load this into a new IEEE754 fp number with the exponent 2^n, where
366 // n is the integer part of the original exponent divided by 3.
368 SimdFloat xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
369 SimdFloat xAbs = andNot(signBit, x); // select everthing but the sign bit => abs(x)
370 SimdFBool xIsNonZero = (minFloat <= xAbs); // treat denormals as 0
373 SimdFloat y = frexp(xAbs, &exponent);
374 // For the mantissa (y) we will use a limited-range approximation of cbrt(y),
375 // by first using a polynomial and then evaluating
376 // Transform y to z = c2*y^2 + c1*y + c0, then w = z^3, and finally
377 // evaluate the quotient q = z * (w + 2 * y) / (2 * w + y).
378 SimdFloat z = fma(fma(y, c2, c1), y, c0);
379 SimdFloat w = z * z * z;
380 SimdFloat nom = z * fma(two, y, w);
381 SimdFloat invDenom = inv(fma(two, w, y));
383 // Handle the exponent. In principle there are beautiful ways to do this with custom 16-bit
384 // division converted to multiplication... but we can't do that since our SIMD layer cannot
385 // assume the presence of integer shift operations!
386 // However, when I first worked with the integer algorithm I still came up with a neat
387 // optimization, so I'll describe the full algorithm here in case we ever want to use it
390 // Our dividend is signed, which is a complication, but let's consider the unsigned case
391 // first: Division by 3 corresponds to multiplication by 1010101... Since we also know
392 // our dividend is less than 16 bits (exponent range) we can accomplish this by
393 // multiplying with 21845 (which is almost 2^16/3 - 21845.333 would be exact) and then
394 // right-shifting by 16 bits to divide out the 2^16 part.
395 // If we add 1 to the dividend to handle the extra 0.333, the integer result will be correct.
396 // To handle the signed exponent one alternative would be to take absolute values, saving
397 // signs, etc - but that gets a bit complicated with 2-complement integers.
398 // Instead, we remember that we don't really want the exact division per se - what we're
399 // really after is only rewriting e = 3*n+m. That will actually be *easier* to handle if
400 // we require that m must be positive (fewer cases to handle) instead of having n as the
402 // To handle this we start by adding 127 to the exponent. This value corresponds to the
403 // exponent bias, minus 1 because frexp() has a different standard for the value it returns,
404 // but then we add 1 back to handle the extra 0.333 in 21845. So, we have offsetExp = e+127
405 // and then multiply by 21845 to get a division result offsetExpDiv3.
406 // A (signed) value for n is then recovered by subtracting 42 (bias-1)/3 from k.
407 // To calculate a strict remainder we should evaluate offsetExp - 3*offsetExpDiv3 - 1, where
408 // the extra 1 corrects for the value we added to the exponent to get correct division.
409 // This remainder would have the value 0,1, or 2, but since we only use it to select
410 // other numbers we can skip the last step and just handle the cases as 1,2 or 3 instead.
412 // OK; end of long detour. Here's how we actually do it in our implementation by using
413 // floating-point for the exponent instead to avoid needing integer shifts:
415 // 1) Convert the exponent (obtained from frexp) to a float
416 // 2) Calculate offsetExp = exp + offset. Note that we should not add the extra 1 here since we
417 // do floating-point division instead of our integer hack, so it's the exponent bias-1, or
418 // the largest exponent minus 2.
419 // 3) Divide the float by 3 by multiplying with 1/3
420 // 4) Truncate it to an integer to get the division result. This is potentially dangerous in
421 // combination with floating-point, because many integers cannot be represented exactly in
422 // floating point, and if we are just epsilon below the result might be truncated to a lower
423 // integer. I have not observed this on x86, but to have a safety margin we can add a small
424 // fraction - since we already know the fraction part should be either 0, 0.333..., or 0.666...
425 // We can even save this extra floating-point addition by adding a small fraction (0.1) when
426 // we introduce the exponent offset - that will correspond to a safety margin of 0.1/3, which is plenty.
427 // 5) Get the remainder part by subtracting the truncated floating-point part.
428 // Here too we will have a plain division, so the remainder is a strict modulus
429 // and will have the values 0, 1 or 2.
431 // Before worrying about the few wasted cycles due to longer fp latency, this has the
432 // additional advantage that we don't use a single integer operation, so the algorithm
433 // will work just A-OK on all SIMD implementations, which avoids diverging code paths.
435 // The 0.1 here is the safety margin due to truncation described in item 4 in the comments above.
436 SimdFloat offsetExp = cvtI2R(exponent) + SimdFloat(static_cast<float>(3 * offsetDiv3) + 0.1);
438 SimdFloat offsetExpDiv3 =
439 trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
441 SimdFInt32 expDiv3 = cvtR2I(offsetExpDiv3 - SimdFloat(static_cast<float>(offsetDiv3)));
443 SimdFloat remainder = offsetExp - offsetExpDiv3 * three;
445 // If remainder is 0 we should just have the factor 1.0,
446 // so first pick 1.0 if it is below 0.5, and 2^(1/3) if it's above 0.5 (i.e., 1 or 2)
447 SimdFloat factor = blend(one, cbrt2, SimdFloat(0.5) < remainder);
448 // Second, we overwrite with 2^(2/3) if rem>1.5 (i.e., 2)
449 factor = blend(factor, sqrCbrt2, SimdFloat(1.5) < remainder);
451 // Assemble the non-signed fraction, and add the sign back by xor
452 SimdFloat fraction = (nom * invDenom * factor) ^ xSignBit;
453 // Load to IEEE754 number, and set result to 0.0 if x was 0.0 or denormal
454 SimdFloat result = selectByMask(ldexp(fraction, expDiv3), xIsNonZero);
459 /*! \brief Inverse cube root for SIMD floats
461 * \param x Argument to calculate cube root of. Can be positive or
462 * negative, but the magnitude cannot be lower than
463 * the smallest normal number.
464 * \return Cube root of x. Undefined for values that don't
465 * fulfill the restriction of abs(x) > minFloat.
467 static inline SimdFloat gmx_simdcall invcbrt(SimdFloat x)
469 const SimdFloat signBit(GMX_FLOAT_NEGZERO);
470 const SimdFloat minFloat(std::numeric_limits<float>::min());
471 // Bias is 128-1 = 127, which is not divisible by 3. Since the largest-magnitude
472 // negative exponent from frexp() is -126, we can subtract one more unit to get 126
473 // as offset, which is divisible by 3 (result 42). To avoid clang warnings about fragile integer
474 // division mixed with FP, we let the divided value (42) be the original constant.
475 const std::int32_t offsetDiv3(42);
476 const SimdFloat c2(-0.191502161678719066F);
477 const SimdFloat c1(0.697570460207922770F);
478 const SimdFloat c0(0.492659620528969547F);
479 const SimdFloat one(1.0F);
480 const SimdFloat two(2.0F);
481 const SimdFloat three(3.0F);
482 const SimdFloat oneThird(1.0F / 3.0F);
483 const SimdFloat invCbrt2(1.0F / 1.2599210498948731648F);
484 const SimdFloat invSqrCbrt2(1.0F / 1.5874010519681994748F);
486 // We use pretty much exactly the same implementation as for cbrt(x),
487 // but to compute the inverse we swap the nominator/denominator
488 // in the quotient, and also swap the sign of the exponent parts.
490 SimdFloat xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
491 SimdFloat xAbs = andNot(signBit, x); // select everthing but the sign bit => abs(x)
494 SimdFloat y = frexp(xAbs, &exponent);
495 // For the mantissa (y) we will use a limited-range approximation of cbrt(y),
496 // by first using a polynomial and then evaluating
497 // Transform y to z = c2*y^2 + c1*y + c0, then w = z^3, and finally
498 // evaluate the quotient q = z * (w + 2 * y) / (2 * w + y).
499 SimdFloat z = fma(fma(y, c2, c1), y, c0);
500 SimdFloat w = z * z * z;
501 SimdFloat nom = fma(two, w, y);
502 SimdFloat invDenom = inv(z * fma(two, y, w));
504 // The 0.1 here is the safety margin due to truncation described in item 4 in the comments above.
505 SimdFloat offsetExp = cvtI2R(exponent) + SimdFloat(static_cast<float>(3 * offsetDiv3) + 0.1);
506 SimdFloat offsetExpDiv3 =
507 trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
509 // We should swap the sign here, so we change order of the terms in the subtraction
510 SimdFInt32 expDiv3 = cvtR2I(SimdFloat(static_cast<float>(offsetDiv3)) - offsetExpDiv3);
512 // Swap sign here too, so remainder is either 0, -1 or -2
513 SimdFloat remainder = offsetExpDiv3 * three - offsetExp;
515 // If remainder is 0 we should just have the factor 1.0,
516 // so first pick 1.0 if it is above -0.5, and 2^(-1/3) if it's below -0.5 (i.e., -1 or -2)
517 SimdFloat factor = blend(one, invCbrt2, remainder < SimdFloat(-0.5));
518 // Second, we overwrite with 2^(-2/3) if rem<-1.5 (i.e., -2)
519 factor = blend(factor, invSqrCbrt2, remainder < SimdFloat(-1.5));
521 // Assemble the non-signed fraction, and add the sign back by xor
522 SimdFloat fraction = (nom * invDenom * factor) ^ xSignBit;
523 // Load to IEEE754 number, and set result to 0.0 if x was 0.0 or denormal
524 SimdFloat result = ldexp(fraction, expDiv3);
529 /*! \brief SIMD float log2(x). This is the base-2 logarithm.
531 * \param x Argument, should be >0.
532 * \result The base-2 logarithm of x. Undefined if argument is invalid.
534 static inline SimdFloat gmx_simdcall log2(SimdFloat x)
536 // This implementation computes log2 by
537 // 1) Extracting the exponent and adding it to...
538 // 2) A 9th-order minimax approximation using only odd
539 // terms of (x-1)/(x+1), where x is the mantissa.
541 # if GMX_SIMD_HAVE_NATIVE_LOG_FLOAT
542 // Just rescale if native log2() is not present, but log() is.
543 return log(x) * SimdFloat(std::log2(std::exp(1.0)));
545 const SimdFloat one(1.0F);
546 const SimdFloat two(2.0F);
547 const SimdFloat invsqrt2(1.0F / std::sqrt(2.0F));
548 const SimdFloat CL9(0.342149508897807708152F);
549 const SimdFloat CL7(0.411570606888219447939F);
550 const SimdFloat CL5(0.577085979152320294183F);
551 const SimdFloat CL3(0.961796550607099898222F);
552 const SimdFloat CL1(2.885390081777926774009F);
553 SimdFloat fExp, x2, p;
561 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
562 fExp = fExp - selectByMask(one, m);
563 x = x * blend(one, two, m);
565 x = (x - one) * inv(x + one);
568 p = fma(CL9, x2, CL7);
578 # if !GMX_SIMD_HAVE_NATIVE_LOG_FLOAT
579 /*! \brief SIMD float log(x). This is the natural logarithm.
581 * \param x Argument, should be >0.
582 * \result The natural logarithm of x. Undefined if argument is invalid.
584 static inline SimdFloat gmx_simdcall log(SimdFloat x)
586 const SimdFloat one(1.0F);
587 const SimdFloat two(2.0F);
588 const SimdFloat invsqrt2(1.0F / std::sqrt(2.0F));
589 const SimdFloat corr(0.693147180559945286226764F);
590 const SimdFloat CL9(0.2371599674224853515625F);
591 const SimdFloat CL7(0.285279005765914916992188F);
592 const SimdFloat CL5(0.400005519390106201171875F);
593 const SimdFloat CL3(0.666666567325592041015625F);
594 const SimdFloat CL1(2.0F);
595 SimdFloat fExp, x2, p;
603 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
604 fExp = fExp - selectByMask(one, m);
605 x = x * blend(one, two, m);
607 x = (x - one) * inv(x + one);
610 p = fma(CL9, x2, CL7);
614 p = fma(p, x, corr * fExp);
620 # if !GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT
621 /*! \brief SIMD float 2^x
623 * \tparam opt If this is changed from the default (safe) into the unsafe
624 * option, input values that would otherwise lead to zero-clamped
625 * results are not allowed and will lead to undefined results.
627 * \param x Argument. For the default (safe) function version this can be
628 * arbitrarily small value, but the routine might clamp the result to
629 * zero for arguments that would produce subnormal IEEE754-2008 results.
630 * This corresponds to inputs below -126 in single or -1022 in double,
631 * and it might overflow for arguments reaching 127 (single) or
632 * 1023 (double). If you enable the unsafe math optimization,
633 * very small arguments will not necessarily be zero-clamped, but
634 * can produce undefined results.
636 * \result 2^x. The result is undefined for very large arguments that cause
637 * internal floating-point overflow. If unsafe optimizations are enabled,
638 * this is also true for very small values.
640 * \note The definition range of this function is just-so-slightly smaller
641 * than the allowed IEEE exponents for many architectures. This is due
642 * to the implementation, which will hopefully improve in the future.
644 * \warning You cannot rely on this implementation returning inf for arguments
645 * that cause overflow. If you have some very large
646 * values and need to rely on getting a valid numerical output,
647 * take the minimum of your variable and the largest valid argument
648 * before calling this routine.
650 template<MathOptimization opt = MathOptimization::Safe>
651 static inline SimdFloat gmx_simdcall exp2(SimdFloat x)
653 const SimdFloat CC6(0.0001534581200287996416911311F);
654 const SimdFloat CC5(0.001339993121934088894618990F);
655 const SimdFloat CC4(0.009618488957115180159497841F);
656 const SimdFloat CC3(0.05550328776964726865751735F);
657 const SimdFloat CC2(0.2402264689063408646490722F);
658 const SimdFloat CC1(0.6931472057372680777553816F);
659 const SimdFloat one(1.0F);
665 // Large negative values are valid arguments to exp2(), so there are two
666 // things we need to account for:
667 // 1. When the exponents reaches -127, the (biased) exponent field will be
668 // zero and we can no longer multiply with it. There are special IEEE
669 // formats to handle this range, but for now we have to accept that
670 // we cannot handle those arguments. If input value becomes even more
671 // negative, it will start to loop and we would end up with invalid
672 // exponents. Thus, we need to limit or mask this.
673 // 2. For VERY large negative values, we will have problems that the
674 // subtraction to get the fractional part loses accuracy, and then we
675 // can end up with overflows in the polynomial.
677 // For now, we handle this by forwarding the math optimization setting to
678 // ldexp, where the routine will return zero for very small arguments.
680 // However, before doing that we need to make sure we do not call cvtR2I
681 // with an argument that is so negative it cannot be converted to an integer.
682 if (opt == MathOptimization::Safe)
684 x = max(x, SimdFloat(std::numeric_limits<std::int32_t>::lowest()));
687 fexppart = ldexp<opt>(one, cvtR2I(x));
691 p = fma(CC6, x, CC5);
702 # if !GMX_SIMD_HAVE_NATIVE_EXP_FLOAT
703 /*! \brief SIMD float exp(x).
705 * In addition to scaling the argument for 2^x this routine correctly does
706 * extended precision arithmetics to improve accuracy.
708 * \tparam opt If this is changed from the default (safe) into the unsafe
709 * option, input values that would otherwise lead to zero-clamped
710 * results are not allowed and will lead to undefined results.
712 * \param x Argument. For the default (safe) function version this can be
713 * arbitrarily small value, but the routine might clamp the result to
714 * zero for arguments that would produce subnormal IEEE754-2008 results.
715 * This corresponds to input arguments reaching
716 * -126*ln(2)=-87.3 in single, or -1022*ln(2)=-708.4 (double).
717 * Similarly, it might overflow for arguments reaching
718 * 127*ln(2)=88.0 (single) or 1023*ln(2)=709.1 (double). If the
719 * unsafe math optimizations are enabled, small input values that would
720 * result in zero-clamped output are not allowed.
722 * \result exp(x). Overflowing arguments are likely to either return 0 or inf,
723 * depending on the underlying implementation. If unsafe optimizations
724 * are enabled, this is also true for very small values.
726 * \note The definition range of this function is just-so-slightly smaller
727 * than the allowed IEEE exponents for many architectures. This is due
728 * to the implementation, which will hopefully improve in the future.
730 * \warning You cannot rely on this implementation returning inf for arguments
731 * that cause overflow. If you have some very large
732 * values and need to rely on getting a valid numerical output,
733 * take the minimum of your variable and the largest valid argument
734 * before calling this routine.
736 template<MathOptimization opt = MathOptimization::Safe>
737 static inline SimdFloat gmx_simdcall exp(SimdFloat x)
739 const SimdFloat argscale(1.44269504088896341F);
740 const SimdFloat invargscale0(-0.693145751953125F);
741 const SimdFloat invargscale1(-1.428606765330187045e-06F);
742 const SimdFloat CC4(0.00136324646882712841033936F);
743 const SimdFloat CC3(0.00836596917361021041870117F);
744 const SimdFloat CC2(0.0416710823774337768554688F);
745 const SimdFloat CC1(0.166665524244308471679688F);
746 const SimdFloat CC0(0.499999850988388061523438F);
747 const SimdFloat one(1.0F);
752 // Large negative values are valid arguments to exp2(), so there are two
753 // things we need to account for:
754 // 1. When the exponents reaches -127, the (biased) exponent field will be
755 // zero and we can no longer multiply with it. There are special IEEE
756 // formats to handle this range, but for now we have to accept that
757 // we cannot handle those arguments. If input value becomes even more
758 // negative, it will start to loop and we would end up with invalid
759 // exponents. Thus, we need to limit or mask this.
760 // 2. For VERY large negative values, we will have problems that the
761 // subtraction to get the fractional part loses accuracy, and then we
762 // can end up with overflows in the polynomial.
764 // For now, we handle this by forwarding the math optimization setting to
765 // ldexp, where the routine will return zero for very small arguments.
767 // However, before doing that we need to make sure we do not call cvtR2I
768 // with an argument that is so negative it cannot be converted to an integer
769 // after being multiplied by argscale.
771 if (opt == MathOptimization::Safe)
773 x = max(x, SimdFloat(std::numeric_limits<std::int32_t>::lowest()) / argscale);
779 fexppart = ldexp<opt>(one, cvtR2I(y));
782 // Extended precision arithmetics
783 x = fma(invargscale0, intpart, x);
784 x = fma(invargscale1, intpart, x);
786 p = fma(CC4, x, CC3);
790 p = fma(x * x, p, x);
791 # if GMX_SIMD_HAVE_FMA
792 x = fma(p, fexppart, fexppart);
794 x = (p + one) * fexppart;
800 /*! \brief SIMD float pow(x,y)
802 * This returns x^y for SIMD values.
804 * \tparam opt If this is changed from the default (safe) into the unsafe
805 * option, there are no guarantees about correct results for x==0.
811 * \result x^y. Overflowing arguments are likely to either return 0 or inf,
812 * depending on the underlying implementation. If unsafe optimizations
813 * are enabled, this is also true for x==0.
815 * \warning You cannot rely on this implementation returning inf for arguments
816 * that cause overflow. If you have some very large
817 * values and need to rely on getting a valid numerical output,
818 * take the minimum of your variable and the largest valid argument
819 * before calling this routine.
821 template<MathOptimization opt = MathOptimization::Safe>
822 static inline SimdFloat gmx_simdcall pow(SimdFloat x, SimdFloat y)
826 if (opt == MathOptimization::Safe)
828 xcorr = max(x, SimdFloat(std::numeric_limits<float>::min()));
835 SimdFloat result = exp2<opt>(y * log2(xcorr));
837 if (opt == MathOptimization::Safe)
839 // if x==0 and y>0 we explicitly set the result to 0.0
840 // For any x with y==0, the result will already be 1.0 since we multiply by y (0.0) and call exp().
841 result = blend(result, setZero(), x == setZero() && setZero() < y);
848 /*! \brief SIMD float erf(x).
850 * \param x The value to calculate erf(x) for.
853 * This routine achieves very close to full precision, but we do not care about
854 * the last bit or the subnormal result range.
856 static inline SimdFloat gmx_simdcall erf(SimdFloat x)
858 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
859 const SimdFloat CA6(7.853861353153693e-5F);
860 const SimdFloat CA5(-8.010193625184903e-4F);
861 const SimdFloat CA4(5.188327685732524e-3F);
862 const SimdFloat CA3(-2.685381193529856e-2F);
863 const SimdFloat CA2(1.128358514861418e-1F);
864 const SimdFloat CA1(-3.761262582423300e-1F);
865 const SimdFloat CA0(1.128379165726710F);
866 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
867 const SimdFloat CB9(-0.0018629930017603923F);
868 const SimdFloat CB8(0.003909821287598495F);
869 const SimdFloat CB7(-0.0052094582210355615F);
870 const SimdFloat CB6(0.005685614362160572F);
871 const SimdFloat CB5(-0.0025367682853477272F);
872 const SimdFloat CB4(-0.010199799682318782F);
873 const SimdFloat CB3(0.04369575504816542F);
874 const SimdFloat CB2(-0.11884063474674492F);
875 const SimdFloat CB1(0.2732120154030589F);
876 const SimdFloat CB0(0.42758357702025784F);
877 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
878 const SimdFloat CC10(-0.0445555913112064F);
879 const SimdFloat CC9(0.21376355144663348F);
880 const SimdFloat CC8(-0.3473187200259257F);
881 const SimdFloat CC7(0.016690861551248114F);
882 const SimdFloat CC6(0.7560973182491192F);
883 const SimdFloat CC5(-1.2137903600145787F);
884 const SimdFloat CC4(0.8411872321232948F);
885 const SimdFloat CC3(-0.08670413896296343F);
886 const SimdFloat CC2(-0.27124782687240334F);
887 const SimdFloat CC1(-0.0007502488047806069F);
888 const SimdFloat CC0(0.5642114853803148F);
889 const SimdFloat one(1.0F);
890 const SimdFloat two(2.0F);
893 SimdFloat t, t2, w, w2;
894 SimdFloat pA0, pA1, pB0, pB1, pC0, pC1;
896 SimdFloat res_erf, res_erfc, res;
897 SimdFBool m, maskErf;
903 pA0 = fma(CA6, x4, CA4);
904 pA1 = fma(CA5, x4, CA3);
905 pA0 = fma(pA0, x4, CA2);
906 pA1 = fma(pA1, x4, CA1);
908 pA0 = fma(pA1, x2, pA0);
909 // Constant term must come last for precision reasons
916 maskErf = SimdFloat(0.75F) <= y;
917 t = maskzInv(y, maskErf);
922 // No need for a floating-point sieve here (as in erfc), since erf()
923 // will never return values that are extremely small for large args.
924 expmx2 = exp(-y * y);
926 pB1 = fma(CB9, w2, CB7);
927 pB0 = fma(CB8, w2, CB6);
928 pB1 = fma(pB1, w2, CB5);
929 pB0 = fma(pB0, w2, CB4);
930 pB1 = fma(pB1, w2, CB3);
931 pB0 = fma(pB0, w2, CB2);
932 pB1 = fma(pB1, w2, CB1);
933 pB0 = fma(pB0, w2, CB0);
934 pB0 = fma(pB1, w, pB0);
936 pC0 = fma(CC10, t2, CC8);
937 pC1 = fma(CC9, t2, CC7);
938 pC0 = fma(pC0, t2, CC6);
939 pC1 = fma(pC1, t2, CC5);
940 pC0 = fma(pC0, t2, CC4);
941 pC1 = fma(pC1, t2, CC3);
942 pC0 = fma(pC0, t2, CC2);
943 pC1 = fma(pC1, t2, CC1);
945 pC0 = fma(pC0, t2, CC0);
946 pC0 = fma(pC1, t, pC0);
949 // Select pB0 or pC0 for erfc()
951 res_erfc = blend(pB0, pC0, m);
952 res_erfc = res_erfc * expmx2;
954 // erfc(x<0) = 2-erfc(|x|)
956 res_erfc = blend(res_erfc, two - res_erfc, m);
958 // Select erf() or erfc()
959 res = blend(res_erf, one - res_erfc, maskErf);
964 /*! \brief SIMD float erfc(x).
966 * \param x The value to calculate erfc(x) for.
969 * This routine achieves full precision (bar the last bit) over most of the
970 * input range, but for large arguments where the result is getting close
971 * to the minimum representable numbers we accept slightly larger errors
972 * (think results that are in the ballpark of 10^-30 for single precision)
973 * since that is not relevant for MD.
975 static inline SimdFloat gmx_simdcall erfc(SimdFloat x)
977 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
978 const SimdFloat CA6(7.853861353153693e-5F);
979 const SimdFloat CA5(-8.010193625184903e-4F);
980 const SimdFloat CA4(5.188327685732524e-3F);
981 const SimdFloat CA3(-2.685381193529856e-2F);
982 const SimdFloat CA2(1.128358514861418e-1F);
983 const SimdFloat CA1(-3.761262582423300e-1F);
984 const SimdFloat CA0(1.128379165726710F);
985 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
986 const SimdFloat CB9(-0.0018629930017603923F);
987 const SimdFloat CB8(0.003909821287598495F);
988 const SimdFloat CB7(-0.0052094582210355615F);
989 const SimdFloat CB6(0.005685614362160572F);
990 const SimdFloat CB5(-0.0025367682853477272F);
991 const SimdFloat CB4(-0.010199799682318782F);
992 const SimdFloat CB3(0.04369575504816542F);
993 const SimdFloat CB2(-0.11884063474674492F);
994 const SimdFloat CB1(0.2732120154030589F);
995 const SimdFloat CB0(0.42758357702025784F);
996 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
997 const SimdFloat CC10(-0.0445555913112064F);
998 const SimdFloat CC9(0.21376355144663348F);
999 const SimdFloat CC8(-0.3473187200259257F);
1000 const SimdFloat CC7(0.016690861551248114F);
1001 const SimdFloat CC6(0.7560973182491192F);
1002 const SimdFloat CC5(-1.2137903600145787F);
1003 const SimdFloat CC4(0.8411872321232948F);
1004 const SimdFloat CC3(-0.08670413896296343F);
1005 const SimdFloat CC2(-0.27124782687240334F);
1006 const SimdFloat CC1(-0.0007502488047806069F);
1007 const SimdFloat CC0(0.5642114853803148F);
1008 // Coefficients for expansion of exp(x) in [0,0.1]
1009 // CD0 and CD1 are both 1.0, so no need to declare them separately
1010 const SimdFloat CD2(0.5000066608081202F);
1011 const SimdFloat CD3(0.1664795422874624F);
1012 const SimdFloat CD4(0.04379839977652482F);
1013 const SimdFloat one(1.0F);
1014 const SimdFloat two(2.0F);
1016 /* We need to use a small trick here, since we cannot assume all SIMD
1017 * architectures support integers, and the flag we want (0xfffff000) would
1018 * evaluate to NaN (i.e., it cannot be expressed as a floating-point num).
1019 * Instead, we represent the flags 0xf0f0f000 and 0x0f0f0000 as valid
1020 * fp numbers, and perform a logical or. Since the expression is constant,
1021 * we can at least hope it is evaluated at compile-time.
1023 # if GMX_SIMD_HAVE_LOGICAL
1024 const SimdFloat sieve(SimdFloat(-5.965323564e+29F) | SimdFloat(7.05044434e-30F));
1026 const int isieve = 0xFFFFF000;
1027 alignas(GMX_SIMD_ALIGNMENT) float mem[GMX_SIMD_FLOAT_WIDTH];
1036 SimdFloat x2, x4, y;
1037 SimdFloat q, z, t, t2, w, w2;
1038 SimdFloat pA0, pA1, pB0, pB1, pC0, pC1;
1039 SimdFloat expmx2, corr;
1040 SimdFloat res_erf, res_erfc, res;
1041 SimdFBool m, msk_erf;
1047 pA0 = fma(CA6, x4, CA4);
1048 pA1 = fma(CA5, x4, CA3);
1049 pA0 = fma(pA0, x4, CA2);
1050 pA1 = fma(pA1, x4, CA1);
1052 pA0 = fma(pA0, x4, pA1);
1053 // Constant term must come last for precision reasons
1060 msk_erf = SimdFloat(0.75F) <= y;
1061 t = maskzInv(y, msk_erf);
1066 * We cannot simply calculate exp(-y2) directly in single precision, since
1067 * that will lose a couple of bits of precision due to the multiplication.
1068 * Instead, we introduce y=z+w, where the last 12 bits of precision are in w.
1069 * Then we get exp(-y2) = exp(-z2)*exp((z-y)*(z+y)).
1071 * The only drawback with this is that it requires TWO separate exponential
1072 * evaluations, which would be horrible performance-wise. However, the argument
1073 * for the second exp() call is always small, so there we simply use a
1074 * low-order minimax expansion on [0,0.1].
1076 * However, this neat idea requires support for logical ops (and) on
1077 * FP numbers, which some vendors decided isn't necessary in their SIMD
1078 * instruction sets (Hi, IBM VSX!). In principle we could use some tricks
1079 * in double, but we still need memory as a backup when that is not available,
1080 * and this case is rare enough that we go directly there...
1082 # if GMX_SIMD_HAVE_LOGICAL
1086 for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1089 conv.i = conv.i & isieve;
1092 z = load<SimdFloat>(mem);
1094 q = (z - y) * (z + y);
1095 corr = fma(CD4, q, CD3);
1096 corr = fma(corr, q, CD2);
1097 corr = fma(corr, q, one);
1098 corr = fma(corr, q, one);
1100 expmx2 = exp(-z * z);
1101 expmx2 = expmx2 * corr;
1103 pB1 = fma(CB9, w2, CB7);
1104 pB0 = fma(CB8, w2, CB6);
1105 pB1 = fma(pB1, w2, CB5);
1106 pB0 = fma(pB0, w2, CB4);
1107 pB1 = fma(pB1, w2, CB3);
1108 pB0 = fma(pB0, w2, CB2);
1109 pB1 = fma(pB1, w2, CB1);
1110 pB0 = fma(pB0, w2, CB0);
1111 pB0 = fma(pB1, w, pB0);
1113 pC0 = fma(CC10, t2, CC8);
1114 pC1 = fma(CC9, t2, CC7);
1115 pC0 = fma(pC0, t2, CC6);
1116 pC1 = fma(pC1, t2, CC5);
1117 pC0 = fma(pC0, t2, CC4);
1118 pC1 = fma(pC1, t2, CC3);
1119 pC0 = fma(pC0, t2, CC2);
1120 pC1 = fma(pC1, t2, CC1);
1122 pC0 = fma(pC0, t2, CC0);
1123 pC0 = fma(pC1, t, pC0);
1126 // Select pB0 or pC0 for erfc()
1128 res_erfc = blend(pB0, pC0, m);
1129 res_erfc = res_erfc * expmx2;
1131 // erfc(x<0) = 2-erfc(|x|)
1133 res_erfc = blend(res_erfc, two - res_erfc, m);
1135 // Select erf() or erfc()
1136 res = blend(one - res_erf, res_erfc, msk_erf);
1141 /*! \brief SIMD float sin \& cos.
1143 * \param x The argument to evaluate sin/cos for
1144 * \param[out] sinval Sin(x)
1145 * \param[out] cosval Cos(x)
1147 * This version achieves close to machine precision, but for very large
1148 * magnitudes of the argument we inherently begin to lose accuracy due to the
1149 * argument reduction, despite using extended precision arithmetics internally.
1151 static inline void gmx_simdcall sincos(SimdFloat x, SimdFloat* sinval, SimdFloat* cosval)
1153 // Constants to subtract Pi/4*x from y while minimizing precision loss
1154 const SimdFloat argred0(-1.5703125);
1155 const SimdFloat argred1(-4.83751296997070312500e-04F);
1156 const SimdFloat argred2(-7.54953362047672271729e-08F);
1157 const SimdFloat argred3(-2.56334406825708960298e-12F);
1158 const SimdFloat two_over_pi(static_cast<float>(2.0F / M_PI));
1159 const SimdFloat const_sin2(-1.9515295891e-4F);
1160 const SimdFloat const_sin1(8.3321608736e-3F);
1161 const SimdFloat const_sin0(-1.6666654611e-1F);
1162 const SimdFloat const_cos2(2.443315711809948e-5F);
1163 const SimdFloat const_cos1(-1.388731625493765e-3F);
1164 const SimdFloat const_cos0(4.166664568298827e-2F);
1165 const SimdFloat half(0.5F);
1166 const SimdFloat one(1.0F);
1167 SimdFloat ssign, csign;
1168 SimdFloat x2, y, z, psin, pcos, sss, ccc;
1171 # if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
1172 const SimdFInt32 ione(1);
1173 const SimdFInt32 itwo(2);
1176 z = x * two_over_pi;
1180 m = cvtIB2B((iy & ione) == SimdFInt32(0));
1181 ssign = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), cvtIB2B((iy & itwo) == itwo));
1182 csign = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
1184 const SimdFloat quarter(0.25f);
1185 const SimdFloat minusquarter(-0.25f);
1187 SimdFBool m1, m2, m3;
1189 /* The most obvious way to find the arguments quadrant in the unit circle
1190 * to calculate the sign is to use integer arithmetic, but that is not
1191 * present in all SIMD implementations. As an alternative, we have devised a
1192 * pure floating-point algorithm that uses truncation for argument reduction
1193 * so that we get a new value 0<=q<1 over the unit circle, and then
1194 * do floating-point comparisons with fractions. This is likely to be
1195 * slightly slower (~10%) due to the longer latencies of floating-point, so
1196 * we only use it when integer SIMD arithmetic is not present.
1200 // It is critical that half-way cases are rounded down
1201 z = fma(x, two_over_pi, half);
1205 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
1206 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
1207 * This removes the 2*Pi periodicity without using any integer arithmetic.
1208 * First check if y had the value 2 or 3, set csign if true.
1211 /* If we have logical operations we can work directly on the signbit, which
1212 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
1213 * Thus, if you are altering defines to debug alternative code paths, the
1214 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
1215 * active or inactive - you will get errors if only one is used.
1217 # if GMX_SIMD_HAVE_LOGICAL
1218 ssign = ssign & SimdFloat(GMX_FLOAT_NEGZERO);
1219 csign = andNot(q, SimdFloat(GMX_FLOAT_NEGZERO));
1220 ssign = ssign ^ csign;
1222 ssign = copysign(SimdFloat(1.0f), ssign);
1223 csign = copysign(SimdFloat(1.0f), q);
1225 ssign = ssign * csign; // swap ssign if csign was set.
1227 // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
1228 m1 = (q < minusquarter);
1229 m2 = (setZero() <= q);
1233 // where mask is FALSE, swap sign.
1234 csign = csign * blend(SimdFloat(-1.0f), one, m);
1236 x = fma(y, argred0, x);
1237 x = fma(y, argred1, x);
1238 x = fma(y, argred2, x);
1239 x = fma(y, argred3, x);
1242 psin = fma(const_sin2, x2, const_sin1);
1243 psin = fma(psin, x2, const_sin0);
1244 psin = fma(psin, x * x2, x);
1245 pcos = fma(const_cos2, x2, const_cos1);
1246 pcos = fma(pcos, x2, const_cos0);
1247 pcos = fms(pcos, x2, half);
1248 pcos = fma(pcos, x2, one);
1250 sss = blend(pcos, psin, m);
1251 ccc = blend(psin, pcos, m);
1252 // See comment for GMX_SIMD_HAVE_LOGICAL section above.
1253 # if GMX_SIMD_HAVE_LOGICAL
1254 *sinval = sss ^ ssign;
1255 *cosval = ccc ^ csign;
1257 *sinval = sss * ssign;
1258 *cosval = ccc * csign;
1262 /*! \brief SIMD float sin(x).
1264 * \param x The argument to evaluate sin for
1267 * \attention Do NOT call both sin & cos if you need both results, since each of them
1268 * will then call \ref sincos and waste a factor 2 in performance.
1270 static inline SimdFloat gmx_simdcall sin(SimdFloat x)
1277 /*! \brief SIMD float cos(x).
1279 * \param x The argument to evaluate cos for
1282 * \attention Do NOT call both sin & cos if you need both results, since each of them
1283 * will then call \ref sincos and waste a factor 2 in performance.
1285 static inline SimdFloat gmx_simdcall cos(SimdFloat x)
1292 /*! \brief SIMD float tan(x).
1294 * \param x The argument to evaluate tan for
1297 static inline SimdFloat gmx_simdcall tan(SimdFloat x)
1299 const SimdFloat argred0(-1.5703125);
1300 const SimdFloat argred1(-4.83751296997070312500e-04F);
1301 const SimdFloat argred2(-7.54953362047672271729e-08F);
1302 const SimdFloat argred3(-2.56334406825708960298e-12F);
1303 const SimdFloat two_over_pi(static_cast<float>(2.0F / M_PI));
1304 const SimdFloat CT6(0.009498288995810566122993911);
1305 const SimdFloat CT5(0.002895755790837379295226923);
1306 const SimdFloat CT4(0.02460087336161924491836265);
1307 const SimdFloat CT3(0.05334912882656359828045988);
1308 const SimdFloat CT2(0.1333989091464957704418495);
1309 const SimdFloat CT1(0.3333307599244198227797507);
1311 SimdFloat x2, p, y, z;
1314 # if GMX_SIMD_HAVE_FINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
1318 z = x * two_over_pi;
1321 m = cvtIB2B((iy & ione) == ione);
1323 x = fma(y, argred0, x);
1324 x = fma(y, argred1, x);
1325 x = fma(y, argred2, x);
1326 x = fma(y, argred3, x);
1327 x = selectByMask(SimdFloat(GMX_FLOAT_NEGZERO), m) ^ x;
1329 const SimdFloat quarter(0.25f);
1330 const SimdFloat half(0.5f);
1331 const SimdFloat threequarter(0.75f);
1333 SimdFBool m1, m2, m3;
1336 z = fma(w, two_over_pi, half);
1342 m3 = threequarter <= q;
1345 w = fma(y, argred0, w);
1346 w = fma(y, argred1, w);
1347 w = fma(y, argred2, w);
1348 w = fma(y, argred3, w);
1349 w = blend(w, -w, m);
1350 x = w * copysign(SimdFloat(1.0), x);
1353 p = fma(CT6, x2, CT5);
1354 p = fma(p, x2, CT4);
1355 p = fma(p, x2, CT3);
1356 p = fma(p, x2, CT2);
1357 p = fma(p, x2, CT1);
1358 p = fma(x2 * p, x, x);
1360 p = blend(p, maskzInv(p, m), m);
1364 /*! \brief SIMD float asin(x).
1366 * \param x The argument to evaluate asin for
1369 static inline SimdFloat gmx_simdcall asin(SimdFloat x)
1371 const SimdFloat limitlow(1e-4F);
1372 const SimdFloat half(0.5F);
1373 const SimdFloat one(1.0F);
1374 const SimdFloat halfpi(static_cast<float>(M_PI / 2.0F));
1375 const SimdFloat CC5(4.2163199048E-2F);
1376 const SimdFloat CC4(2.4181311049E-2F);
1377 const SimdFloat CC3(4.5470025998E-2F);
1378 const SimdFloat CC2(7.4953002686E-2F);
1379 const SimdFloat CC1(1.6666752422E-1F);
1381 SimdFloat z, z1, z2, q, q1, q2;
1387 z1 = half * (one - xabs);
1389 q1 = z1 * maskzInvsqrt(z1, m2);
1392 z = blend(z2, z1, m);
1393 q = blend(q2, q1, m);
1396 pA = fma(CC5, z2, CC3);
1397 pB = fma(CC4, z2, CC2);
1398 pA = fma(pA, z2, CC1);
1400 z = fma(pB, z2, pA);
1404 z = blend(z, q2, m);
1406 m = limitlow < xabs;
1407 z = blend(xabs, z, m);
1413 /*! \brief SIMD float acos(x).
1415 * \param x The argument to evaluate acos for
1418 static inline SimdFloat gmx_simdcall acos(SimdFloat x)
1420 const SimdFloat one(1.0F);
1421 const SimdFloat half(0.5F);
1422 const SimdFloat pi(static_cast<float>(M_PI));
1423 const SimdFloat halfpi(static_cast<float>(M_PI / 2.0F));
1425 SimdFloat z, z1, z2, z3;
1426 SimdFBool m1, m2, m3;
1432 z = fnma(half, xabs, half);
1434 z = z * maskzInvsqrt(z, m3);
1435 z = blend(x, z, m1);
1441 z = blend(z1, z2, m2);
1442 z = blend(z3, z, m1);
1447 /*! \brief SIMD float asin(x).
1449 * \param x The argument to evaluate atan for
1450 * \result Atan(x), same argument/value range as standard math library.
1452 static inline SimdFloat gmx_simdcall atan(SimdFloat x)
1454 const SimdFloat halfpi(static_cast<float>(M_PI / 2.0F));
1455 const SimdFloat CA17(0.002823638962581753730774F);
1456 const SimdFloat CA15(-0.01595690287649631500244F);
1457 const SimdFloat CA13(0.04250498861074447631836F);
1458 const SimdFloat CA11(-0.07489009201526641845703F);
1459 const SimdFloat CA9(0.1063479334115982055664F);
1460 const SimdFloat CA7(-0.1420273631811141967773F);
1461 const SimdFloat CA5(0.1999269574880599975585F);
1462 const SimdFloat CA3(-0.3333310186862945556640F);
1463 const SimdFloat one(1.0F);
1464 SimdFloat x2, x3, x4, pA, pB;
1470 x = blend(x, maskzInv(x, m2), m2);
1475 pA = fma(CA17, x4, CA13);
1476 pB = fma(CA15, x4, CA11);
1477 pA = fma(pA, x4, CA9);
1478 pB = fma(pB, x4, CA7);
1479 pA = fma(pA, x4, CA5);
1480 pB = fma(pB, x4, CA3);
1481 pA = fma(pA, x2, pB);
1482 pA = fma(pA, x3, x);
1484 pA = blend(pA, halfpi - pA, m2);
1485 pA = blend(pA, -pA, m);
1490 /*! \brief SIMD float atan2(y,x).
1492 * \param y Y component of vector, any quartile
1493 * \param x X component of vector, any quartile
1494 * \result Atan(y,x), same argument/value range as standard math library.
1496 * \note This routine should provide correct results for all finite
1497 * non-zero or positive-zero arguments. However, negative zero arguments will
1498 * be treated as positive zero, which means the return value will deviate from
1499 * the standard math library atan2(y,x) for those cases. That should not be
1500 * of any concern in Gromacs, and in particular it will not affect calculations
1501 * of angles from vectors.
1503 static inline SimdFloat gmx_simdcall atan2(SimdFloat y, SimdFloat x)
1505 const SimdFloat pi(static_cast<float>(M_PI));
1506 const SimdFloat halfpi(static_cast<float>(M_PI / 2.0));
1507 SimdFloat xinv, p, aoffset;
1508 SimdFBool mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
1510 mask_xnz = x != setZero();
1511 mask_ynz = y != setZero();
1512 mask_xlt0 = x < setZero();
1513 mask_ylt0 = y < setZero();
1515 aoffset = selectByNotMask(halfpi, mask_xnz);
1516 aoffset = selectByMask(aoffset, mask_ynz);
1518 aoffset = blend(aoffset, pi, mask_xlt0);
1519 aoffset = blend(aoffset, -aoffset, mask_ylt0);
1521 xinv = maskzInv(x, mask_xnz);
1529 /*! \brief Calculate the force correction due to PME analytically in SIMD float.
1531 * \param z2 \f$(r \beta)^2\f$ - see below for details.
1532 * \result Correction factor to coulomb force - see below for details.
1534 * This routine is meant to enable analytical evaluation of the
1535 * direct-space PME electrostatic force to avoid tables.
1537 * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
1538 * are some problems evaluating that:
1540 * First, the error function is difficult (read: expensive) to
1541 * approxmiate accurately for intermediate to large arguments, and
1542 * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
1543 * Second, we now try to avoid calculating potentials in Gromacs but
1544 * use forces directly.
1546 * We can simply things slight by noting that the PME part is really
1547 * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
1549 * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
1551 * The first term we already have from the inverse square root, so
1552 * that we can leave out of this routine.
1554 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1555 * the argument \f$beta r\f$ will be in the range 0.15 to ~4, which is
1556 * the range used for the minimax fit. Use your favorite plotting program
1557 * to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is in this range!
1559 * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
1560 * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
1561 * then only use even powers. This is another minor optimization, since
1562 * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
1563 * the vector between the two atoms to get the vectorial force. The
1564 * fastest flops are the ones we can avoid calculating!
1566 * So, here's how it should be used:
1568 * 1. Calculate \f$r^2\f$.
1569 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
1570 * 3. Evaluate this routine with \f$z^2\f$ as the argument.
1571 * 4. The return value is the expression:
1574 * \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
1577 * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
1580 * \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
1583 * or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
1586 * \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
1589 * With a bit of math exercise you should be able to confirm that
1593 * \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
1596 * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
1597 * and you have your force (divided by \f$r\f$). A final multiplication
1598 * with the vector connecting the two particles and you have your
1599 * vectorial force to add to the particles.
1601 * This approximation achieves an error slightly lower than 1e-6
1602 * in single precision and 1e-11 in double precision
1603 * for arguments smaller than 16 (\f$\beta r \leq 4 \f$);
1604 * when added to \f$1/r\f$ the error will be insignificant.
1605 * For \f$\beta r \geq 7206\f$ the return value can be inf or NaN.
1608 static inline SimdFloat gmx_simdcall pmeForceCorrection(SimdFloat z2)
1610 const SimdFloat FN6(-1.7357322914161492954e-8F);
1611 const SimdFloat FN5(1.4703624142580877519e-6F);
1612 const SimdFloat FN4(-0.000053401640219807709149F);
1613 const SimdFloat FN3(0.0010054721316683106153F);
1614 const SimdFloat FN2(-0.019278317264888380590F);
1615 const SimdFloat FN1(0.069670166153766424023F);
1616 const SimdFloat FN0(-0.75225204789749321333F);
1618 const SimdFloat FD4(0.0011193462567257629232F);
1619 const SimdFloat FD3(0.014866955030185295499F);
1620 const SimdFloat FD2(0.11583842382862377919F);
1621 const SimdFloat FD1(0.50736591960530292870F);
1622 const SimdFloat FD0(1.0F);
1625 SimdFloat polyFN0, polyFN1, polyFD0, polyFD1;
1629 polyFD0 = fma(FD4, z4, FD2);
1630 polyFD1 = fma(FD3, z4, FD1);
1631 polyFD0 = fma(polyFD0, z4, FD0);
1632 polyFD0 = fma(polyFD1, z2, polyFD0);
1634 polyFD0 = inv(polyFD0);
1636 polyFN0 = fma(FN6, z4, FN4);
1637 polyFN1 = fma(FN5, z4, FN3);
1638 polyFN0 = fma(polyFN0, z4, FN2);
1639 polyFN1 = fma(polyFN1, z4, FN1);
1640 polyFN0 = fma(polyFN0, z4, FN0);
1641 polyFN0 = fma(polyFN1, z2, polyFN0);
1643 return polyFN0 * polyFD0;
1647 /*! \brief Calculate the potential correction due to PME analytically in SIMD float.
1649 * \param z2 \f$(r \beta)^2\f$ - see below for details.
1650 * \result Correction factor to coulomb potential - see below for details.
1652 * See \ref pmeForceCorrection for details about the approximation.
1654 * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
1655 * as the input argument.
1657 * Here's how it should be used:
1659 * 1. Calculate \f$r^2\f$.
1660 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
1661 * 3. Evaluate this routine with z^2 as the argument.
1662 * 4. The return value is the expression:
1665 * \frac{\mbox{erf}(z)}{z}
1668 * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
1671 * \frac{\mbox{erf}(r \beta)}{r}
1674 * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
1675 * and you have your potential.
1677 * This approximation achieves an error slightly lower than 1e-6
1678 * in single precision and 4e-11 in double precision
1679 * for arguments smaller than 16 (\f$ 0.15 \leq \beta r \leq 4 \f$);
1680 * for \f$ \beta r \leq 0.15\f$ the error can be twice as high;
1681 * when added to \f$1/r\f$ the error will be insignificant.
1682 * For \f$\beta r \geq 7142\f$ the return value can be inf or NaN.
1684 static inline SimdFloat gmx_simdcall pmePotentialCorrection(SimdFloat z2)
1686 const SimdFloat VN6(1.9296833005951166339e-8F);
1687 const SimdFloat VN5(-1.4213390571557850962e-6F);
1688 const SimdFloat VN4(0.000041603292906656984871F);
1689 const SimdFloat VN3(-0.00013134036773265025626F);
1690 const SimdFloat VN2(0.038657983986041781264F);
1691 const SimdFloat VN1(0.11285044772717598220F);
1692 const SimdFloat VN0(1.1283802385263030286F);
1694 const SimdFloat VD3(0.0066752224023576045451F);
1695 const SimdFloat VD2(0.078647795836373922256F);
1696 const SimdFloat VD1(0.43336185284710920150F);
1697 const SimdFloat VD0(1.0F);
1700 SimdFloat polyVN0, polyVN1, polyVD0, polyVD1;
1704 polyVD1 = fma(VD3, z4, VD1);
1705 polyVD0 = fma(VD2, z4, VD0);
1706 polyVD0 = fma(polyVD1, z2, polyVD0);
1708 polyVD0 = inv(polyVD0);
1710 polyVN0 = fma(VN6, z4, VN4);
1711 polyVN1 = fma(VN5, z4, VN3);
1712 polyVN0 = fma(polyVN0, z4, VN2);
1713 polyVN1 = fma(polyVN1, z4, VN1);
1714 polyVN0 = fma(polyVN0, z4, VN0);
1715 polyVN0 = fma(polyVN1, z2, polyVN0);
1717 return polyVN0 * polyVD0;
1723 # if GMX_SIMD_HAVE_DOUBLE
1726 /*! \name Double precision SIMD math functions
1728 * \note In most cases you should use the real-precision functions instead.
1732 /****************************************
1733 * DOUBLE PRECISION SIMD MATH FUNCTIONS *
1734 ****************************************/
1736 # if !GMX_SIMD_HAVE_NATIVE_COPYSIGN_DOUBLE
1737 /*! \brief Composes floating point value with the magnitude of x and the sign of y.
1739 * \param x Values to set sign for
1740 * \param y Values used to set sign
1741 * \return Magnitude of x, sign of y
1743 static inline SimdDouble gmx_simdcall copysign(SimdDouble x, SimdDouble y)
1745 # if GMX_SIMD_HAVE_LOGICAL
1746 return abs(x) | (SimdDouble(GMX_DOUBLE_NEGZERO) & y);
1748 return blend(abs(x), -abs(x), (y < setZero()));
1753 # if !GMX_SIMD_HAVE_NATIVE_RSQRT_ITER_DOUBLE
1754 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD double.
1756 * This is a low-level routine that should only be used by SIMD math routine
1757 * that evaluates the inverse square root.
1759 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
1760 * \param x The reference (starting) value x for which we want 1/sqrt(x).
1761 * \return An improved approximation with roughly twice as many bits of accuracy.
1763 static inline SimdDouble gmx_simdcall rsqrtIter(SimdDouble lu, SimdDouble x)
1765 SimdDouble tmp1 = x * lu;
1766 SimdDouble tmp2 = SimdDouble(-0.5) * lu;
1767 tmp1 = fma(tmp1, lu, SimdDouble(-3.0));
1772 /*! \brief Calculate 1/sqrt(x) for SIMD double.
1774 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
1775 * GMX_FLOAT_MAX, i.e. within the range of single precision.
1776 * For the single precision implementation this is obviously always
1777 * true for positive values, but for double precision it adds an
1778 * extra restriction since the first lookup step might have to be
1779 * performed in single precision on some architectures. Note that the
1780 * responsibility for checking falls on you - this routine does not
1783 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
1785 static inline SimdDouble gmx_simdcall invsqrt(SimdDouble x)
1787 SimdDouble lu = rsqrt(x);
1788 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1789 lu = rsqrtIter(lu, x);
1791 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1792 lu = rsqrtIter(lu, x);
1794 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1795 lu = rsqrtIter(lu, x);
1797 # if (GMX_SIMD_RSQRT_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1798 lu = rsqrtIter(lu, x);
1803 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles.
1805 * \param x0 First set of arguments, x0 must be in single range (see below).
1806 * \param x1 Second set of arguments, x1 must be in single range (see below).
1807 * \param[out] out0 Result 1/sqrt(x0)
1808 * \param[out] out1 Result 1/sqrt(x1)
1810 * In particular for double precision we can sometimes calculate square root
1811 * pairs slightly faster by using single precision until the very last step.
1813 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
1814 * GMX_FLOAT_MAX, i.e. within the range of single precision.
1815 * For the single precision implementation this is obviously always
1816 * true for positive values, but for double precision it adds an
1817 * extra restriction since the first lookup step might have to be
1818 * performed in single precision on some architectures. Note that the
1819 * responsibility for checking falls on you - this routine does not
1822 static inline void gmx_simdcall invsqrtPair(SimdDouble x0, SimdDouble x1, SimdDouble* out0, SimdDouble* out1)
1824 # if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2 * GMX_SIMD_DOUBLE_WIDTH) \
1825 && (GMX_SIMD_RSQRT_BITS < 22)
1826 SimdFloat xf = cvtDD2F(x0, x1);
1827 SimdFloat luf = rsqrt(xf);
1828 SimdDouble lu0, lu1;
1829 // Intermediate target is single - mantissa+1 bits
1830 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
1831 luf = rsqrtIter(luf, xf);
1833 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1834 luf = rsqrtIter(luf, xf);
1836 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
1837 luf = rsqrtIter(luf, xf);
1839 cvtF2DD(luf, &lu0, &lu1);
1840 // Last iteration(s) performed in double - if we had 22 bits, this gets us to 44 (~1e-15)
1841 # if (GMX_SIMD_ACCURACY_BITS_SINGLE < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1842 lu0 = rsqrtIter(lu0, x0);
1843 lu1 = rsqrtIter(lu1, x1);
1845 # if (GMX_SIMD_ACCURACY_BITS_SINGLE * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1846 lu0 = rsqrtIter(lu0, x0);
1847 lu1 = rsqrtIter(lu1, x1);
1852 *out0 = invsqrt(x0);
1853 *out1 = invsqrt(x1);
1857 # if !GMX_SIMD_HAVE_NATIVE_RCP_ITER_DOUBLE
1858 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD double.
1860 * This is a low-level routine that should only be used by SIMD math routine
1861 * that evaluates the reciprocal.
1863 * \param lu Approximation of 1/x, typically obtained from lookup.
1864 * \param x The reference (starting) value x for which we want 1/x.
1865 * \return An improved approximation with roughly twice as many bits of accuracy.
1867 static inline SimdDouble gmx_simdcall rcpIter(SimdDouble lu, SimdDouble x)
1869 return lu * fnma(lu, x, SimdDouble(2.0));
1873 /*! \brief Calculate 1/x for SIMD double.
1875 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
1876 * GMX_FLOAT_MAX, i.e. within the range of single precision.
1877 * For the single precision implementation this is obviously always
1878 * true for positive values, but for double precision it adds an
1879 * extra restriction since the first lookup step might have to be
1880 * performed in single precision on some architectures. Note that the
1881 * responsibility for checking falls on you - this routine does not
1884 * \return 1/x. Result is undefined if your argument was invalid.
1886 static inline SimdDouble gmx_simdcall inv(SimdDouble x)
1888 SimdDouble lu = rcp(x);
1889 # if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1890 lu = rcpIter(lu, x);
1892 # if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1893 lu = rcpIter(lu, x);
1895 # if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1896 lu = rcpIter(lu, x);
1898 # if (GMX_SIMD_RCP_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1899 lu = rcpIter(lu, x);
1904 /*! \brief Division for SIMD doubles
1906 * \param nom Nominator
1907 * \param denom Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
1908 * For single precision this is equivalent to a nonzero argument,
1909 * but in double precision it adds an extra restriction since
1910 * the first lookup step might have to be performed in single
1911 * precision on some architectures. Note that the responsibility
1912 * for checking falls on you - this routine does not check arguments.
1916 * \note This function does not use any masking to avoid problems with
1917 * zero values in the denominator.
1919 static inline SimdDouble gmx_simdcall operator/(SimdDouble nom, SimdDouble denom)
1921 return nom * inv(denom);
1925 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD double.
1927 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
1928 * Illegal values in the masked-out elements will not lead to
1929 * floating-point exceptions.
1931 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
1932 * GMX_FLOAT_MAX for masked-in entries.
1933 * See \ref invsqrt for the discussion about argument restrictions.
1935 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
1936 * entry was not masked, and 0.0 for masked-out entries.
1938 static inline SimdDouble maskzInvsqrt(SimdDouble x, SimdDBool m)
1940 SimdDouble lu = maskzRsqrt(x, m);
1941 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1942 lu = rsqrtIter(lu, x);
1944 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1945 lu = rsqrtIter(lu, x);
1947 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1948 lu = rsqrtIter(lu, x);
1950 # if (GMX_SIMD_RSQRT_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1951 lu = rsqrtIter(lu, x);
1956 /*! \brief Calculate 1/x for SIMD double, masked version.
1958 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
1959 * GMX_FLOAT_MAX for masked-in entries.
1960 * See \ref invsqrt for the discussion about argument restrictions.
1962 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
1964 static inline SimdDouble gmx_simdcall maskzInv(SimdDouble x, SimdDBool m)
1966 SimdDouble lu = maskzRcp(x, m);
1967 # if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1968 lu = rcpIter(lu, x);
1970 # if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1971 lu = rcpIter(lu, x);
1973 # if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1974 lu = rcpIter(lu, x);
1976 # if (GMX_SIMD_RCP_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
1977 lu = rcpIter(lu, x);
1983 /*! \brief Calculate sqrt(x) for SIMD doubles.
1985 * \copydetails sqrt(SimdFloat)
1987 template<MathOptimization opt = MathOptimization::Safe>
1988 static inline SimdDouble gmx_simdcall sqrt(SimdDouble x)
1990 if (opt == MathOptimization::Safe)
1992 // As we might use a float version of rsqrt, we mask out small values
1993 SimdDouble res = maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) < x);
1998 return x * invsqrt(x);
2002 /*! \brief Cube root for SIMD doubles
2004 * \param x Argument to calculate cube root of. Can be negative or zero,
2005 * but NaN or Inf values are not supported. Denormal values will
2006 * be treated as 0.0.
2007 * \return Cube root of x.
2009 static inline SimdDouble gmx_simdcall cbrt(SimdDouble x)
2011 const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
2012 const SimdDouble minDouble(std::numeric_limits<double>::min());
2013 // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
2014 // To avoid clang warnings about fragile integer division mixed with FP, we let
2015 // the divided value (1023/3=341) be the original constant.
2016 const std::int32_t offsetDiv3(341);
2017 const SimdDouble c6(-0.145263899385486377);
2018 const SimdDouble c5(0.784932344976639262);
2019 const SimdDouble c4(-1.83469277483613086);
2020 const SimdDouble c3(2.44693122563534430);
2021 const SimdDouble c2(-2.11499494167371287);
2022 const SimdDouble c1(1.50819193781584896);
2023 const SimdDouble c0(0.354895765043919860);
2024 const SimdDouble one(1.0);
2025 const SimdDouble two(2.0);
2026 const SimdDouble three(3.0);
2027 const SimdDouble oneThird(1.0 / 3.0);
2028 const SimdDouble cbrt2(1.2599210498948731648);
2029 const SimdDouble sqrCbrt2(1.5874010519681994748);
2031 // See the single precision routines for documentation of the algorithm
2033 SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
2034 SimdDouble xAbs = andNot(signBit, x); // select everthing but the sign bit => abs(x)
2035 SimdDBool xIsNonZero = (minDouble <= xAbs); // treat denormals as 0
2037 SimdDInt32 exponent;
2038 SimdDouble y = frexp(xAbs, &exponent);
2039 SimdDouble z = fma(y, c6, c5);
2045 SimdDouble w = z * z * z;
2046 SimdDouble nom = z * fma(two, y, w);
2047 SimdDouble invDenom = inv(fma(two, w, y));
2049 SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
2050 SimdDouble offsetExpDiv3 =
2051 trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
2052 SimdDInt32 expDiv3 = cvtR2I(offsetExpDiv3 - SimdDouble(static_cast<double>(offsetDiv3)));
2053 SimdDouble remainder = offsetExp - offsetExpDiv3 * three;
2054 SimdDouble factor = blend(one, cbrt2, SimdDouble(0.5) < remainder);
2055 factor = blend(factor, sqrCbrt2, SimdDouble(1.5) < remainder);
2056 SimdDouble fraction = (nom * invDenom * factor) ^ xSignBit;
2057 SimdDouble result = selectByMask(ldexp(fraction, expDiv3), xIsNonZero);
2061 /*! \brief Inverse cube root for SIMD doubles.
2063 * \param x Argument to calculate cube root of. Can be positive or
2064 * negative, but the magnitude cannot be lower than
2065 * the smallest normal number.
2066 * \return Cube root of x. Undefined for values that don't
2067 * fulfill the restriction of abs(x) > minDouble.
2069 static inline SimdDouble gmx_simdcall invcbrt(SimdDouble x)
2071 const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
2072 // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
2073 // To avoid clang warnings about fragile integer division mixed with FP, we let
2074 // the divided value (1023/3=341) be the original constant.
2075 const std::int32_t offsetDiv3(341);
2076 const SimdDouble c6(-0.145263899385486377);
2077 const SimdDouble c5(0.784932344976639262);
2078 const SimdDouble c4(-1.83469277483613086);
2079 const SimdDouble c3(2.44693122563534430);
2080 const SimdDouble c2(-2.11499494167371287);
2081 const SimdDouble c1(1.50819193781584896);
2082 const SimdDouble c0(0.354895765043919860);
2083 const SimdDouble one(1.0);
2084 const SimdDouble two(2.0);
2085 const SimdDouble three(3.0);
2086 const SimdDouble oneThird(1.0 / 3.0);
2087 const SimdDouble invCbrt2(1.0 / 1.2599210498948731648);
2088 const SimdDouble invSqrCbrt2(1.0F / 1.5874010519681994748);
2090 // See the single precision routines for documentation of the algorithm
2092 SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
2093 SimdDouble xAbs = andNot(signBit, x); // select everthing but the sign bit => abs(x)
2095 SimdDInt32 exponent;
2096 SimdDouble y = frexp(xAbs, &exponent);
2097 SimdDouble z = fma(y, c6, c5);
2103 SimdDouble w = z * z * z;
2104 SimdDouble nom = fma(two, w, y);
2105 SimdDouble invDenom = inv(z * fma(two, y, w));
2106 SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
2107 SimdDouble offsetExpDiv3 =
2108 trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
2109 SimdDInt32 expDiv3 = cvtR2I(SimdDouble(static_cast<double>(offsetDiv3)) - offsetExpDiv3);
2110 SimdDouble remainder = offsetExpDiv3 * three - offsetExp;
2111 SimdDouble factor = blend(one, invCbrt2, remainder < SimdDouble(-0.5));
2112 factor = blend(factor, invSqrCbrt2, remainder < SimdDouble(-1.5));
2113 SimdDouble fraction = (nom * invDenom * factor) ^ xSignBit;
2114 SimdDouble result = ldexp(fraction, expDiv3);
2118 /*! \brief SIMD double log2(x). This is the base-2 logarithm.
2120 * \param x Argument, should be >0.
2121 * \result The base-2 logarithm of x. Undefined if argument is invalid.
2123 static inline SimdDouble gmx_simdcall log2(SimdDouble x)
2125 # if GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
2126 // Just rescale if native log2() is not present, but log is.
2127 return log(x) * SimdDouble(std::log2(std::exp(1.0)));
2129 const SimdDouble one(1.0);
2130 const SimdDouble two(2.0);
2131 const SimdDouble invsqrt2(1.0 / std::sqrt(2.0));
2132 const SimdDouble CL15(0.2138031565795550370534528);
2133 const SimdDouble CL13(0.2208884091496370882801159);
2134 const SimdDouble CL11(0.2623358279761824340958754);
2135 const SimdDouble CL9(0.3205984930182496084327681);
2136 const SimdDouble CL7(0.4121985864521960363227038);
2137 const SimdDouble CL5(0.5770780163410746954610886);
2138 const SimdDouble CL3(0.9617966939260027547931031);
2139 const SimdDouble CL1(2.885390081777926774009302);
2140 SimdDouble fExp, x2, p;
2144 x = frexp(x, &iExp);
2145 fExp = cvtI2R(iExp);
2148 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
2149 fExp = fExp - selectByMask(one, m);
2150 x = x * blend(one, two, m);
2152 x = (x - one) * inv(x + one);
2155 p = fma(CL15, x2, CL13);
2156 p = fma(p, x2, CL11);
2157 p = fma(p, x2, CL9);
2158 p = fma(p, x2, CL7);
2159 p = fma(p, x2, CL5);
2160 p = fma(p, x2, CL3);
2161 p = fma(p, x2, CL1);
2162 p = fma(p, x, fExp);
2168 # if !GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
2169 /*! \brief SIMD double log(x). This is the natural logarithm.
2171 * \param x Argument, should be >0.
2172 * \result The natural logarithm of x. Undefined if argument is invalid.
2174 static inline SimdDouble gmx_simdcall log(SimdDouble x)
2176 const SimdDouble one(1.0);
2177 const SimdDouble two(2.0);
2178 const SimdDouble invsqrt2(1.0 / std::sqrt(2.0));
2179 const SimdDouble corr(0.693147180559945286226764);
2180 const SimdDouble CL15(0.148197055177935105296783);
2181 const SimdDouble CL13(0.153108178020442575739679);
2182 const SimdDouble CL11(0.181837339521549679055568);
2183 const SimdDouble CL9(0.22222194152736701733275);
2184 const SimdDouble CL7(0.285714288030134544449368);
2185 const SimdDouble CL5(0.399999999989941956712869);
2186 const SimdDouble CL3(0.666666666666685503450651);
2187 const SimdDouble CL1(2.0);
2188 SimdDouble fExp, x2, p;
2192 x = frexp(x, &iExp);
2193 fExp = cvtI2R(iExp);
2196 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
2197 fExp = fExp - selectByMask(one, m);
2198 x = x * blend(one, two, m);
2200 x = (x - one) * inv(x + one);
2203 p = fma(CL15, x2, CL13);
2204 p = fma(p, x2, CL11);
2205 p = fma(p, x2, CL9);
2206 p = fma(p, x2, CL7);
2207 p = fma(p, x2, CL5);
2208 p = fma(p, x2, CL3);
2209 p = fma(p, x2, CL1);
2210 p = fma(p, x, corr * fExp);
2216 # if !GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE
2217 /*! \brief SIMD double 2^x.
2219 * \copydetails exp2(SimdFloat)
2221 template<MathOptimization opt = MathOptimization::Safe>
2222 static inline SimdDouble gmx_simdcall exp2(SimdDouble x)
2224 const SimdDouble CE11(4.435280790452730022081181e-10);
2225 const SimdDouble CE10(7.074105630863314448024247e-09);
2226 const SimdDouble CE9(1.017819803432096698472621e-07);
2227 const SimdDouble CE8(1.321543308956718799557863e-06);
2228 const SimdDouble CE7(0.00001525273348995851746990884);
2229 const SimdDouble CE6(0.0001540353046251466849082632);
2230 const SimdDouble CE5(0.001333355814678995257307880);
2231 const SimdDouble CE4(0.009618129107588335039176502);
2232 const SimdDouble CE3(0.05550410866481992147457793);
2233 const SimdDouble CE2(0.2402265069591015620470894);
2234 const SimdDouble CE1(0.6931471805599453304615075);
2235 const SimdDouble one(1.0);
2238 SimdDouble fexppart;
2241 // Large negative values are valid arguments to exp2(), so there are two
2242 // things we need to account for:
2243 // 1. When the exponents reaches -1023, the (biased) exponent field will be
2244 // zero and we can no longer multiply with it. There are special IEEE
2245 // formats to handle this range, but for now we have to accept that
2246 // we cannot handle those arguments. If input value becomes even more
2247 // negative, it will start to loop and we would end up with invalid
2248 // exponents. Thus, we need to limit or mask this.
2249 // 2. For VERY large negative values, we will have problems that the
2250 // subtraction to get the fractional part loses accuracy, and then we
2251 // can end up with overflows in the polynomial.
2253 // For now, we handle this by forwarding the math optimization setting to
2254 // ldexp, where the routine will return zero for very small arguments.
2256 // However, before doing that we need to make sure we do not call cvtR2I
2257 // with an argument that is so negative it cannot be converted to an integer.
2258 if (opt == MathOptimization::Safe)
2260 x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()));
2263 fexppart = ldexp<opt>(one, cvtR2I(x));
2267 p = fma(CE11, x, CE10);
2283 # if !GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE
2284 /*! \brief SIMD double exp(x).
2286 * \copydetails exp(SimdFloat)
2288 template<MathOptimization opt = MathOptimization::Safe>
2289 static inline SimdDouble gmx_simdcall exp(SimdDouble x)
2291 const SimdDouble argscale(1.44269504088896340735992468100);
2292 const SimdDouble invargscale0(-0.69314718055966295651160180568695068359375);
2293 const SimdDouble invargscale1(-2.8235290563031577122588448175013436025525412068e-13);
2294 const SimdDouble CE12(2.078375306791423699350304e-09);
2295 const SimdDouble CE11(2.518173854179933105218635e-08);
2296 const SimdDouble CE10(2.755842049600488770111608e-07);
2297 const SimdDouble CE9(2.755691815216689746619849e-06);
2298 const SimdDouble CE8(2.480158383706245033920920e-05);
2299 const SimdDouble CE7(0.0001984127043518048611841321);
2300 const SimdDouble CE6(0.001388888889360258341755930);
2301 const SimdDouble CE5(0.008333333332907368102819109);
2302 const SimdDouble CE4(0.04166666666663836745814631);
2303 const SimdDouble CE3(0.1666666666666796929434570);
2304 const SimdDouble CE2(0.5);
2305 const SimdDouble one(1.0);
2306 SimdDouble fexppart;
2310 // Large negative values are valid arguments to exp2(), so there are two
2311 // things we need to account for:
2312 // 1. When the exponents reaches -1023, the (biased) exponent field will be
2313 // zero and we can no longer multiply with it. There are special IEEE
2314 // formats to handle this range, but for now we have to accept that
2315 // we cannot handle those arguments. If input value becomes even more
2316 // negative, it will start to loop and we would end up with invalid
2317 // exponents. Thus, we need to limit or mask this.
2318 // 2. For VERY large negative values, we will have problems that the
2319 // subtraction to get the fractional part loses accuracy, and then we
2320 // can end up with overflows in the polynomial.
2322 // For now, we handle this by forwarding the math optimization setting to
2323 // ldexp, where the routine will return zero for very small arguments.
2325 // However, before doing that we need to make sure we do not call cvtR2I
2326 // with an argument that is so negative it cannot be converted to an integer
2327 // after being multiplied by argscale.
2329 if (opt == MathOptimization::Safe)
2331 x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()) / argscale);
2336 fexppart = ldexp<opt>(one, cvtR2I(y));
2339 // Extended precision arithmetics
2340 x = fma(invargscale0, intpart, x);
2341 x = fma(invargscale1, intpart, x);
2343 p = fma(CE12, x, CE11);
2344 p = fma(p, x, CE10);
2353 p = fma(p, x * x, x);
2354 # if GMX_SIMD_HAVE_FMA
2355 x = fma(p, fexppart, fexppart);
2357 x = (p + one) * fexppart;
2364 /*! \brief SIMD double pow(x,y)
2366 * This returns x^y for SIMD values.
2368 * \tparam opt If this is changed from the default (safe) into the unsafe
2369 * option, there are no guarantees about correct results for x==0.
2373 * \param y exponent.
2375 * \result x^y. Overflowing arguments are likely to either return 0 or inf,
2376 * depending on the underlying implementation. If unsafe optimizations
2377 * are enabled, this is also true for x==0.
2379 * \warning You cannot rely on this implementation returning inf for arguments
2380 * that cause overflow. If you have some very large
2381 * values and need to rely on getting a valid numerical output,
2382 * take the minimum of your variable and the largest valid argument
2383 * before calling this routine.
2385 template<MathOptimization opt = MathOptimization::Safe>
2386 static inline SimdDouble gmx_simdcall pow(SimdDouble x, SimdDouble y)
2390 if (opt == MathOptimization::Safe)
2392 xcorr = max(x, SimdDouble(std::numeric_limits<double>::min()));
2399 SimdDouble result = exp2<opt>(y * log2(xcorr));
2401 if (opt == MathOptimization::Safe)
2403 // if x==0 and y>0 we explicitly set the result to 0.0
2404 // For any x with y==0, the result will already be 1.0 since we multiply by y (0.0) and call exp().
2405 result = blend(result, setZero(), x == setZero() && setZero() < y);
2412 /*! \brief SIMD double erf(x).
2414 * \param x The value to calculate erf(x) for.
2417 * This routine achieves very close to full precision, but we do not care about
2418 * the last bit or the subnormal result range.
2420 static inline SimdDouble gmx_simdcall erf(SimdDouble x)
2422 // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
2423 const SimdDouble CAP4(-0.431780540597889301512e-4);
2424 const SimdDouble CAP3(-0.00578562306260059236059);
2425 const SimdDouble CAP2(-0.028593586920219752446);
2426 const SimdDouble CAP1(-0.315924962948621698209);
2427 const SimdDouble CAP0(0.14952975608477029151);
2429 const SimdDouble CAQ5(-0.374089300177174709737e-5);
2430 const SimdDouble CAQ4(0.00015126584532155383535);
2431 const SimdDouble CAQ3(0.00536692680669480725423);
2432 const SimdDouble CAQ2(0.0668686825594046122636);
2433 const SimdDouble CAQ1(0.402604990869284362773);
2435 const SimdDouble CAoffset(0.9788494110107421875);
2437 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
2438 const SimdDouble CBP6(2.49650423685462752497647637088e-10);
2439 const SimdDouble CBP5(0.00119770193298159629350136085658);
2440 const SimdDouble CBP4(0.0164944422378370965881008942733);
2441 const SimdDouble CBP3(0.0984581468691775932063932439252);
2442 const SimdDouble CBP2(0.317364595806937763843589437418);
2443 const SimdDouble CBP1(0.554167062641455850932670067075);
2444 const SimdDouble CBP0(0.427583576155807163756925301060);
2445 const SimdDouble CBQ7(0.00212288829699830145976198384930);
2446 const SimdDouble CBQ6(0.0334810979522685300554606393425);
2447 const SimdDouble CBQ5(0.2361713785181450957579508850717);
2448 const SimdDouble CBQ4(0.955364736493055670530981883072);
2449 const SimdDouble CBQ3(2.36815675631420037315349279199);
2450 const SimdDouble CBQ2(3.55261649184083035537184223542);
2451 const SimdDouble CBQ1(2.93501136050160872574376997993);
2454 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
2455 const SimdDouble CCP6(-2.8175401114513378771);
2456 const SimdDouble CCP5(-3.22729451764143718517);
2457 const SimdDouble CCP4(-2.5518551727311523996);
2458 const SimdDouble CCP3(-0.687717681153649930619);
2459 const SimdDouble CCP2(-0.212652252872804219852);
2460 const SimdDouble CCP1(0.0175389834052493308818);
2461 const SimdDouble CCP0(0.00628057170626964891937);
2463 const SimdDouble CCQ6(5.48409182238641741584);
2464 const SimdDouble CCQ5(13.5064170191802889145);
2465 const SimdDouble CCQ4(22.9367376522880577224);
2466 const SimdDouble CCQ3(15.930646027911794143);
2467 const SimdDouble CCQ2(11.0567237927800161565);
2468 const SimdDouble CCQ1(2.79257750980575282228);
2470 const SimdDouble CCoffset(0.5579090118408203125);
2472 const SimdDouble one(1.0);
2473 const SimdDouble two(2.0);
2474 const SimdDouble minFloat(std::numeric_limits<float>::min());
2476 SimdDouble xabs, x2, x4, t, t2, w, w2;
2477 SimdDouble PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
2478 SimdDouble PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
2479 SimdDouble PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
2480 SimdDouble res_erf, res_erfcB, res_erfcC, res_erfc, res;
2482 SimdDBool mask, mask_erf, notmask_erf;
2486 mask_erf = (xabs < one);
2487 notmask_erf = (one <= xabs);
2491 PolyAP0 = fma(CAP4, x4, CAP2);
2492 PolyAP1 = fma(CAP3, x4, CAP1);
2493 PolyAP0 = fma(PolyAP0, x4, CAP0);
2494 PolyAP0 = fma(PolyAP1, x2, PolyAP0);
2496 PolyAQ1 = fma(CAQ5, x4, CAQ3);
2497 PolyAQ0 = fma(CAQ4, x4, CAQ2);
2498 PolyAQ1 = fma(PolyAQ1, x4, CAQ1);
2499 PolyAQ0 = fma(PolyAQ0, x4, one);
2500 PolyAQ0 = fma(PolyAQ1, x2, PolyAQ0);
2502 res_erf = PolyAP0 * maskzInv(PolyAQ0, mask_erf && (minFloat <= abs(PolyAQ0)));
2503 res_erf = CAoffset + res_erf;
2504 res_erf = x * res_erf;
2506 // Calculate erfc() in range [1,4.5]
2510 PolyBP0 = fma(CBP6, t2, CBP4);
2511 PolyBP1 = fma(CBP5, t2, CBP3);
2512 PolyBP0 = fma(PolyBP0, t2, CBP2);
2513 PolyBP1 = fma(PolyBP1, t2, CBP1);
2514 PolyBP0 = fma(PolyBP0, t2, CBP0);
2515 PolyBP0 = fma(PolyBP1, t, PolyBP0);
2517 PolyBQ1 = fma(CBQ7, t2, CBQ5);
2518 PolyBQ0 = fma(CBQ6, t2, CBQ4);
2519 PolyBQ1 = fma(PolyBQ1, t2, CBQ3);
2520 PolyBQ0 = fma(PolyBQ0, t2, CBQ2);
2521 PolyBQ1 = fma(PolyBQ1, t2, CBQ1);
2522 PolyBQ0 = fma(PolyBQ0, t2, one);
2523 PolyBQ0 = fma(PolyBQ1, t, PolyBQ0);
2525 // The denominator polynomial can be zero outside the range
2526 res_erfcB = PolyBP0 * maskzInv(PolyBQ0, notmask_erf && (minFloat <= abs(PolyBQ0)));
2528 res_erfcB = res_erfcB * xabs;
2530 // Calculate erfc() in range [4.5,inf]
2531 w = maskzInv(xabs, notmask_erf && (minFloat <= xabs));
2534 PolyCP0 = fma(CCP6, w2, CCP4);
2535 PolyCP1 = fma(CCP5, w2, CCP3);
2536 PolyCP0 = fma(PolyCP0, w2, CCP2);
2537 PolyCP1 = fma(PolyCP1, w2, CCP1);
2538 PolyCP0 = fma(PolyCP0, w2, CCP0);
2539 PolyCP0 = fma(PolyCP1, w, PolyCP0);
2541 PolyCQ0 = fma(CCQ6, w2, CCQ4);
2542 PolyCQ1 = fma(CCQ5, w2, CCQ3);
2543 PolyCQ0 = fma(PolyCQ0, w2, CCQ2);
2544 PolyCQ1 = fma(PolyCQ1, w2, CCQ1);
2545 PolyCQ0 = fma(PolyCQ0, w2, one);
2546 PolyCQ0 = fma(PolyCQ1, w, PolyCQ0);
2550 // The denominator polynomial can be zero outside the range
2551 res_erfcC = PolyCP0 * maskzInv(PolyCQ0, notmask_erf && (minFloat <= abs(PolyCQ0)));
2552 res_erfcC = res_erfcC + CCoffset;
2553 res_erfcC = res_erfcC * w;
2555 mask = (SimdDouble(4.5) < xabs);
2556 res_erfc = blend(res_erfcB, res_erfcC, mask);
2558 res_erfc = res_erfc * expmx2;
2560 // erfc(x<0) = 2-erfc(|x|)
2561 mask = (x < setZero());
2562 res_erfc = blend(res_erfc, two - res_erfc, mask);
2564 // Select erf() or erfc()
2565 res = blend(one - res_erfc, res_erf, mask_erf);
2570 /*! \brief SIMD double erfc(x).
2572 * \param x The value to calculate erfc(x) for.
2575 * This routine achieves full precision (bar the last bit) over most of the
2576 * input range, but for large arguments where the result is getting close
2577 * to the minimum representable numbers we accept slightly larger errors
2578 * (think results that are in the ballpark of 10^-200 for double)
2579 * since that is not relevant for MD.
2581 static inline SimdDouble gmx_simdcall erfc(SimdDouble x)
2583 // Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
2584 const SimdDouble CAP4(-0.431780540597889301512e-4);
2585 const SimdDouble CAP3(-0.00578562306260059236059);
2586 const SimdDouble CAP2(-0.028593586920219752446);
2587 const SimdDouble CAP1(-0.315924962948621698209);
2588 const SimdDouble CAP0(0.14952975608477029151);
2590 const SimdDouble CAQ5(-0.374089300177174709737e-5);
2591 const SimdDouble CAQ4(0.00015126584532155383535);
2592 const SimdDouble CAQ3(0.00536692680669480725423);
2593 const SimdDouble CAQ2(0.0668686825594046122636);
2594 const SimdDouble CAQ1(0.402604990869284362773);
2596 const SimdDouble CAoffset(0.9788494110107421875);
2598 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5]
2599 const SimdDouble CBP6(2.49650423685462752497647637088e-10);
2600 const SimdDouble CBP5(0.00119770193298159629350136085658);
2601 const SimdDouble CBP4(0.0164944422378370965881008942733);
2602 const SimdDouble CBP3(0.0984581468691775932063932439252);
2603 const SimdDouble CBP2(0.317364595806937763843589437418);
2604 const SimdDouble CBP1(0.554167062641455850932670067075);
2605 const SimdDouble CBP0(0.427583576155807163756925301060);
2606 const SimdDouble CBQ7(0.00212288829699830145976198384930);
2607 const SimdDouble CBQ6(0.0334810979522685300554606393425);
2608 const SimdDouble CBQ5(0.2361713785181450957579508850717);
2609 const SimdDouble CBQ4(0.955364736493055670530981883072);
2610 const SimdDouble CBQ3(2.36815675631420037315349279199);
2611 const SimdDouble CBQ2(3.55261649184083035537184223542);
2612 const SimdDouble CBQ1(2.93501136050160872574376997993);
2615 // Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf]
2616 const SimdDouble CCP6(-2.8175401114513378771);
2617 const SimdDouble CCP5(-3.22729451764143718517);
2618 const SimdDouble CCP4(-2.5518551727311523996);
2619 const SimdDouble CCP3(-0.687717681153649930619);
2620 const SimdDouble CCP2(-0.212652252872804219852);
2621 const SimdDouble CCP1(0.0175389834052493308818);
2622 const SimdDouble CCP0(0.00628057170626964891937);
2624 const SimdDouble CCQ6(5.48409182238641741584);
2625 const SimdDouble CCQ5(13.5064170191802889145);
2626 const SimdDouble CCQ4(22.9367376522880577224);
2627 const SimdDouble CCQ3(15.930646027911794143);
2628 const SimdDouble CCQ2(11.0567237927800161565);
2629 const SimdDouble CCQ1(2.79257750980575282228);
2631 const SimdDouble CCoffset(0.5579090118408203125);
2633 const SimdDouble one(1.0);
2634 const SimdDouble two(2.0);
2635 const SimdDouble minFloat(std::numeric_limits<float>::min());
2637 SimdDouble xabs, x2, x4, t, t2, w, w2;
2638 SimdDouble PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
2639 SimdDouble PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
2640 SimdDouble PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
2641 SimdDouble res_erf, res_erfcB, res_erfcC, res_erfc, res;
2643 SimdDBool mask, mask_erf, notmask_erf;
2647 mask_erf = (xabs < one);
2648 notmask_erf = (one <= xabs);
2652 PolyAP0 = fma(CAP4, x4, CAP2);
2653 PolyAP1 = fma(CAP3, x4, CAP1);
2654 PolyAP0 = fma(PolyAP0, x4, CAP0);
2655 PolyAP0 = fma(PolyAP1, x2, PolyAP0);
2656 PolyAQ1 = fma(CAQ5, x4, CAQ3);
2657 PolyAQ0 = fma(CAQ4, x4, CAQ2);
2658 PolyAQ1 = fma(PolyAQ1, x4, CAQ1);
2659 PolyAQ0 = fma(PolyAQ0, x4, one);
2660 PolyAQ0 = fma(PolyAQ1, x2, PolyAQ0);
2662 res_erf = PolyAP0 * maskzInv(PolyAQ0, mask_erf && (minFloat <= abs(PolyAQ0)));
2663 res_erf = CAoffset + res_erf;
2664 res_erf = x * res_erf;
2666 // Calculate erfc() in range [1,4.5]
2670 PolyBP0 = fma(CBP6, t2, CBP4);
2671 PolyBP1 = fma(CBP5, t2, CBP3);
2672 PolyBP0 = fma(PolyBP0, t2, CBP2);
2673 PolyBP1 = fma(PolyBP1, t2, CBP1);
2674 PolyBP0 = fma(PolyBP0, t2, CBP0);
2675 PolyBP0 = fma(PolyBP1, t, PolyBP0);
2677 PolyBQ1 = fma(CBQ7, t2, CBQ5);
2678 PolyBQ0 = fma(CBQ6, t2, CBQ4);
2679 PolyBQ1 = fma(PolyBQ1, t2, CBQ3);
2680 PolyBQ0 = fma(PolyBQ0, t2, CBQ2);
2681 PolyBQ1 = fma(PolyBQ1, t2, CBQ1);
2682 PolyBQ0 = fma(PolyBQ0, t2, one);
2683 PolyBQ0 = fma(PolyBQ1, t, PolyBQ0);
2685 // The denominator polynomial can be zero outside the range
2686 res_erfcB = PolyBP0 * maskzInv(PolyBQ0, notmask_erf && (minFloat <= abs(PolyBQ0)));
2688 res_erfcB = res_erfcB * xabs;
2690 // Calculate erfc() in range [4.5,inf]
2691 // Note that 1/x can only handle single precision!
2692 w = maskzInv(xabs, minFloat <= xabs);
2695 PolyCP0 = fma(CCP6, w2, CCP4);
2696 PolyCP1 = fma(CCP5, w2, CCP3);
2697 PolyCP0 = fma(PolyCP0, w2, CCP2);
2698 PolyCP1 = fma(PolyCP1, w2, CCP1);
2699 PolyCP0 = fma(PolyCP0, w2, CCP0);
2700 PolyCP0 = fma(PolyCP1, w, PolyCP0);
2702 PolyCQ0 = fma(CCQ6, w2, CCQ4);
2703 PolyCQ1 = fma(CCQ5, w2, CCQ3);
2704 PolyCQ0 = fma(PolyCQ0, w2, CCQ2);
2705 PolyCQ1 = fma(PolyCQ1, w2, CCQ1);
2706 PolyCQ0 = fma(PolyCQ0, w2, one);
2707 PolyCQ0 = fma(PolyCQ1, w, PolyCQ0);
2711 // The denominator polynomial can be zero outside the range
2712 res_erfcC = PolyCP0 * maskzInv(PolyCQ0, notmask_erf && (minFloat <= abs(PolyCQ0)));
2713 res_erfcC = res_erfcC + CCoffset;
2714 res_erfcC = res_erfcC * w;
2716 mask = (SimdDouble(4.5) < xabs);
2717 res_erfc = blend(res_erfcB, res_erfcC, mask);
2719 res_erfc = res_erfc * expmx2;
2721 // erfc(x<0) = 2-erfc(|x|)
2722 mask = (x < setZero());
2723 res_erfc = blend(res_erfc, two - res_erfc, mask);
2725 // Select erf() or erfc()
2726 res = blend(res_erfc, one - res_erf, mask_erf);
2731 /*! \brief SIMD double sin \& cos.
2733 * \param x The argument to evaluate sin/cos for
2734 * \param[out] sinval Sin(x)
2735 * \param[out] cosval Cos(x)
2737 * This version achieves close to machine precision, but for very large
2738 * magnitudes of the argument we inherently begin to lose accuracy due to the
2739 * argument reduction, despite using extended precision arithmetics internally.
2741 static inline void gmx_simdcall sincos(SimdDouble x, SimdDouble* sinval, SimdDouble* cosval)
2743 // Constants to subtract Pi/4*x from y while minimizing precision loss
2744 const SimdDouble argred0(-2 * 0.78539816290140151978);
2745 const SimdDouble argred1(-2 * 4.9604678871439933374e-10);
2746 const SimdDouble argred2(-2 * 1.1258708853173288931e-18);
2747 const SimdDouble argred3(-2 * 1.7607799325916000908e-27);
2748 const SimdDouble two_over_pi(2.0 / M_PI);
2749 const SimdDouble const_sin5(1.58938307283228937328511e-10);
2750 const SimdDouble const_sin4(-2.50506943502539773349318e-08);
2751 const SimdDouble const_sin3(2.75573131776846360512547e-06);
2752 const SimdDouble const_sin2(-0.000198412698278911770864914);
2753 const SimdDouble const_sin1(0.0083333333333191845961746);
2754 const SimdDouble const_sin0(-0.166666666666666130709393);
2756 const SimdDouble const_cos7(-1.13615350239097429531523e-11);
2757 const SimdDouble const_cos6(2.08757471207040055479366e-09);
2758 const SimdDouble const_cos5(-2.75573144028847567498567e-07);
2759 const SimdDouble const_cos4(2.48015872890001867311915e-05);
2760 const SimdDouble const_cos3(-0.00138888888888714019282329);
2761 const SimdDouble const_cos2(0.0416666666666665519592062);
2762 const SimdDouble half(0.5);
2763 const SimdDouble one(1.0);
2764 SimdDouble ssign, csign;
2765 SimdDouble x2, y, z, psin, pcos, sss, ccc;
2767 # if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2768 const SimdDInt32 ione(1);
2769 const SimdDInt32 itwo(2);
2772 z = x * two_over_pi;
2776 mask = cvtIB2B((iy & ione) == setZero());
2777 ssign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B((iy & itwo) == itwo));
2778 csign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
2780 const SimdDouble quarter(0.25);
2781 const SimdDouble minusquarter(-0.25);
2783 SimdDBool m1, m2, m3;
2785 /* The most obvious way to find the arguments quadrant in the unit circle
2786 * to calculate the sign is to use integer arithmetic, but that is not
2787 * present in all SIMD implementations. As an alternative, we have devised a
2788 * pure floating-point algorithm that uses truncation for argument reduction
2789 * so that we get a new value 0<=q<1 over the unit circle, and then
2790 * do floating-point comparisons with fractions. This is likely to be
2791 * slightly slower (~10%) due to the longer latencies of floating-point, so
2792 * we only use it when integer SIMD arithmetic is not present.
2796 // It is critical that half-way cases are rounded down
2797 z = fma(x, two_over_pi, half);
2801 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
2802 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
2803 * This removes the 2*Pi periodicity without using any integer arithmetic.
2804 * First check if y had the value 2 or 3, set csign if true.
2807 /* If we have logical operations we can work directly on the signbit, which
2808 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
2809 * Thus, if you are altering defines to debug alternative code paths, the
2810 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
2811 * active or inactive - you will get errors if only one is used.
2813 # if GMX_SIMD_HAVE_LOGICAL
2814 ssign = ssign & SimdDouble(GMX_DOUBLE_NEGZERO);
2815 csign = andNot(q, SimdDouble(GMX_DOUBLE_NEGZERO));
2816 ssign = ssign ^ csign;
2818 ssign = copysign(SimdDouble(1.0), ssign);
2819 csign = copysign(SimdDouble(1.0), q);
2821 ssign = ssign * csign; // swap ssign if csign was set.
2823 // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
2824 m1 = (q < minusquarter);
2825 m2 = (setZero() <= q);
2829 // where mask is FALSE, swap sign.
2830 csign = csign * blend(SimdDouble(-1.0), one, mask);
2832 x = fma(y, argred0, x);
2833 x = fma(y, argred1, x);
2834 x = fma(y, argred2, x);
2835 x = fma(y, argred3, x);
2838 psin = fma(const_sin5, x2, const_sin4);
2839 psin = fma(psin, x2, const_sin3);
2840 psin = fma(psin, x2, const_sin2);
2841 psin = fma(psin, x2, const_sin1);
2842 psin = fma(psin, x2, const_sin0);
2843 psin = fma(psin, x2 * x, x);
2845 pcos = fma(const_cos7, x2, const_cos6);
2846 pcos = fma(pcos, x2, const_cos5);
2847 pcos = fma(pcos, x2, const_cos4);
2848 pcos = fma(pcos, x2, const_cos3);
2849 pcos = fma(pcos, x2, const_cos2);
2850 pcos = fms(pcos, x2, half);
2851 pcos = fma(pcos, x2, one);
2853 sss = blend(pcos, psin, mask);
2854 ccc = blend(psin, pcos, mask);
2855 // See comment for GMX_SIMD_HAVE_LOGICAL section above.
2856 # if GMX_SIMD_HAVE_LOGICAL
2857 *sinval = sss ^ ssign;
2858 *cosval = ccc ^ csign;
2860 *sinval = sss * ssign;
2861 *cosval = ccc * csign;
2865 /*! \brief SIMD double sin(x).
2867 * \param x The argument to evaluate sin for
2870 * \attention Do NOT call both sin & cos if you need both results, since each of them
2871 * will then call \ref sincos and waste a factor 2 in performance.
2873 static inline SimdDouble gmx_simdcall sin(SimdDouble x)
2880 /*! \brief SIMD double cos(x).
2882 * \param x The argument to evaluate cos for
2885 * \attention Do NOT call both sin & cos if you need both results, since each of them
2886 * will then call \ref sincos and waste a factor 2 in performance.
2888 static inline SimdDouble gmx_simdcall cos(SimdDouble x)
2895 /*! \brief SIMD double tan(x).
2897 * \param x The argument to evaluate tan for
2900 static inline SimdDouble gmx_simdcall tan(SimdDouble x)
2902 const SimdDouble argred0(-2 * 0.78539816290140151978);
2903 const SimdDouble argred1(-2 * 4.9604678871439933374e-10);
2904 const SimdDouble argred2(-2 * 1.1258708853173288931e-18);
2905 const SimdDouble argred3(-2 * 1.7607799325916000908e-27);
2906 const SimdDouble two_over_pi(2.0 / M_PI);
2907 const SimdDouble CT15(1.01419718511083373224408e-05);
2908 const SimdDouble CT14(-2.59519791585924697698614e-05);
2909 const SimdDouble CT13(5.23388081915899855325186e-05);
2910 const SimdDouble CT12(-3.05033014433946488225616e-05);
2911 const SimdDouble CT11(7.14707504084242744267497e-05);
2912 const SimdDouble CT10(8.09674518280159187045078e-05);
2913 const SimdDouble CT9(0.000244884931879331847054404);
2914 const SimdDouble CT8(0.000588505168743587154904506);
2915 const SimdDouble CT7(0.00145612788922812427978848);
2916 const SimdDouble CT6(0.00359208743836906619142924);
2917 const SimdDouble CT5(0.00886323944362401618113356);
2918 const SimdDouble CT4(0.0218694882853846389592078);
2919 const SimdDouble CT3(0.0539682539781298417636002);
2920 const SimdDouble CT2(0.133333333333125941821962);
2921 const SimdDouble CT1(0.333333333333334980164153);
2922 const SimdDouble minFloat(std::numeric_limits<float>::min());
2924 SimdDouble x2, p, y, z;
2927 # if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
2931 z = x * two_over_pi;
2934 m = cvtIB2B((iy & ione) == ione);
2936 x = fma(y, argred0, x);
2937 x = fma(y, argred1, x);
2938 x = fma(y, argred2, x);
2939 x = fma(y, argred3, x);
2940 x = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), m) ^ x;
2942 const SimdDouble quarter(0.25);
2943 const SimdDouble half(0.5);
2944 const SimdDouble threequarter(0.75);
2945 const SimdDouble minFloat(std::numeric_limits<float>::min());
2947 SimdDBool m1, m2, m3;
2950 z = fma(w, two_over_pi, half);
2954 m1 = (quarter <= q);
2956 m3 = (threequarter <= q);
2959 w = fma(y, argred0, w);
2960 w = fma(y, argred1, w);
2961 w = fma(y, argred2, w);
2962 w = fma(y, argred3, w);
2964 w = blend(w, -w, m);
2965 x = w * copysign(SimdDouble(1.0), x);
2968 p = fma(CT15, x2, CT14);
2969 p = fma(p, x2, CT13);
2970 p = fma(p, x2, CT12);
2971 p = fma(p, x2, CT11);
2972 p = fma(p, x2, CT10);
2973 p = fma(p, x2, CT9);
2974 p = fma(p, x2, CT8);
2975 p = fma(p, x2, CT7);
2976 p = fma(p, x2, CT6);
2977 p = fma(p, x2, CT5);
2978 p = fma(p, x2, CT4);
2979 p = fma(p, x2, CT3);
2980 p = fma(p, x2, CT2);
2981 p = fma(p, x2, CT1);
2982 p = fma(x2, p * x, x);
2984 p = blend(p, maskzInv(p, m && (minFloat < abs(p))), m);
2988 /*! \brief SIMD double asin(x).
2990 * \param x The argument to evaluate asin for
2993 static inline SimdDouble gmx_simdcall asin(SimdDouble x)
2995 // Same algorithm as cephes library
2996 const SimdDouble limit1(0.625);
2997 const SimdDouble limit2(1e-8);
2998 const SimdDouble one(1.0);
2999 const SimdDouble quarterpi(M_PI / 4.0);
3000 const SimdDouble morebits(6.123233995736765886130e-17);
3002 const SimdDouble P5(4.253011369004428248960e-3);
3003 const SimdDouble P4(-6.019598008014123785661e-1);
3004 const SimdDouble P3(5.444622390564711410273e0);
3005 const SimdDouble P2(-1.626247967210700244449e1);
3006 const SimdDouble P1(1.956261983317594739197e1);
3007 const SimdDouble P0(-8.198089802484824371615e0);
3009 const SimdDouble Q4(-1.474091372988853791896e1);
3010 const SimdDouble Q3(7.049610280856842141659e1);
3011 const SimdDouble Q2(-1.471791292232726029859e2);
3012 const SimdDouble Q1(1.395105614657485689735e2);
3013 const SimdDouble Q0(-4.918853881490881290097e1);
3015 const SimdDouble R4(2.967721961301243206100e-3);
3016 const SimdDouble R3(-5.634242780008963776856e-1);
3017 const SimdDouble R2(6.968710824104713396794e0);
3018 const SimdDouble R1(-2.556901049652824852289e1);
3019 const SimdDouble R0(2.853665548261061424989e1);
3021 const SimdDouble S3(-2.194779531642920639778e1);
3022 const SimdDouble S2(1.470656354026814941758e2);
3023 const SimdDouble S1(-3.838770957603691357202e2);
3024 const SimdDouble S0(3.424398657913078477438e2);
3027 SimdDouble zz, ww, z, q, w, zz2, ww2;
3032 SimdDouble nom, denom;
3033 SimdDBool mask, mask2;
3037 mask = (limit1 < xabs);
3045 RA = fma(R4, zz2, R2);
3046 RB = fma(R3, zz2, R1);
3047 RA = fma(RA, zz2, R0);
3048 RA = fma(RB, zz, RA);
3051 SB = fma(S3, zz2, S1);
3053 SA = fma(SA, zz2, S0);
3054 SA = fma(SB, zz, SA);
3057 PA = fma(P5, ww2, P3);
3058 PB = fma(P4, ww2, P2);
3059 PA = fma(PA, ww2, P1);
3060 PB = fma(PB, ww2, P0);
3061 PA = fma(PA, ww, PB);
3064 QB = fma(Q4, ww2, Q2);
3066 QA = fma(QA, ww2, Q1);
3067 QB = fma(QB, ww2, Q0);
3068 QA = fma(QA, ww, QB);
3073 nom = blend(PA, RA, mask);
3074 denom = blend(QA, SA, mask);
3076 mask2 = (limit2 < xabs);
3077 q = nom * maskzInv(denom, mask2);
3082 zz = fms(zz, q, morebits);
3089 z = blend(w, z, mask);
3091 z = blend(xabs, z, mask2);
3098 /*! \brief SIMD double acos(x).
3100 * \param x The argument to evaluate acos for
3103 static inline SimdDouble gmx_simdcall acos(SimdDouble x)
3105 const SimdDouble one(1.0);
3106 const SimdDouble half(0.5);
3107 const SimdDouble quarterpi0(7.85398163397448309616e-1);
3108 const SimdDouble quarterpi1(6.123233995736765886130e-17);
3111 SimdDouble z, z1, z2;
3114 z1 = half * (one - x);
3116 z = blend(x, z1, mask1);
3122 z2 = quarterpi0 - z;
3123 z2 = z2 + quarterpi1;
3124 z2 = z2 + quarterpi0;
3126 z = blend(z2, z1, mask1);
3131 /*! \brief SIMD double asin(x).
3133 * \param x The argument to evaluate atan for
3134 * \result Atan(x), same argument/value range as standard math library.
3136 static inline SimdDouble gmx_simdcall atan(SimdDouble x)
3138 // Same algorithm as cephes library
3139 const SimdDouble limit1(0.66);
3140 const SimdDouble limit2(2.41421356237309504880);
3141 const SimdDouble quarterpi(M_PI / 4.0);
3142 const SimdDouble halfpi(M_PI / 2.0);
3143 const SimdDouble mone(-1.0);
3144 const SimdDouble morebits1(0.5 * 6.123233995736765886130E-17);
3145 const SimdDouble morebits2(6.123233995736765886130E-17);
3147 const SimdDouble P4(-8.750608600031904122785E-1);
3148 const SimdDouble P3(-1.615753718733365076637E1);
3149 const SimdDouble P2(-7.500855792314704667340E1);
3150 const SimdDouble P1(-1.228866684490136173410E2);
3151 const SimdDouble P0(-6.485021904942025371773E1);
3153 const SimdDouble Q4(2.485846490142306297962E1);
3154 const SimdDouble Q3(1.650270098316988542046E2);
3155 const SimdDouble Q2(4.328810604912902668951E2);
3156 const SimdDouble Q1(4.853903996359136964868E2);
3157 const SimdDouble Q0(1.945506571482613964425E2);
3159 SimdDouble y, xabs, t1, t2;
3161 SimdDouble P_A, P_B, Q_A, Q_B;
3162 SimdDBool mask1, mask2;
3166 mask1 = (limit1 < xabs);
3167 mask2 = (limit2 < xabs);
3169 t1 = (xabs + mone) * maskzInv(xabs - mone, mask1);
3170 t2 = mone * maskzInv(xabs, mask2);
3172 y = selectByMask(quarterpi, mask1);
3173 y = blend(y, halfpi, mask2);
3174 xabs = blend(xabs, t1, mask1);
3175 xabs = blend(xabs, t2, mask2);
3180 P_A = fma(P4, z2, P2);
3181 P_B = fma(P3, z2, P1);
3182 P_A = fma(P_A, z2, P0);
3183 P_A = fma(P_B, z, P_A);
3186 Q_B = fma(Q4, z2, Q2);
3188 Q_A = fma(Q_A, z2, Q1);
3189 Q_B = fma(Q_B, z2, Q0);
3190 Q_A = fma(Q_A, z, Q_B);
3194 z = fma(z, xabs, xabs);
3196 t1 = selectByMask(morebits1, mask1);
3197 t1 = blend(t1, morebits2, mask2);
3207 /*! \brief SIMD double atan2(y,x).
3209 * \param y Y component of vector, any quartile
3210 * \param x X component of vector, any quartile
3211 * \result Atan(y,x), same argument/value range as standard math library.
3213 * \note This routine should provide correct results for all finite
3214 * non-zero or positive-zero arguments. However, negative zero arguments will
3215 * be treated as positive zero, which means the return value will deviate from
3216 * the standard math library atan2(y,x) for those cases. That should not be
3217 * of any concern in Gromacs, and in particular it will not affect calculations
3218 * of angles from vectors.
3220 static inline SimdDouble gmx_simdcall atan2(SimdDouble y, SimdDouble x)
3222 const SimdDouble pi(M_PI);
3223 const SimdDouble halfpi(M_PI / 2.0);
3224 const SimdDouble minFloat(std::numeric_limits<float>::min());
3225 SimdDouble xinv, p, aoffset;
3226 SimdDBool mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
3228 mask_xnz = x != setZero();
3229 mask_ynz = y != setZero();
3230 mask_xlt0 = (x < setZero());
3231 mask_ylt0 = (y < setZero());
3233 aoffset = selectByNotMask(halfpi, mask_xnz);
3234 aoffset = selectByMask(aoffset, mask_ynz);
3236 aoffset = blend(aoffset, pi, mask_xlt0);
3237 aoffset = blend(aoffset, -aoffset, mask_ylt0);
3239 xinv = maskzInv(x, mask_xnz && (minFloat <= abs(x)));
3248 /*! \brief Calculate the force correction due to PME analytically in SIMD double.
3250 * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
3251 * interaction distance and beta the ewald splitting parameters.
3252 * \result Correction factor to coulomb force.
3254 * This routine is meant to enable analytical evaluation of the
3255 * direct-space PME electrostatic force to avoid tables. For details, see the
3256 * single precision function.
3258 static inline SimdDouble gmx_simdcall pmeForceCorrection(SimdDouble z2)
3260 const SimdDouble FN10(-8.0072854618360083154e-14);
3261 const SimdDouble FN9(1.1859116242260148027e-11);
3262 const SimdDouble FN8(-8.1490406329798423616e-10);
3263 const SimdDouble FN7(3.4404793543907847655e-8);
3264 const SimdDouble FN6(-9.9471420832602741006e-7);
3265 const SimdDouble FN5(0.000020740315999115847456);
3266 const SimdDouble FN4(-0.00031991745139313364005);
3267 const SimdDouble FN3(0.0035074449373659008203);
3268 const SimdDouble FN2(-0.031750380176100813405);
3269 const SimdDouble FN1(0.13884101728898463426);
3270 const SimdDouble FN0(-0.75225277815249618847);
3272 const SimdDouble FD5(0.000016009278224355026701);
3273 const SimdDouble FD4(0.00051055686934806966046);
3274 const SimdDouble FD3(0.0081803507497974289008);
3275 const SimdDouble FD2(0.077181146026670287235);
3276 const SimdDouble FD1(0.41543303143712535988);
3277 const SimdDouble FD0(1.0);
3280 SimdDouble polyFN0, polyFN1, polyFD0, polyFD1;
3284 polyFD1 = fma(FD5, z4, FD3);
3285 polyFD1 = fma(polyFD1, z4, FD1);
3286 polyFD1 = polyFD1 * z2;
3287 polyFD0 = fma(FD4, z4, FD2);
3288 polyFD0 = fma(polyFD0, z4, FD0);
3289 polyFD0 = polyFD0 + polyFD1;
3291 polyFD0 = inv(polyFD0);
3293 polyFN0 = fma(FN10, z4, FN8);
3294 polyFN0 = fma(polyFN0, z4, FN6);
3295 polyFN0 = fma(polyFN0, z4, FN4);
3296 polyFN0 = fma(polyFN0, z4, FN2);
3297 polyFN0 = fma(polyFN0, z4, FN0);
3298 polyFN1 = fma(FN9, z4, FN7);
3299 polyFN1 = fma(polyFN1, z4, FN5);
3300 polyFN1 = fma(polyFN1, z4, FN3);
3301 polyFN1 = fma(polyFN1, z4, FN1);
3302 polyFN0 = fma(polyFN1, z2, polyFN0);
3305 return polyFN0 * polyFD0;
3309 /*! \brief Calculate the potential correction due to PME analytically in SIMD double.
3311 * \param z2 This should be the value \f$(r \beta)^2\f$, where r is your
3312 * interaction distance and beta the ewald splitting parameters.
3313 * \result Correction factor to coulomb force.
3315 * This routine is meant to enable analytical evaluation of the
3316 * direct-space PME electrostatic potential to avoid tables. For details, see the
3317 * single precision function.
3319 static inline SimdDouble gmx_simdcall pmePotentialCorrection(SimdDouble z2)
3321 const SimdDouble VN9(-9.3723776169321855475e-13);
3322 const SimdDouble VN8(1.2280156762674215741e-10);
3323 const SimdDouble VN7(-7.3562157912251309487e-9);
3324 const SimdDouble VN6(2.6215886208032517509e-7);
3325 const SimdDouble VN5(-4.9532491651265819499e-6);
3326 const SimdDouble VN4(0.00025907400778966060389);
3327 const SimdDouble VN3(0.0010585044856156469792);
3328 const SimdDouble VN2(0.045247661136833092885);
3329 const SimdDouble VN1(0.11643931522926034421);
3330 const SimdDouble VN0(1.1283791671726767970);
3332 const SimdDouble VD5(0.000021784709867336150342);
3333 const SimdDouble VD4(0.00064293662010911388448);
3334 const SimdDouble VD3(0.0096311444822588683504);
3335 const SimdDouble VD2(0.085608012351550627051);
3336 const SimdDouble VD1(0.43652499166614811084);
3337 const SimdDouble VD0(1.0);
3340 SimdDouble polyVN0, polyVN1, polyVD0, polyVD1;
3344 polyVD1 = fma(VD5, z4, VD3);
3345 polyVD0 = fma(VD4, z4, VD2);
3346 polyVD1 = fma(polyVD1, z4, VD1);
3347 polyVD0 = fma(polyVD0, z4, VD0);
3348 polyVD0 = fma(polyVD1, z2, polyVD0);
3350 polyVD0 = inv(polyVD0);
3352 polyVN1 = fma(VN9, z4, VN7);
3353 polyVN0 = fma(VN8, z4, VN6);
3354 polyVN1 = fma(polyVN1, z4, VN5);
3355 polyVN0 = fma(polyVN0, z4, VN4);
3356 polyVN1 = fma(polyVN1, z4, VN3);
3357 polyVN0 = fma(polyVN0, z4, VN2);
3358 polyVN1 = fma(polyVN1, z4, VN1);
3359 polyVN0 = fma(polyVN0, z4, VN0);
3360 polyVN0 = fma(polyVN1, z2, polyVN0);
3362 return polyVN0 * polyVD0;
3368 /*! \name SIMD math functions for double prec. data, single prec. accuracy
3370 * \note In some cases we do not need full double accuracy of individual
3371 * SIMD math functions, although the data is stored in double precision
3372 * SIMD registers. This might be the case for special algorithms, or
3373 * if the architecture does not support single precision.
3374 * Since the full double precision evaluation of math functions
3375 * typically require much more expensive polynomial approximations
3376 * these functions implement the algorithms used in the single precision
3377 * SIMD math functions, but they operate on double precision
3383 /*********************************************************************
3384 * SIMD MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
3385 *********************************************************************/
3387 /*! \brief Calculate 1/sqrt(x) for SIMD double, but in single accuracy.
3389 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
3390 * GMX_FLOAT_MAX, i.e. within the range of single precision.
3391 * For the single precision implementation this is obviously always
3392 * true for positive values, but for double precision it adds an
3393 * extra restriction since the first lookup step might have to be
3394 * performed in single precision on some architectures. Note that the
3395 * responsibility for checking falls on you - this routine does not
3398 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
3400 static inline SimdDouble gmx_simdcall invsqrtSingleAccuracy(SimdDouble x)
3402 SimdDouble lu = rsqrt(x);
3403 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3404 lu = rsqrtIter(lu, x);
3406 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3407 lu = rsqrtIter(lu, x);
3409 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3410 lu = rsqrtIter(lu, x);
3415 /*! \brief 1/sqrt(x) for masked-in entries of SIMD double, but in single accuracy.
3417 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
3418 * Illegal values in the masked-out elements will not lead to
3419 * floating-point exceptions.
3421 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
3422 * GMX_FLOAT_MAX, i.e. within the range of single precision.
3423 * For the single precision implementation this is obviously always
3424 * true for positive values, but for double precision it adds an
3425 * extra restriction since the first lookup step might have to be
3426 * performed in single precision on some architectures. Note that the
3427 * responsibility for checking falls on you - this routine does not
3431 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
3432 * entry was not masked, and 0.0 for masked-out entries.
3434 static inline SimdDouble maskzInvsqrtSingleAccuracy(SimdDouble x, SimdDBool m)
3436 SimdDouble lu = maskzRsqrt(x, m);
3437 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3438 lu = rsqrtIter(lu, x);
3440 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3441 lu = rsqrtIter(lu, x);
3443 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3444 lu = rsqrtIter(lu, x);
3449 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles, but single accuracy.
3451 * \param x0 First set of arguments, x0 must be in single range (see below).
3452 * \param x1 Second set of arguments, x1 must be in single range (see below).
3453 * \param[out] out0 Result 1/sqrt(x0)
3454 * \param[out] out1 Result 1/sqrt(x1)
3456 * In particular for double precision we can sometimes calculate square root
3457 * pairs slightly faster by using single precision until the very last step.
3459 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
3460 * GMX_FLOAT_MAX, i.e. within the range of single precision.
3461 * For the single precision implementation this is obviously always
3462 * true for positive values, but for double precision it adds an
3463 * extra restriction since the first lookup step might have to be
3464 * performed in single precision on some architectures. Note that the
3465 * responsibility for checking falls on you - this routine does not
3468 static inline void gmx_simdcall invsqrtPairSingleAccuracy(SimdDouble x0,
3473 # if GMX_SIMD_HAVE_FLOAT && (GMX_SIMD_FLOAT_WIDTH == 2 * GMX_SIMD_DOUBLE_WIDTH) \
3474 && (GMX_SIMD_RSQRT_BITS < 22)
3475 SimdFloat xf = cvtDD2F(x0, x1);
3476 SimdFloat luf = rsqrt(xf);
3477 SimdDouble lu0, lu1;
3478 // Intermediate target is single - mantissa+1 bits
3479 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3480 luf = rsqrtIter(luf, xf);
3482 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3483 luf = rsqrtIter(luf, xf);
3485 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3486 luf = rsqrtIter(luf, xf);
3488 cvtF2DD(luf, &lu0, &lu1);
3489 // We now have single-precision accuracy values in lu0/lu1
3493 *out0 = invsqrtSingleAccuracy(x0);
3494 *out1 = invsqrtSingleAccuracy(x1);
3498 /*! \brief Calculate 1/x for SIMD double, but in single accuracy.
3500 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
3501 * GMX_FLOAT_MAX, i.e. within the range of single precision.
3502 * For the single precision implementation this is obviously always
3503 * true for positive values, but for double precision it adds an
3504 * extra restriction since the first lookup step might have to be
3505 * performed in single precision on some architectures. Note that the
3506 * responsibility for checking falls on you - this routine does not
3509 * \return 1/x. Result is undefined if your argument was invalid.
3511 static inline SimdDouble gmx_simdcall invSingleAccuracy(SimdDouble x)
3513 SimdDouble lu = rcp(x);
3514 # if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3515 lu = rcpIter(lu, x);
3517 # if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3518 lu = rcpIter(lu, x);
3520 # if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3521 lu = rcpIter(lu, x);
3526 /*! \brief 1/x for masked entries of SIMD double, single accuracy.
3528 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
3529 * GMX_FLOAT_MAX, i.e. within the range of single precision.
3530 * For the single precision implementation this is obviously always
3531 * true for positive values, but for double precision it adds an
3532 * extra restriction since the first lookup step might have to be
3533 * performed in single precision on some architectures. Note that the
3534 * responsibility for checking falls on you - this routine does not
3538 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
3540 static inline SimdDouble gmx_simdcall maskzInvSingleAccuracy(SimdDouble x, SimdDBool m)
3542 SimdDouble lu = maskzRcp(x, m);
3543 # if (GMX_SIMD_RCP_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
3544 lu = rcpIter(lu, x);
3546 # if (GMX_SIMD_RCP_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3547 lu = rcpIter(lu, x);
3549 # if (GMX_SIMD_RCP_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
3550 lu = rcpIter(lu, x);
3556 /*! \brief Calculate sqrt(x) (correct for 0.0) for SIMD double, with single accuracy.
3558 * \copydetails sqrt(SimdFloat)
3560 template<MathOptimization opt = MathOptimization::Safe>
3561 static inline SimdDouble gmx_simdcall sqrtSingleAccuracy(SimdDouble x)
3563 if (opt == MathOptimization::Safe)
3565 SimdDouble res = maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) < x);
3570 return x * invsqrtSingleAccuracy(x);
3574 /*! \brief Cube root for SIMD doubles, single accuracy.
3576 * \param x Argument to calculate cube root of. Can be negative or zero,
3577 * but NaN or Inf values are not supported. Denormal values will
3578 * be treated as 0.0.
3579 * \return Cube root of x.
3581 static inline SimdDouble gmx_simdcall cbrtSingleAccuracy(SimdDouble x)
3583 const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
3584 const SimdDouble minDouble(std::numeric_limits<double>::min());
3585 // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
3586 // Use the divided value as original constant to avoid division warnings.
3587 const std::int32_t offsetDiv3(341);
3588 const SimdDouble c2(-0.191502161678719066);
3589 const SimdDouble c1(0.697570460207922770);
3590 const SimdDouble c0(0.492659620528969547);
3591 const SimdDouble one(1.0);
3592 const SimdDouble two(2.0);
3593 const SimdDouble three(3.0);
3594 const SimdDouble oneThird(1.0 / 3.0);
3595 const SimdDouble cbrt2(1.2599210498948731648);
3596 const SimdDouble sqrCbrt2(1.5874010519681994748);
3598 // See the single precision routines for documentation of the algorithm
3600 SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
3601 SimdDouble xAbs = andNot(signBit, x); // select everthing but the sign bit => abs(x)
3602 SimdDBool xIsNonZero = (minDouble <= xAbs); // treat denormals as 0
3604 SimdDInt32 exponent;
3605 SimdDouble y = frexp(xAbs, &exponent);
3606 SimdDouble z = fma(fma(y, c2, c1), y, c0);
3607 SimdDouble w = z * z * z;
3608 SimdDouble nom = z * fma(two, y, w);
3609 SimdDouble invDenom = inv(fma(two, w, y));
3611 SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
3612 SimdDouble offsetExpDiv3 =
3613 trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
3614 SimdDInt32 expDiv3 = cvtR2I(offsetExpDiv3 - SimdDouble(static_cast<double>(offsetDiv3)));
3615 SimdDouble remainder = offsetExp - offsetExpDiv3 * three;
3616 SimdDouble factor = blend(one, cbrt2, SimdDouble(0.5) < remainder);
3617 factor = blend(factor, sqrCbrt2, SimdDouble(1.5) < remainder);
3618 SimdDouble fraction = (nom * invDenom * factor) ^ xSignBit;
3619 SimdDouble result = selectByMask(ldexp(fraction, expDiv3), xIsNonZero);
3623 /*! \brief Inverse cube root for SIMD doubles, single accuracy.
3625 * \param x Argument to calculate cube root of. Can be positive or
3626 * negative, but the magnitude cannot be lower than
3627 * the smallest normal number.
3628 * \return Cube root of x. Undefined for values that don't
3629 * fulfill the restriction of abs(x) > minDouble.
3631 static inline SimdDouble gmx_simdcall invcbrtSingleAccuracy(SimdDouble x)
3633 const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
3634 // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
3635 // Use the divided value as original constant to avoid division warnings.
3636 const std::int32_t offsetDiv3(341);
3637 const SimdDouble c2(-0.191502161678719066);
3638 const SimdDouble c1(0.697570460207922770);
3639 const SimdDouble c0(0.492659620528969547);
3640 const SimdDouble one(1.0);
3641 const SimdDouble two(2.0);
3642 const SimdDouble three(3.0);
3643 const SimdDouble oneThird(1.0 / 3.0);
3644 const SimdDouble invCbrt2(1.0 / 1.2599210498948731648);
3645 const SimdDouble invSqrCbrt2(1.0F / 1.5874010519681994748);
3647 // See the single precision routines for documentation of the algorithm
3649 SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
3650 SimdDouble xAbs = andNot(signBit, x); // select everthing but the sign bit => abs(x)
3652 SimdDInt32 exponent;
3653 SimdDouble y = frexp(xAbs, &exponent);
3654 SimdDouble z = fma(fma(y, c2, c1), y, c0);
3655 SimdDouble w = z * z * z;
3656 SimdDouble nom = fma(two, w, y);
3657 SimdDouble invDenom = inv(z * fma(two, y, w));
3658 SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(3 * offsetDiv3) + 0.1);
3659 SimdDouble offsetExpDiv3 =
3660 trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
3661 SimdDInt32 expDiv3 = cvtR2I(SimdDouble(static_cast<double>(offsetDiv3)) - offsetExpDiv3);
3662 SimdDouble remainder = offsetExpDiv3 * three - offsetExp;
3663 SimdDouble factor = blend(one, invCbrt2, remainder < SimdDouble(-0.5));
3664 factor = blend(factor, invSqrCbrt2, remainder < SimdDouble(-1.5));
3665 SimdDouble fraction = (nom * invDenom * factor) ^ xSignBit;
3666 SimdDouble result = ldexp(fraction, expDiv3);
3670 /*! \brief SIMD log2(x). Double precision SIMD data, single accuracy.
3672 * \param x Argument, should be >0.
3673 * \result The base 2 logarithm of x. Undefined if argument is invalid.
3675 static inline SimdDouble gmx_simdcall log2SingleAccuracy(SimdDouble x)
3677 # if GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
3678 return log(x) * SimdDouble(std::log2(std::exp(1.0)));
3680 const SimdDouble one(1.0);
3681 const SimdDouble two(2.0);
3682 const SimdDouble sqrt2(std::sqrt(2.0));
3683 const SimdDouble CL9(0.342149508897807708152F);
3684 const SimdDouble CL7(0.411570606888219447939F);
3685 const SimdDouble CL5(0.577085979152320294183F);
3686 const SimdDouble CL3(0.961796550607099898222F);
3687 const SimdDouble CL1(2.885390081777926774009F);
3688 SimdDouble fexp, x2, p;
3692 x = frexp(x, &iexp);
3693 fexp = cvtI2R(iexp);
3696 // Adjust to non-IEEE format for x<sqrt(2): exponent -= 1, mantissa *= 2.0
3697 fexp = fexp - selectByMask(one, mask);
3698 x = x * blend(one, two, mask);
3700 x = (x - one) * invSingleAccuracy(x + one);
3703 p = fma(CL9, x2, CL7);
3704 p = fma(p, x2, CL5);
3705 p = fma(p, x2, CL3);
3706 p = fma(p, x2, CL1);
3707 p = fma(p, x, fexp);
3713 /*! \brief SIMD log(x). Double precision SIMD data, single accuracy.
3715 * \param x Argument, should be >0.
3716 * \result The natural logarithm of x. Undefined if argument is invalid.
3718 static inline SimdDouble gmx_simdcall logSingleAccuracy(SimdDouble x)
3720 # if GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
3723 const SimdDouble one(1.0);
3724 const SimdDouble two(2.0);
3725 const SimdDouble invsqrt2(1.0 / std::sqrt(2.0));
3726 const SimdDouble corr(0.693147180559945286226764);
3727 const SimdDouble CL9(0.2371599674224853515625);
3728 const SimdDouble CL7(0.285279005765914916992188);
3729 const SimdDouble CL5(0.400005519390106201171875);
3730 const SimdDouble CL3(0.666666567325592041015625);
3731 const SimdDouble CL1(2.0);
3732 SimdDouble fexp, x2, p;
3736 x = frexp(x, &iexp);
3737 fexp = cvtI2R(iexp);
3739 mask = x < invsqrt2;
3740 // Adjust to non-IEEE format for x<1/sqrt(2): exponent -= 1, mantissa *= 2.0
3741 fexp = fexp - selectByMask(one, mask);
3742 x = x * blend(one, two, mask);
3744 x = (x - one) * invSingleAccuracy(x + one);
3747 p = fma(CL9, x2, CL7);
3748 p = fma(p, x2, CL5);
3749 p = fma(p, x2, CL3);
3750 p = fma(p, x2, CL1);
3751 p = fma(p, x, corr * fexp);
3757 /*! \brief SIMD 2^x. Double precision SIMD, single accuracy.
3759 * \copydetails exp2(SimdFloat)
3761 template<MathOptimization opt = MathOptimization::Safe>
3762 static inline SimdDouble gmx_simdcall exp2SingleAccuracy(SimdDouble x)
3764 # if GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE
3767 const SimdDouble CC6(0.0001534581200287996416911311);
3768 const SimdDouble CC5(0.001339993121934088894618990);
3769 const SimdDouble CC4(0.009618488957115180159497841);
3770 const SimdDouble CC3(0.05550328776964726865751735);
3771 const SimdDouble CC2(0.2402264689063408646490722);
3772 const SimdDouble CC1(0.6931472057372680777553816);
3773 const SimdDouble one(1.0);
3779 // Large negative values are valid arguments to exp2(), so there are two
3780 // things we need to account for:
3781 // 1. When the exponents reaches -1023, the (biased) exponent field will be
3782 // zero and we can no longer multiply with it. There are special IEEE
3783 // formats to handle this range, but for now we have to accept that
3784 // we cannot handle those arguments. If input value becomes even more
3785 // negative, it will start to loop and we would end up with invalid
3786 // exponents. Thus, we need to limit or mask this.
3787 // 2. For VERY large negative values, we will have problems that the
3788 // subtraction to get the fractional part loses accuracy, and then we
3789 // can end up with overflows in the polynomial.
3791 // For now, we handle this by forwarding the math optimization setting to
3792 // ldexp, where the routine will return zero for very small arguments.
3794 // However, before doing that we need to make sure we do not call cvtR2I
3795 // with an argument that is so negative it cannot be converted to an integer.
3796 if (opt == MathOptimization::Safe)
3798 x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()));
3805 p = fma(CC6, x, CC5);
3811 x = ldexp<opt>(p, ix);
3818 /*! \brief SIMD exp(x). Double precision SIMD, single accuracy.
3820 * \copydetails exp(SimdFloat)
3822 template<MathOptimization opt = MathOptimization::Safe>
3823 static inline SimdDouble gmx_simdcall expSingleAccuracy(SimdDouble x)
3825 # if GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE
3828 const SimdDouble argscale(1.44269504088896341);
3829 // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
3830 const SimdDouble smallArgLimit(-709.0895657128);
3831 const SimdDouble invargscale(-0.69314718055994528623);
3832 const SimdDouble CC4(0.00136324646882712841033936);
3833 const SimdDouble CC3(0.00836596917361021041870117);
3834 const SimdDouble CC2(0.0416710823774337768554688);
3835 const SimdDouble CC1(0.166665524244308471679688);
3836 const SimdDouble CC0(0.499999850988388061523438);
3837 const SimdDouble one(1.0);
3842 // Large negative values are valid arguments to exp2(), so there are two
3843 // things we need to account for:
3844 // 1. When the exponents reaches -1023, the (biased) exponent field will be
3845 // zero and we can no longer multiply with it. There are special IEEE
3846 // formats to handle this range, but for now we have to accept that
3847 // we cannot handle those arguments. If input value becomes even more
3848 // negative, it will start to loop and we would end up with invalid
3849 // exponents. Thus, we need to limit or mask this.
3850 // 2. For VERY large negative values, we will have problems that the
3851 // subtraction to get the fractional part loses accuracy, and then we
3852 // can end up with overflows in the polynomial.
3854 // For now, we handle this by forwarding the math optimization setting to
3855 // ldexp, where the routine will return zero for very small arguments.
3857 // However, before doing that we need to make sure we do not call cvtR2I
3858 // with an argument that is so negative it cannot be converted to an integer
3859 // after being multiplied by argscale.
3861 if (opt == MathOptimization::Safe)
3863 x = max(x, SimdDouble(std::numeric_limits<std::int32_t>::lowest()) / argscale);
3869 intpart = round(y); // use same rounding algorithm here
3871 // Extended precision arithmetics not needed since
3872 // we have double precision and only need single accuracy.
3873 x = fma(invargscale, intpart, x);
3875 p = fma(CC4, x, CC3);
3879 p = fma(x * x, p, x);
3881 x = ldexp<opt>(p, iy);
3886 /*! \brief SIMD pow(x,y). Double precision SIMD data, single accuracy.
3888 * This returns x^y for SIMD values.
3890 * \tparam opt If this is changed from the default (safe) into the unsafe
3891 * option, there are no guarantees about correct results for x==0.
3895 * \param y exponent.
3897 * \result x^y. Overflowing arguments are likely to either return 0 or inf,
3898 * depending on the underlying implementation. If unsafe optimizations
3899 * are enabled, this is also true for x==0.
3901 * \warning You cannot rely on this implementation returning inf for arguments
3902 * that cause overflow. If you have some very large
3903 * values and need to rely on getting a valid numerical output,
3904 * take the minimum of your variable and the largest valid argument
3905 * before calling this routine.
3907 template<MathOptimization opt = MathOptimization::Safe>
3908 static inline SimdDouble gmx_simdcall powSingleAccuracy(SimdDouble x, SimdDouble y)
3912 if (opt == MathOptimization::Safe)
3914 xcorr = max(x, SimdDouble(std::numeric_limits<double>::min()));
3921 SimdDouble result = exp2SingleAccuracy<opt>(y * log2SingleAccuracy(xcorr));
3923 if (opt == MathOptimization::Safe)
3925 // if x==0 and y>0 we explicitly set the result to 0.0
3926 // For any x with y==0, the result will already be 1.0 since we multiply by y (0.0) and call exp().
3927 result = blend(result, setZero(), x == setZero() && setZero() < y);
3933 /*! \brief SIMD erf(x). Double precision SIMD data, single accuracy.
3935 * \param x The value to calculate erf(x) for.
3938 * This routine achieves very close to single precision, but we do not care about
3939 * the last bit or the subnormal result range.
3941 static inline SimdDouble gmx_simdcall erfSingleAccuracy(SimdDouble x)
3943 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
3944 const SimdDouble CA6(7.853861353153693e-5);
3945 const SimdDouble CA5(-8.010193625184903e-4);
3946 const SimdDouble CA4(5.188327685732524e-3);
3947 const SimdDouble CA3(-2.685381193529856e-2);
3948 const SimdDouble CA2(1.128358514861418e-1);
3949 const SimdDouble CA1(-3.761262582423300e-1);
3950 const SimdDouble CA0(1.128379165726710);
3951 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
3952 const SimdDouble CB9(-0.0018629930017603923);
3953 const SimdDouble CB8(0.003909821287598495);
3954 const SimdDouble CB7(-0.0052094582210355615);
3955 const SimdDouble CB6(0.005685614362160572);
3956 const SimdDouble CB5(-0.0025367682853477272);
3957 const SimdDouble CB4(-0.010199799682318782);
3958 const SimdDouble CB3(0.04369575504816542);
3959 const SimdDouble CB2(-0.11884063474674492);
3960 const SimdDouble CB1(0.2732120154030589);
3961 const SimdDouble CB0(0.42758357702025784);
3962 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
3963 const SimdDouble CC10(-0.0445555913112064);
3964 const SimdDouble CC9(0.21376355144663348);
3965 const SimdDouble CC8(-0.3473187200259257);
3966 const SimdDouble CC7(0.016690861551248114);
3967 const SimdDouble CC6(0.7560973182491192);
3968 const SimdDouble CC5(-1.2137903600145787);
3969 const SimdDouble CC4(0.8411872321232948);
3970 const SimdDouble CC3(-0.08670413896296343);
3971 const SimdDouble CC2(-0.27124782687240334);
3972 const SimdDouble CC1(-0.0007502488047806069);
3973 const SimdDouble CC0(0.5642114853803148);
3974 const SimdDouble one(1.0);
3975 const SimdDouble two(2.0);
3977 SimdDouble x2, x4, y;
3978 SimdDouble t, t2, w, w2;
3979 SimdDouble pA0, pA1, pB0, pB1, pC0, pC1;
3981 SimdDouble res_erf, res_erfc, res;
3982 SimdDBool mask, msk_erf;
3988 pA0 = fma(CA6, x4, CA4);
3989 pA1 = fma(CA5, x4, CA3);
3990 pA0 = fma(pA0, x4, CA2);
3991 pA1 = fma(pA1, x4, CA1);
3993 pA0 = fma(pA1, x2, pA0);
3994 // Constant term must come last for precision reasons
4001 msk_erf = (SimdDouble(0.75) <= y);
4002 t = maskzInvSingleAccuracy(y, msk_erf);
4007 expmx2 = expSingleAccuracy(-y * y);
4009 pB1 = fma(CB9, w2, CB7);
4010 pB0 = fma(CB8, w2, CB6);
4011 pB1 = fma(pB1, w2, CB5);
4012 pB0 = fma(pB0, w2, CB4);
4013 pB1 = fma(pB1, w2, CB3);
4014 pB0 = fma(pB0, w2, CB2);
4015 pB1 = fma(pB1, w2, CB1);
4016 pB0 = fma(pB0, w2, CB0);
4017 pB0 = fma(pB1, w, pB0);
4019 pC0 = fma(CC10, t2, CC8);
4020 pC1 = fma(CC9, t2, CC7);
4021 pC0 = fma(pC0, t2, CC6);
4022 pC1 = fma(pC1, t2, CC5);
4023 pC0 = fma(pC0, t2, CC4);
4024 pC1 = fma(pC1, t2, CC3);
4025 pC0 = fma(pC0, t2, CC2);
4026 pC1 = fma(pC1, t2, CC1);
4028 pC0 = fma(pC0, t2, CC0);
4029 pC0 = fma(pC1, t, pC0);
4032 // Select pB0 or pC0 for erfc()
4034 res_erfc = blend(pB0, pC0, mask);
4035 res_erfc = res_erfc * expmx2;
4037 // erfc(x<0) = 2-erfc(|x|)
4038 mask = (x < setZero());
4039 res_erfc = blend(res_erfc, two - res_erfc, mask);
4041 // Select erf() or erfc()
4042 mask = (y < SimdDouble(0.75));
4043 res = blend(one - res_erfc, res_erf, mask);
4048 /*! \brief SIMD erfc(x). Double precision SIMD data, single accuracy.
4050 * \param x The value to calculate erfc(x) for.
4053 * This routine achieves singleprecision (bar the last bit) over most of the
4054 * input range, but for large arguments where the result is getting close
4055 * to the minimum representable numbers we accept slightly larger errors
4056 * (think results that are in the ballpark of 10^-30) since that is not
4059 static inline SimdDouble gmx_simdcall erfcSingleAccuracy(SimdDouble x)
4061 // Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1]
4062 const SimdDouble CA6(7.853861353153693e-5);
4063 const SimdDouble CA5(-8.010193625184903e-4);
4064 const SimdDouble CA4(5.188327685732524e-3);
4065 const SimdDouble CA3(-2.685381193529856e-2);
4066 const SimdDouble CA2(1.128358514861418e-1);
4067 const SimdDouble CA1(-3.761262582423300e-1);
4068 const SimdDouble CA0(1.128379165726710);
4069 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
4070 const SimdDouble CB9(-0.0018629930017603923);
4071 const SimdDouble CB8(0.003909821287598495);
4072 const SimdDouble CB7(-0.0052094582210355615);
4073 const SimdDouble CB6(0.005685614362160572);
4074 const SimdDouble CB5(-0.0025367682853477272);
4075 const SimdDouble CB4(-0.010199799682318782);
4076 const SimdDouble CB3(0.04369575504816542);
4077 const SimdDouble CB2(-0.11884063474674492);
4078 const SimdDouble CB1(0.2732120154030589);
4079 const SimdDouble CB0(0.42758357702025784);
4080 // Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
4081 const SimdDouble CC10(-0.0445555913112064);
4082 const SimdDouble CC9(0.21376355144663348);
4083 const SimdDouble CC8(-0.3473187200259257);
4084 const SimdDouble CC7(0.016690861551248114);
4085 const SimdDouble CC6(0.7560973182491192);
4086 const SimdDouble CC5(-1.2137903600145787);
4087 const SimdDouble CC4(0.8411872321232948);
4088 const SimdDouble CC3(-0.08670413896296343);
4089 const SimdDouble CC2(-0.27124782687240334);
4090 const SimdDouble CC1(-0.0007502488047806069);
4091 const SimdDouble CC0(0.5642114853803148);
4092 const SimdDouble one(1.0);
4093 const SimdDouble two(2.0);
4095 SimdDouble x2, x4, y;
4096 SimdDouble t, t2, w, w2;
4097 SimdDouble pA0, pA1, pB0, pB1, pC0, pC1;
4099 SimdDouble res_erf, res_erfc, res;
4100 SimdDBool mask, msk_erf;
4106 pA0 = fma(CA6, x4, CA4);
4107 pA1 = fma(CA5, x4, CA3);
4108 pA0 = fma(pA0, x4, CA2);
4109 pA1 = fma(pA1, x4, CA1);
4111 pA0 = fma(pA0, x4, pA1);
4112 // Constant term must come last for precision reasons
4119 msk_erf = (SimdDouble(0.75) <= y);
4120 t = maskzInvSingleAccuracy(y, msk_erf);
4125 expmx2 = expSingleAccuracy(-y * y);
4127 pB1 = fma(CB9, w2, CB7);
4128 pB0 = fma(CB8, w2, CB6);
4129 pB1 = fma(pB1, w2, CB5);
4130 pB0 = fma(pB0, w2, CB4);
4131 pB1 = fma(pB1, w2, CB3);
4132 pB0 = fma(pB0, w2, CB2);
4133 pB1 = fma(pB1, w2, CB1);
4134 pB0 = fma(pB0, w2, CB0);
4135 pB0 = fma(pB1, w, pB0);
4137 pC0 = fma(CC10, t2, CC8);
4138 pC1 = fma(CC9, t2, CC7);
4139 pC0 = fma(pC0, t2, CC6);
4140 pC1 = fma(pC1, t2, CC5);
4141 pC0 = fma(pC0, t2, CC4);
4142 pC1 = fma(pC1, t2, CC3);
4143 pC0 = fma(pC0, t2, CC2);
4144 pC1 = fma(pC1, t2, CC1);
4146 pC0 = fma(pC0, t2, CC0);
4147 pC0 = fma(pC1, t, pC0);
4150 // Select pB0 or pC0 for erfc()
4152 res_erfc = blend(pB0, pC0, mask);
4153 res_erfc = res_erfc * expmx2;
4155 // erfc(x<0) = 2-erfc(|x|)
4156 mask = (x < setZero());
4157 res_erfc = blend(res_erfc, two - res_erfc, mask);
4159 // Select erf() or erfc()
4160 mask = (y < SimdDouble(0.75));
4161 res = blend(res_erfc, one - res_erf, mask);
4166 /*! \brief SIMD sin \& cos. Double precision SIMD data, single accuracy.
4168 * \param x The argument to evaluate sin/cos for
4169 * \param[out] sinval Sin(x)
4170 * \param[out] cosval Cos(x)
4172 static inline void gmx_simdcall sinCosSingleAccuracy(SimdDouble x, SimdDouble* sinval, SimdDouble* cosval)
4174 // Constants to subtract Pi/4*x from y while minimizing precision loss
4175 const SimdDouble argred0(2 * 0.78539816290140151978);
4176 const SimdDouble argred1(2 * 4.9604678871439933374e-10);
4177 const SimdDouble argred2(2 * 1.1258708853173288931e-18);
4178 const SimdDouble two_over_pi(2.0 / M_PI);
4179 const SimdDouble const_sin2(-1.9515295891e-4);
4180 const SimdDouble const_sin1(8.3321608736e-3);
4181 const SimdDouble const_sin0(-1.6666654611e-1);
4182 const SimdDouble const_cos2(2.443315711809948e-5);
4183 const SimdDouble const_cos1(-1.388731625493765e-3);
4184 const SimdDouble const_cos0(4.166664568298827e-2);
4186 const SimdDouble half(0.5);
4187 const SimdDouble one(1.0);
4188 SimdDouble ssign, csign;
4189 SimdDouble x2, y, z, psin, pcos, sss, ccc;
4192 # if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
4193 const SimdDInt32 ione(1);
4194 const SimdDInt32 itwo(2);
4197 z = x * two_over_pi;
4201 mask = cvtIB2B((iy & ione) == setZero());
4202 ssign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B((iy & itwo) == itwo));
4203 csign = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), cvtIB2B(((iy + ione) & itwo) == itwo));
4205 const SimdDouble quarter(0.25);
4206 const SimdDouble minusquarter(-0.25);
4208 SimdDBool m1, m2, m3;
4210 /* The most obvious way to find the arguments quadrant in the unit circle
4211 * to calculate the sign is to use integer arithmetic, but that is not
4212 * present in all SIMD implementations. As an alternative, we have devised a
4213 * pure floating-point algorithm that uses truncation for argument reduction
4214 * so that we get a new value 0<=q<1 over the unit circle, and then
4215 * do floating-point comparisons with fractions. This is likely to be
4216 * slightly slower (~10%) due to the longer latencies of floating-point, so
4217 * we only use it when integer SIMD arithmetic is not present.
4221 // It is critical that half-way cases are rounded down
4222 z = fma(x, two_over_pi, half);
4226 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
4227 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
4228 * This removes the 2*Pi periodicity without using any integer arithmetic.
4229 * First check if y had the value 2 or 3, set csign if true.
4232 /* If we have logical operations we can work directly on the signbit, which
4233 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
4234 * Thus, if you are altering defines to debug alternative code paths, the
4235 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
4236 * active or inactive - you will get errors if only one is used.
4238 # if GMX_SIMD_HAVE_LOGICAL
4239 ssign = ssign & SimdDouble(GMX_DOUBLE_NEGZERO);
4240 csign = andNot(q, SimdDouble(GMX_DOUBLE_NEGZERO));
4241 ssign = ssign ^ csign;
4243 ssign = copysign(SimdDouble(1.0), ssign);
4244 csign = copysign(SimdDouble(1.0), q);
4246 ssign = ssign * csign; // swap ssign if csign was set.
4248 // Check if y had value 1 or 3 (remember we subtracted 0.5 from q)
4249 m1 = (q < minusquarter);
4250 m2 = (setZero() <= q);
4254 // where mask is FALSE, swap sign.
4255 csign = csign * blend(SimdDouble(-1.0), one, mask);
4257 x = fnma(y, argred0, x);
4258 x = fnma(y, argred1, x);
4259 x = fnma(y, argred2, x);
4262 psin = fma(const_sin2, x2, const_sin1);
4263 psin = fma(psin, x2, const_sin0);
4264 psin = fma(psin, x * x2, x);
4265 pcos = fma(const_cos2, x2, const_cos1);
4266 pcos = fma(pcos, x2, const_cos0);
4267 pcos = fms(pcos, x2, half);
4268 pcos = fma(pcos, x2, one);
4270 sss = blend(pcos, psin, mask);
4271 ccc = blend(psin, pcos, mask);
4272 // See comment for GMX_SIMD_HAVE_LOGICAL section above.
4273 # if GMX_SIMD_HAVE_LOGICAL
4274 *sinval = sss ^ ssign;
4275 *cosval = ccc ^ csign;
4277 *sinval = sss * ssign;
4278 *cosval = ccc * csign;
4282 /*! \brief SIMD sin(x). Double precision SIMD data, single accuracy.
4284 * \param x The argument to evaluate sin for
4287 * \attention Do NOT call both sin & cos if you need both results, since each of them
4288 * will then call \ref sincos and waste a factor 2 in performance.
4290 static inline SimdDouble gmx_simdcall sinSingleAccuracy(SimdDouble x)
4293 sinCosSingleAccuracy(x, &s, &c);
4297 /*! \brief SIMD cos(x). Double precision SIMD data, single accuracy.
4299 * \param x The argument to evaluate cos for
4302 * \attention Do NOT call both sin & cos if you need both results, since each of them
4303 * will then call \ref sincos and waste a factor 2 in performance.
4305 static inline SimdDouble gmx_simdcall cosSingleAccuracy(SimdDouble x)
4308 sinCosSingleAccuracy(x, &s, &c);
4312 /*! \brief SIMD tan(x). Double precision SIMD data, single accuracy.
4314 * \param x The argument to evaluate tan for
4317 static inline SimdDouble gmx_simdcall tanSingleAccuracy(SimdDouble x)
4319 const SimdDouble argred0(2 * 0.78539816290140151978);
4320 const SimdDouble argred1(2 * 4.9604678871439933374e-10);
4321 const SimdDouble argred2(2 * 1.1258708853173288931e-18);
4322 const SimdDouble two_over_pi(2.0 / M_PI);
4323 const SimdDouble CT6(0.009498288995810566122993911);
4324 const SimdDouble CT5(0.002895755790837379295226923);
4325 const SimdDouble CT4(0.02460087336161924491836265);
4326 const SimdDouble CT3(0.05334912882656359828045988);
4327 const SimdDouble CT2(0.1333989091464957704418495);
4328 const SimdDouble CT1(0.3333307599244198227797507);
4330 SimdDouble x2, p, y, z;
4333 # if GMX_SIMD_HAVE_DINT32_ARITHMETICS && GMX_SIMD_HAVE_LOGICAL
4337 z = x * two_over_pi;
4340 mask = cvtIB2B((iy & ione) == ione);
4342 x = fnma(y, argred0, x);
4343 x = fnma(y, argred1, x);
4344 x = fnma(y, argred2, x);
4345 x = selectByMask(SimdDouble(GMX_DOUBLE_NEGZERO), mask) ^ x;
4347 const SimdDouble quarter(0.25);
4348 const SimdDouble half(0.5);
4349 const SimdDouble threequarter(0.75);
4351 SimdDBool m1, m2, m3;
4354 z = fma(w, two_over_pi, half);
4358 m1 = (quarter <= q);
4360 m3 = (threequarter <= q);
4363 w = fnma(y, argred0, w);
4364 w = fnma(y, argred1, w);
4365 w = fnma(y, argred2, w);
4367 w = blend(w, -w, mask);
4368 x = w * copysign(SimdDouble(1.0), x);
4371 p = fma(CT6, x2, CT5);
4372 p = fma(p, x2, CT4);
4373 p = fma(p, x2, CT3);
4374 p = fma(p, x2, CT2);
4375 p = fma(p, x2, CT1);
4376 p = fma(x2, p * x, x);
4378 p = blend(p, maskzInvSingleAccuracy(p, mask), mask);
4382 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
4384 * \param x The argument to evaluate asin for
4387 static inline SimdDouble gmx_simdcall asinSingleAccuracy(SimdDouble x)
4389 const SimdDouble limitlow(1e-4);
4390 const SimdDouble half(0.5);
4391 const SimdDouble one(1.0);
4392 const SimdDouble halfpi(M_PI / 2.0);
4393 const SimdDouble CC5(4.2163199048E-2);
4394 const SimdDouble CC4(2.4181311049E-2);
4395 const SimdDouble CC3(4.5470025998E-2);
4396 const SimdDouble CC2(7.4953002686E-2);
4397 const SimdDouble CC1(1.6666752422E-1);
4399 SimdDouble z, z1, z2, q, q1, q2;
4401 SimdDBool mask, mask2;
4404 mask = (half < xabs);
4405 z1 = half * (one - xabs);
4406 mask2 = (xabs < one);
4407 q1 = z1 * maskzInvsqrtSingleAccuracy(z1, mask2);
4410 z = blend(z2, z1, mask);
4411 q = blend(q2, q1, mask);
4414 pA = fma(CC5, z2, CC3);
4415 pB = fma(CC4, z2, CC2);
4416 pA = fma(pA, z2, CC1);
4418 z = fma(pB, z2, pA);
4422 z = blend(z, q2, mask);
4424 mask = (limitlow < xabs);
4425 z = blend(xabs, z, mask);
4431 /*! \brief SIMD acos(x). Double precision SIMD data, single accuracy.
4433 * \param x The argument to evaluate acos for
4436 static inline SimdDouble gmx_simdcall acosSingleAccuracy(SimdDouble x)
4438 const SimdDouble one(1.0);
4439 const SimdDouble half(0.5);
4440 const SimdDouble pi(M_PI);
4441 const SimdDouble halfpi(M_PI / 2.0);
4443 SimdDouble z, z1, z2, z3;
4444 SimdDBool mask1, mask2, mask3;
4447 mask1 = (half < xabs);
4448 mask2 = (setZero() < x);
4450 z = half * (one - xabs);
4451 mask3 = (xabs < one);
4452 z = z * maskzInvsqrtSingleAccuracy(z, mask3);
4453 z = blend(x, z, mask1);
4454 z = asinSingleAccuracy(z);
4459 z = blend(z1, z2, mask2);
4460 z = blend(z3, z, mask1);
4465 /*! \brief SIMD asin(x). Double precision SIMD data, single accuracy.
4467 * \param x The argument to evaluate atan for
4468 * \result Atan(x), same argument/value range as standard math library.
4470 static inline SimdDouble gmx_simdcall atanSingleAccuracy(SimdDouble x)
4472 const SimdDouble halfpi(M_PI / 2);
4473 const SimdDouble CA17(0.002823638962581753730774);
4474 const SimdDouble CA15(-0.01595690287649631500244);
4475 const SimdDouble CA13(0.04250498861074447631836);
4476 const SimdDouble CA11(-0.07489009201526641845703);
4477 const SimdDouble CA9(0.1063479334115982055664);
4478 const SimdDouble CA7(-0.1420273631811141967773);
4479 const SimdDouble CA5(0.1999269574880599975585);
4480 const SimdDouble CA3(-0.3333310186862945556640);
4481 SimdDouble x2, x3, x4, pA, pB;
4482 SimdDBool mask, mask2;
4484 mask = (x < setZero());
4486 mask2 = (SimdDouble(1.0) < x);
4487 x = blend(x, maskzInvSingleAccuracy(x, mask2), mask2);
4492 pA = fma(CA17, x4, CA13);
4493 pB = fma(CA15, x4, CA11);
4494 pA = fma(pA, x4, CA9);
4495 pB = fma(pB, x4, CA7);
4496 pA = fma(pA, x4, CA5);
4497 pB = fma(pB, x4, CA3);
4498 pA = fma(pA, x2, pB);
4499 pA = fma(pA, x3, x);
4501 pA = blend(pA, halfpi - pA, mask2);
4502 pA = blend(pA, -pA, mask);
4507 /*! \brief SIMD atan2(y,x). Double precision SIMD data, single accuracy.
4509 * \param y Y component of vector, any quartile
4510 * \param x X component of vector, any quartile
4511 * \result Atan(y,x), same argument/value range as standard math library.
4513 * \note This routine should provide correct results for all finite
4514 * non-zero or positive-zero arguments. However, negative zero arguments will
4515 * be treated as positive zero, which means the return value will deviate from
4516 * the standard math library atan2(y,x) for those cases. That should not be
4517 * of any concern in Gromacs, and in particular it will not affect calculations
4518 * of angles from vectors.
4520 static inline SimdDouble gmx_simdcall atan2SingleAccuracy(SimdDouble y, SimdDouble x)
4522 const SimdDouble pi(M_PI);
4523 const SimdDouble halfpi(M_PI / 2.0);
4524 SimdDouble xinv, p, aoffset;
4525 SimdDBool mask_xnz, mask_ynz, mask_xlt0, mask_ylt0;
4527 mask_xnz = x != setZero();
4528 mask_ynz = y != setZero();
4529 mask_xlt0 = (x < setZero());
4530 mask_ylt0 = (y < setZero());
4532 aoffset = selectByNotMask(halfpi, mask_xnz);
4533 aoffset = selectByMask(aoffset, mask_ynz);
4535 aoffset = blend(aoffset, pi, mask_xlt0);
4536 aoffset = blend(aoffset, -aoffset, mask_ylt0);
4538 xinv = maskzInvSingleAccuracy(x, mask_xnz);
4540 p = atanSingleAccuracy(p);
4546 /*! \brief Analytical PME force correction, double SIMD data, single accuracy.
4548 * \param z2 \f$(r \beta)^2\f$ - see below for details.
4549 * \result Correction factor to coulomb force - see below for details.
4551 * This routine is meant to enable analytical evaluation of the
4552 * direct-space PME electrostatic force to avoid tables.
4554 * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
4555 * are some problems evaluating that:
4557 * First, the error function is difficult (read: expensive) to
4558 * approxmiate accurately for intermediate to large arguments, and
4559 * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
4560 * Second, we now try to avoid calculating potentials in Gromacs but
4561 * use forces directly.
4563 * We can simply things slight by noting that the PME part is really
4564 * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
4566 * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
4568 * The first term we already have from the inverse square root, so
4569 * that we can leave out of this routine.
4571 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
4572 * the argument \f$beta r\f$ will be in the range 0.15 to ~4. Use your
4573 * favorite plotting program to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is
4576 * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
4577 * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
4578 * then only use even powers. This is another minor optimization, since
4579 * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
4580 * the vector between the two atoms to get the vectorial force. The
4581 * fastest flops are the ones we can avoid calculating!
4583 * So, here's how it should be used:
4585 * 1. Calculate \f$r^2\f$.
4586 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
4587 * 3. Evaluate this routine with \f$z^2\f$ as the argument.
4588 * 4. The return value is the expression:
4591 * \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
4594 * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
4597 * \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
4600 * or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
4603 * \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
4606 * With a bit of math exercise you should be able to confirm that
4610 * \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
4613 * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
4614 * and you have your force (divided by \f$r\f$). A final multiplication
4615 * with the vector connecting the two particles and you have your
4616 * vectorial force to add to the particles.
4618 * This approximation achieves an accuracy slightly lower than 1e-6; when
4619 * added to \f$1/r\f$ the error will be insignificant.
4622 static inline SimdDouble gmx_simdcall pmeForceCorrectionSingleAccuracy(SimdDouble z2)
4624 const SimdDouble FN6(-1.7357322914161492954e-8);
4625 const SimdDouble FN5(1.4703624142580877519e-6);
4626 const SimdDouble FN4(-0.000053401640219807709149);
4627 const SimdDouble FN3(0.0010054721316683106153);
4628 const SimdDouble FN2(-0.019278317264888380590);
4629 const SimdDouble FN1(0.069670166153766424023);
4630 const SimdDouble FN0(-0.75225204789749321333);
4632 const SimdDouble FD4(0.0011193462567257629232);
4633 const SimdDouble FD3(0.014866955030185295499);
4634 const SimdDouble FD2(0.11583842382862377919);
4635 const SimdDouble FD1(0.50736591960530292870);
4636 const SimdDouble FD0(1.0);
4639 SimdDouble polyFN0, polyFN1, polyFD0, polyFD1;
4643 polyFD0 = fma(FD4, z4, FD2);
4644 polyFD1 = fma(FD3, z4, FD1);
4645 polyFD0 = fma(polyFD0, z4, FD0);
4646 polyFD0 = fma(polyFD1, z2, polyFD0);
4648 polyFD0 = invSingleAccuracy(polyFD0);
4650 polyFN0 = fma(FN6, z4, FN4);
4651 polyFN1 = fma(FN5, z4, FN3);
4652 polyFN0 = fma(polyFN0, z4, FN2);
4653 polyFN1 = fma(polyFN1, z4, FN1);
4654 polyFN0 = fma(polyFN0, z4, FN0);
4655 polyFN0 = fma(polyFN1, z2, polyFN0);
4657 return polyFN0 * polyFD0;
4661 /*! \brief Analytical PME potential correction, double SIMD data, single accuracy.
4663 * \param z2 \f$(r \beta)^2\f$ - see below for details.
4664 * \result Correction factor to coulomb potential - see below for details.
4666 * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
4667 * as the input argument.
4669 * Here's how it should be used:
4671 * 1. Calculate \f$r^2\f$.
4672 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
4673 * 3. Evaluate this routine with z^2 as the argument.
4674 * 4. The return value is the expression:
4677 * \frac{\mbox{erf}(z)}{z}
4680 * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
4683 * \frac{\mbox{erf}(r \beta)}{r}
4686 * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
4687 * and you have your potential.
4689 * This approximation achieves an accuracy slightly lower than 1e-6; when
4690 * added to \f$1/r\f$ the error will be insignificant.
4692 static inline SimdDouble gmx_simdcall pmePotentialCorrectionSingleAccuracy(SimdDouble z2)
4694 const SimdDouble VN6(1.9296833005951166339e-8);
4695 const SimdDouble VN5(-1.4213390571557850962e-6);
4696 const SimdDouble VN4(0.000041603292906656984871);
4697 const SimdDouble VN3(-0.00013134036773265025626);
4698 const SimdDouble VN2(0.038657983986041781264);
4699 const SimdDouble VN1(0.11285044772717598220);
4700 const SimdDouble VN0(1.1283802385263030286);
4702 const SimdDouble VD3(0.0066752224023576045451);
4703 const SimdDouble VD2(0.078647795836373922256);
4704 const SimdDouble VD1(0.43336185284710920150);
4705 const SimdDouble VD0(1.0);
4708 SimdDouble polyVN0, polyVN1, polyVD0, polyVD1;
4712 polyVD1 = fma(VD3, z4, VD1);
4713 polyVD0 = fma(VD2, z4, VD0);
4714 polyVD0 = fma(polyVD1, z2, polyVD0);
4716 polyVD0 = invSingleAccuracy(polyVD0);
4718 polyVN0 = fma(VN6, z4, VN4);
4719 polyVN1 = fma(VN5, z4, VN3);
4720 polyVN0 = fma(polyVN0, z4, VN2);
4721 polyVN1 = fma(polyVN1, z4, VN1);
4722 polyVN0 = fma(polyVN0, z4, VN0);
4723 polyVN0 = fma(polyVN1, z2, polyVN0);
4725 return polyVN0 * polyVD0;
4731 /*! \name SIMD4 math functions
4733 * \note Only a subset of the math functions are implemented for SIMD4.
4738 # if GMX_SIMD4_HAVE_FLOAT
4740 /*************************************************************************
4741 * SINGLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4742 *************************************************************************/
4744 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 float.
4746 * This is a low-level routine that should only be used by SIMD math routine
4747 * that evaluates the inverse square root.
4749 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
4750 * \param x The reference (starting) value x for which we want 1/sqrt(x).
4751 * \return An improved approximation with roughly twice as many bits of accuracy.
4753 static inline Simd4Float gmx_simdcall rsqrtIter(Simd4Float lu, Simd4Float x)
4755 Simd4Float tmp1 = x * lu;
4756 Simd4Float tmp2 = Simd4Float(-0.5F) * lu;
4757 tmp1 = fma(tmp1, lu, Simd4Float(-3.0F));
4761 /*! \brief Calculate 1/sqrt(x) for SIMD4 float.
4763 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4764 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4765 * For the single precision implementation this is obviously always
4766 * true for positive values, but for double precision it adds an
4767 * extra restriction since the first lookup step might have to be
4768 * performed in single precision on some architectures. Note that the
4769 * responsibility for checking falls on you - this routine does not
4771 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4773 static inline Simd4Float gmx_simdcall invsqrt(Simd4Float x)
4775 Simd4Float lu = rsqrt(x);
4776 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4777 lu = rsqrtIter(lu, x);
4779 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4780 lu = rsqrtIter(lu, x);
4782 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4783 lu = rsqrtIter(lu, x);
4789 # endif // GMX_SIMD4_HAVE_FLOAT
4792 # if GMX_SIMD4_HAVE_DOUBLE
4793 /*************************************************************************
4794 * DOUBLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
4795 *************************************************************************/
4797 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 double.
4799 * This is a low-level routine that should only be used by SIMD math routine
4800 * that evaluates the inverse square root.
4802 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
4803 * \param x The reference (starting) value x for which we want 1/sqrt(x).
4804 * \return An improved approximation with roughly twice as many bits of accuracy.
4806 static inline Simd4Double gmx_simdcall rsqrtIter(Simd4Double lu, Simd4Double x)
4808 Simd4Double tmp1 = x * lu;
4809 Simd4Double tmp2 = Simd4Double(-0.5F) * lu;
4810 tmp1 = fma(tmp1, lu, Simd4Double(-3.0F));
4814 /*! \brief Calculate 1/sqrt(x) for SIMD4 double.
4816 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4817 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4818 * For the single precision implementation this is obviously always
4819 * true for positive values, but for double precision it adds an
4820 * extra restriction since the first lookup step might have to be
4821 * performed in single precision on some architectures. Note that the
4822 * responsibility for checking falls on you - this routine does not
4824 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4826 static inline Simd4Double gmx_simdcall invsqrt(Simd4Double x)
4828 Simd4Double lu = rsqrt(x);
4829 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4830 lu = rsqrtIter(lu, x);
4832 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4833 lu = rsqrtIter(lu, x);
4835 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4836 lu = rsqrtIter(lu, x);
4838 # if (GMX_SIMD_RSQRT_BITS * 8 < GMX_SIMD_ACCURACY_BITS_DOUBLE)
4839 lu = rsqrtIter(lu, x);
4845 /**********************************************************************
4846 * SIMD4 MATH FUNCTIONS WITH DOUBLE PREC. DATA, SINGLE PREC. ACCURACY *
4847 **********************************************************************/
4849 /*! \brief Calculate 1/sqrt(x) for SIMD4 double, but in single accuracy.
4851 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4852 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4853 * For the single precision implementation this is obviously always
4854 * true for positive values, but for double precision it adds an
4855 * extra restriction since the first lookup step might have to be
4856 * performed in single precision on some architectures. Note that the
4857 * responsibility for checking falls on you - this routine does not
4859 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4861 static inline Simd4Double gmx_simdcall invsqrtSingleAccuracy(Simd4Double x)
4863 Simd4Double lu = rsqrt(x);
4864 # if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_ACCURACY_BITS_SINGLE)
4865 lu = rsqrtIter(lu, x);
4867 # if (GMX_SIMD_RSQRT_BITS * 2 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4868 lu = rsqrtIter(lu, x);
4870 # if (GMX_SIMD_RSQRT_BITS * 4 < GMX_SIMD_ACCURACY_BITS_SINGLE)
4871 lu = rsqrtIter(lu, x);
4877 # endif // GMX_SIMD4_HAVE_DOUBLE
4881 # if GMX_SIMD_HAVE_FLOAT
4882 /*! \brief Calculate 1/sqrt(x) for SIMD float, only targeting single accuracy.
4884 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4885 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4886 * For the single precision implementation this is obviously always
4887 * true for positive values, but for double precision it adds an
4888 * extra restriction since the first lookup step might have to be
4889 * performed in single precision on some architectures. Note that the
4890 * responsibility for checking falls on you - this routine does not
4892 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
4894 static inline SimdFloat gmx_simdcall invsqrtSingleAccuracy(SimdFloat x)
4899 /*! \brief Calculate 1/sqrt(x) for masked SIMD floats, only targeting single accuracy.
4901 * This routine only evaluates 1/sqrt(x) for elements for which mask is true.
4902 * Illegal values in the masked-out elements will not lead to
4903 * floating-point exceptions.
4905 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
4906 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4907 * For the single precision implementation this is obviously always
4908 * true for positive values, but for double precision it adds an
4909 * extra restriction since the first lookup step might have to be
4910 * performed in single precision on some architectures. Note that the
4911 * responsibility for checking falls on you - this routine does not
4914 * \return 1/sqrt(x). Result is undefined if your argument was invalid or
4915 * entry was not masked, and 0.0 for masked-out entries.
4917 static inline SimdFloat maskzInvsqrtSingleAccuracy(SimdFloat x, SimdFBool m)
4919 return maskzInvsqrt(x, m);
4922 /*! \brief Calculate 1/sqrt(x) for two SIMD floats, only targeting single accuracy.
4924 * \param x0 First set of arguments, x0 must be in single range (see below).
4925 * \param x1 Second set of arguments, x1 must be in single range (see below).
4926 * \param[out] out0 Result 1/sqrt(x0)
4927 * \param[out] out1 Result 1/sqrt(x1)
4929 * In particular for double precision we can sometimes calculate square root
4930 * pairs slightly faster by using single precision until the very last step.
4932 * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
4933 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4934 * For the single precision implementation this is obviously always
4935 * true for positive values, but for double precision it adds an
4936 * extra restriction since the first lookup step might have to be
4937 * performed in single precision on some architectures. Note that the
4938 * responsibility for checking falls on you - this routine does not
4941 static inline void gmx_simdcall invsqrtPairSingleAccuracy(SimdFloat x0, SimdFloat x1, SimdFloat* out0, SimdFloat* out1)
4943 return invsqrtPair(x0, x1, out0, out1);
4946 /*! \brief Calculate 1/x for SIMD float, only targeting single accuracy.
4948 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
4949 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4950 * For the single precision implementation this is obviously always
4951 * true for positive values, but for double precision it adds an
4952 * extra restriction since the first lookup step might have to be
4953 * performed in single precision on some architectures. Note that the
4954 * responsibility for checking falls on you - this routine does not
4956 * \return 1/x. Result is undefined if your argument was invalid.
4958 static inline SimdFloat gmx_simdcall invSingleAccuracy(SimdFloat x)
4964 /*! \brief Calculate 1/x for masked SIMD floats, only targeting single accuracy.
4966 * \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
4967 * GMX_FLOAT_MAX, i.e. within the range of single precision.
4968 * For the single precision implementation this is obviously always
4969 * true for positive values, but for double precision it adds an
4970 * extra restriction since the first lookup step might have to be
4971 * performed in single precision on some architectures. Note that the
4972 * responsibility for checking falls on you - this routine does not
4975 * \return 1/x for elements where m is true, or 0.0 for masked-out entries.
4977 static inline SimdFloat maskzInvSingleAccuracy(SimdFloat x, SimdFBool m)
4979 return maskzInv(x, m);
4982 /*! \brief Calculate sqrt(x) for SIMD float, always targeting single accuracy.
4984 * \copydetails sqrt(SimdFloat)
4986 template<MathOptimization opt = MathOptimization::Safe>
4987 static inline SimdFloat gmx_simdcall sqrtSingleAccuracy(SimdFloat x)
4989 return sqrt<opt>(x);
4992 /*! \brief Calculate cbrt(x) for SIMD float, always targeting single accuracy.
4994 * \copydetails cbrt(SimdFloat)
4996 static inline SimdFloat gmx_simdcall cbrtSingleAccuracy(SimdFloat x)
5001 /*! \brief Calculate 1/cbrt(x) for SIMD float, always targeting single accuracy.
5003 * \copydetails cbrt(SimdFloat)
5005 static inline SimdFloat gmx_simdcall invcbrtSingleAccuracy(SimdFloat x)
5010 /*! \brief SIMD float log2(x), only targeting single accuracy. This is the base-2 logarithm.
5012 * \param x Argument, should be >0.
5013 * \result The base-2 logarithm of x. Undefined if argument is invalid.
5015 static inline SimdFloat gmx_simdcall log2SingleAccuracy(SimdFloat x)
5020 /*! \brief SIMD float log(x), only targeting single accuracy. This is the natural logarithm.
5022 * \param x Argument, should be >0.
5023 * \result The natural logarithm of x. Undefined if argument is invalid.
5025 static inline SimdFloat gmx_simdcall logSingleAccuracy(SimdFloat x)
5030 /*! \brief SIMD float 2^x, only targeting single accuracy.
5032 * \copydetails exp2(SimdFloat)
5034 template<MathOptimization opt = MathOptimization::Safe>
5035 static inline SimdFloat gmx_simdcall exp2SingleAccuracy(SimdFloat x)
5037 return exp2<opt>(x);
5040 /*! \brief SIMD float e^x, only targeting single accuracy.
5042 * \copydetails exp(SimdFloat)
5044 template<MathOptimization opt = MathOptimization::Safe>
5045 static inline SimdFloat gmx_simdcall expSingleAccuracy(SimdFloat x)
5050 /*! \brief SIMD pow(x,y), only targeting single accuracy.
5052 * \copydetails pow(SimdFloat)
5054 template<MathOptimization opt = MathOptimization::Safe>
5055 static inline SimdFloat gmx_simdcall powSingleAccuracy(SimdFloat x, SimdFloat y)
5057 return pow<opt>(x, y);
5060 /*! \brief SIMD float erf(x), only targeting single accuracy.
5062 * \param x The value to calculate erf(x) for.
5065 * This routine achieves very close to single precision, but we do not care about
5066 * the last bit or the subnormal result range.
5068 static inline SimdFloat gmx_simdcall erfSingleAccuracy(SimdFloat x)
5073 /*! \brief SIMD float erfc(x), only targeting single accuracy.
5075 * \param x The value to calculate erfc(x) for.
5078 * This routine achieves singleprecision (bar the last bit) over most of the
5079 * input range, but for large arguments where the result is getting close
5080 * to the minimum representable numbers we accept slightly larger errors
5081 * (think results that are in the ballpark of 10^-30) since that is not
5084 static inline SimdFloat gmx_simdcall erfcSingleAccuracy(SimdFloat x)
5089 /*! \brief SIMD float sin \& cos, only targeting single accuracy.
5091 * \param x The argument to evaluate sin/cos for
5092 * \param[out] sinval Sin(x)
5093 * \param[out] cosval Cos(x)
5095 static inline void gmx_simdcall sinCosSingleAccuracy(SimdFloat x, SimdFloat* sinval, SimdFloat* cosval)
5097 sincos(x, sinval, cosval);
5100 /*! \brief SIMD float sin(x), only targeting single accuracy.
5102 * \param x The argument to evaluate sin for
5105 * \attention Do NOT call both sin & cos if you need both results, since each of them
5106 * will then call \ref sincos and waste a factor 2 in performance.
5108 static inline SimdFloat gmx_simdcall sinSingleAccuracy(SimdFloat x)
5113 /*! \brief SIMD float cos(x), only targeting single accuracy.
5115 * \param x The argument to evaluate cos for
5118 * \attention Do NOT call both sin & cos if you need both results, since each of them
5119 * will then call \ref sincos and waste a factor 2 in performance.
5121 static inline SimdFloat gmx_simdcall cosSingleAccuracy(SimdFloat x)
5126 /*! \brief SIMD float tan(x), only targeting single accuracy.
5128 * \param x The argument to evaluate tan for
5131 static inline SimdFloat gmx_simdcall tanSingleAccuracy(SimdFloat x)
5136 /*! \brief SIMD float asin(x), only targeting single accuracy.
5138 * \param x The argument to evaluate asin for
5141 static inline SimdFloat gmx_simdcall asinSingleAccuracy(SimdFloat x)
5146 /*! \brief SIMD float acos(x), only targeting single accuracy.
5148 * \param x The argument to evaluate acos for
5151 static inline SimdFloat gmx_simdcall acosSingleAccuracy(SimdFloat x)
5156 /*! \brief SIMD float atan(x), only targeting single accuracy.
5158 * \param x The argument to evaluate atan for
5159 * \result Atan(x), same argument/value range as standard math library.
5161 static inline SimdFloat gmx_simdcall atanSingleAccuracy(SimdFloat x)
5166 /*! \brief SIMD float atan2(y,x), only targeting single accuracy.
5168 * \param y Y component of vector, any quartile
5169 * \param x X component of vector, any quartile
5170 * \result Atan(y,x), same argument/value range as standard math library.
5172 * \note This routine should provide correct results for all finite
5173 * non-zero or positive-zero arguments. However, negative zero arguments will
5174 * be treated as positive zero, which means the return value will deviate from
5175 * the standard math library atan2(y,x) for those cases. That should not be
5176 * of any concern in Gromacs, and in particular it will not affect calculations
5177 * of angles from vectors.
5179 static inline SimdFloat gmx_simdcall atan2SingleAccuracy(SimdFloat y, SimdFloat x)
5184 /*! \brief SIMD Analytic PME force correction, only targeting single accuracy.
5186 * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
5187 * \result Correction factor to coulomb force.
5189 static inline SimdFloat gmx_simdcall pmeForceCorrectionSingleAccuracy(SimdFloat z2)
5191 return pmeForceCorrection(z2);
5194 /*! \brief SIMD Analytic PME potential correction, only targeting single accuracy.
5196 * \param z2 \f$(r \beta)^2\f$ - see default single precision version for details.
5197 * \result Correction factor to coulomb force.
5199 static inline SimdFloat gmx_simdcall pmePotentialCorrectionSingleAccuracy(SimdFloat z2)
5201 return pmePotentialCorrection(z2);
5203 # endif // GMX_SIMD_HAVE_FLOAT
5205 # if GMX_SIMD4_HAVE_FLOAT
5206 /*! \brief Calculate 1/sqrt(x) for SIMD4 float, only targeting single accuracy.
5208 * \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
5209 * GMX_FLOAT_MAX, i.e. within the range of single precision.
5210 * For the single precision implementation this is obviously always
5211 * true for positive values, but for double precision it adds an
5212 * extra restriction since the first lookup step might have to be
5213 * performed in single precision on some architectures. Note that the
5214 * responsibility for checking falls on you - this routine does not
5216 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
5218 static inline Simd4Float gmx_simdcall invsqrtSingleAccuracy(Simd4Float x)
5222 # endif // GMX_SIMD4_HAVE_FLOAT
5224 /*! \} end of addtogroup module_simd */
5225 /*! \endcond end of condition libabl */
5231 #endif // GMX_SIMD_SIMD_MATH_H