2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #ifndef GMX_SIMD_SIMD_MATH_H_
36 #define GMX_SIMD_SIMD_MATH_H_
38 /*! \libinternal \file
40 * \brief Math functions for SIMD datatypes.
42 * \attention This file is generic for all SIMD architectures, so you cannot
43 * assume that any of the optional SIMD features (as defined in simd.h) are
44 * present. In particular, this means you cannot assume support for integers,
45 * logical operations (neither on floating-point nor integer values), shifts,
46 * and the architecture might only have SIMD for either float or double.
47 * Second, to keep this file clean and general, any additions to this file
48 * must work for all possible SIMD architectures in both single and double
49 * precision (if they support it), and you cannot make any assumptions about
52 * \author Erik Lindahl <erik.lindahl@scilifelab.se>
55 * \ingroup module_simd
62 #include "gromacs/math/utilities.h"
63 #include "gromacs/simd/simd.h"
66 /*! \addtogroup module_simd */
69 /*! \name Implementation accuracy settings
73 /*! \brief We accept lsb errors for 1/sqrt(x) and 1/x, so float target is 22 bits */
74 #define GMX_SIMD_MATH_TARGET_SINGLE_BITS 22
76 /*! \brief We accept "double" that has 2x single precision - 44 bits.
78 * This way two Newton-Raphson iterations will suffice in double precision.
80 #define GMX_SIMD_MATH_TARGET_DOUBLE_BITS 44
84 #ifdef GMX_SIMD_HAVE_FLOAT
86 /*! \name Single precision SIMD math functions
88 * \note In most cases you should use the real-precision functions instead.
92 /****************************************
93 * SINGLE PRECISION SIMD MATH FUNCTIONS *
94 ****************************************/
96 /*! \brief SIMD float utility to sum a+b+c+d.
98 * You should normally call the real-precision routine \ref gmx_simd_sum4_r.
100 * \param a term 1 (multiple values)
101 * \param b term 2 (multiple values)
102 * \param c term 3 (multiple values)
103 * \param d term 4 (multiple values)
104 * \return sum of terms 1-4 (multiple values)
106 static gmx_inline gmx_simd_float_t gmx_simdcall
107 gmx_simd_sum4_f(gmx_simd_float_t a, gmx_simd_float_t b,
108 gmx_simd_float_t c, gmx_simd_float_t d)
110 return gmx_simd_add_f(gmx_simd_add_f(a, b), gmx_simd_add_f(c, d));
113 /*! \brief Return -a if b is negative, SIMD float.
115 * You should normally call the real-precision routine \ref gmx_simd_xor_sign_r.
117 * \param a Values to set sign for
118 * \param b Values used to set sign
119 * \return if b is negative, the sign of a will be changed.
121 * This is equivalent to doing an xor operation on a with the sign bit of b,
122 * with the exception that negative zero is not considered to be negative
123 * on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
125 static gmx_inline gmx_simd_float_t gmx_simdcall
126 gmx_simd_xor_sign_f(gmx_simd_float_t a, gmx_simd_float_t b)
128 #ifdef GMX_SIMD_HAVE_LOGICAL
129 return gmx_simd_xor_f(a, gmx_simd_and_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), b));
131 return gmx_simd_blendv_f(a, gmx_simd_fneg_f(a), gmx_simd_cmplt_f(b, gmx_simd_setzero_f()));
135 #ifndef gmx_simd_rsqrt_iter_f
136 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD float.
138 * This is a low-level routine that should only be used by SIMD math routine
139 * that evaluates the inverse square root.
141 * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
142 * \param x The reference (starting) value x for which we want 1/sqrt(x).
143 * \return An improved approximation with roughly twice as many bits of accuracy.
145 static gmx_inline gmx_simd_float_t gmx_simdcall
146 gmx_simd_rsqrt_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
148 # ifdef GMX_SIMD_HAVE_FMA
149 return gmx_simd_fmadd_f(gmx_simd_fnmadd_f(x, gmx_simd_mul_f(lu, lu), gmx_simd_set1_f(1.0f)), gmx_simd_mul_f(lu, gmx_simd_set1_f(0.5f)), lu);
151 return gmx_simd_mul_f(gmx_simd_set1_f(0.5f), gmx_simd_mul_f(gmx_simd_sub_f(gmx_simd_set1_f(3.0f), gmx_simd_mul_f(gmx_simd_mul_f(lu, lu), x)), lu));
156 /*! \brief Calculate 1/sqrt(x) for SIMD float.
158 * You should normally call the real-precision routine \ref gmx_simd_invsqrt_r.
160 * \param x Argument that must be >0. This routine does not check arguments.
161 * \return 1/sqrt(x). Result is undefined if your argument was invalid.
163 static gmx_inline gmx_simd_float_t gmx_simdcall
164 gmx_simd_invsqrt_f(gmx_simd_float_t x)
166 gmx_simd_float_t lu = gmx_simd_rsqrt_f(x);
167 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
168 lu = gmx_simd_rsqrt_iter_f(lu, x);
170 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
171 lu = gmx_simd_rsqrt_iter_f(lu, x);
173 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
174 lu = gmx_simd_rsqrt_iter_f(lu, x);
179 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD float.
181 * Identical to gmx_simd_invsqrt_f but avoids fp-exception for non-masked entries.
182 * The result for the non-masked entries is undefined and the user has to use blend
183 * with the same mask to obtain a defined result.
185 * \param x Argument that must be >0 for masked entries
186 * \param m Masked entries
187 * \return 1/sqrt(x). Result is undefined if your argument was invalid or entry was not masked.
190 #define gmx_simd_invsqrt_maskfpe_f(x, m) gmx_simd_invsqrt_f(x)
192 static gmx_inline gmx_simd_float_t
193 gmx_simd_invsqrt_maskfpe_f(gmx_simd_float_t x, gmx_simd_fbool_t m)
195 return gmx_simd_invsqrt_f(gmx_simd_blendv_f(gmx_simd_set1_f(1.0f), x, m));
199 /*! \brief Calculate 1/sqrt(x) for non-masked entries of SIMD float.
201 * Identical to gmx_simd_invsqrt_f but avoids fp-exception for masked entries.
202 * The result for the non-masked entries is undefined and the user has to use blend
203 * with the same mask to obtain a defined result.
205 * \param x Argument that must be >0 for non-masked entries
206 * \param m Masked entries
207 * \return 1/sqrt(x). Result is undefined if your argument was invalid or entry was masked.
210 #define gmx_simd_invsqrt_notmaskfpe_f(x, m) gmx_simd_invsqrt_f(x)
212 static gmx_inline gmx_simd_float_t
213 gmx_simd_invsqrt_notmaskfpe_f(gmx_simd_float_t x, gmx_simd_fbool_t m)
215 return gmx_simd_invsqrt_f(gmx_simd_blendv_f(x, gmx_simd_set1_f(1.0f), m));
219 /*! \brief Calculate 1/sqrt(x) for two SIMD floats.
221 * You should normally call the real-precision routine \ref gmx_simd_invsqrt_pair_r.
223 * \param x0 First set of arguments, x0 must be positive - no argument checking.
224 * \param x1 Second set of arguments, x1 must be positive - no argument checking.
225 * \param[out] out0 Result 1/sqrt(x0)
226 * \param[out] out1 Result 1/sqrt(x1)
228 * In particular for double precision we can sometimes calculate square root
229 * pairs slightly faster by using single precision until the very last step.
231 static gmx_inline void gmx_simdcall
232 gmx_simd_invsqrt_pair_f(gmx_simd_float_t x0, gmx_simd_float_t x1,
233 gmx_simd_float_t *out0, gmx_simd_float_t *out1)
235 *out0 = gmx_simd_invsqrt_f(x0);
236 *out1 = gmx_simd_invsqrt_f(x1);
239 #ifndef gmx_simd_rcp_iter_f
240 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD float.
242 * This is a low-level routine that should only be used by SIMD math routine
243 * that evaluates the reciprocal.
245 * \param lu Approximation of 1/x, typically obtained from lookup.
246 * \param x The reference (starting) value x for which we want 1/x.
247 * \return An improved approximation with roughly twice as many bits of accuracy.
249 static gmx_inline gmx_simd_float_t gmx_simdcall
250 gmx_simd_rcp_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
252 return gmx_simd_mul_f(lu, gmx_simd_fnmadd_f(lu, x, gmx_simd_set1_f(2.0f)));
256 /*! \brief Calculate 1/x for SIMD float.
258 * You should normally call the real-precision routine \ref gmx_simd_inv_r.
260 * \param x Argument that must be nonzero. This routine does not check arguments.
261 * \return 1/x. Result is undefined if your argument was invalid.
263 static gmx_inline gmx_simd_float_t gmx_simdcall
264 gmx_simd_inv_f(gmx_simd_float_t x)
266 gmx_simd_float_t lu = gmx_simd_rcp_f(x);
267 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
268 lu = gmx_simd_rcp_iter_f(lu, x);
270 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
271 lu = gmx_simd_rcp_iter_f(lu, x);
273 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
274 lu = gmx_simd_rcp_iter_f(lu, x);
279 /*! \brief Calculate 1/x for masked entries of SIMD float.
281 * Identical to gmx_simd_inv_f but avoids fp-exception for non-masked entries.
282 * The result for the non-masked entries is undefined and the user has to use blend
283 * with the same mask to obtain a defined result.
285 * \param x Argument that must be nonzero for masked entries
286 * \param m Masked entries
287 * \return 1/x. Result is undefined if your argument was invalid or entry was not masked.
290 #define gmx_simd_inv_maskfpe_f(x, m) gmx_simd_inv_f(x)
292 static gmx_inline gmx_simd_float_t
293 gmx_simd_inv_maskfpe_f(gmx_simd_float_t x, gmx_simd_fbool_t m)
295 return gmx_simd_inv_f(gmx_simd_blendv_f(gmx_simd_set1_f(1.0f), x, m));
299 /*! \brief Calculate 1/x for non-masked entries of SIMD float.
301 * Identical to gmx_simd_inv_f but avoids fp-exception for masked entries.
302 * The result for the non-masked entries is undefined and the user has to use blend
303 * with the same mask to obtain a defined result.
305 * \param x Argument that must be nonzero for non-masked entries
306 * \param m Masked entries
307 * \return 1/x. Result is undefined if your argument was invalid or entry was masked.
310 #define gmx_simd_inv_notmaskfpe_f(x, m) gmx_simd_inv_f(x)
312 static gmx_inline gmx_simd_float_t
313 gmx_simd_inv_notmaskfpe_f(gmx_simd_float_t x, gmx_simd_fbool_t m)
315 return gmx_simd_inv_f(gmx_simd_blendv_f(x, gmx_simd_set1_f(1.0f), m));
319 /*! \brief Calculate sqrt(x) correctly for SIMD floats, including argument 0.0.
321 * You should normally call the real-precision routine \ref gmx_simd_sqrt_r.
323 * \param x Argument that must be >=0.
324 * \return sqrt(x). If x=0, the result will correctly be set to 0.
325 * The result is undefined if the input value is negative.
327 static gmx_inline gmx_simd_float_t gmx_simdcall
328 gmx_simd_sqrt_f(gmx_simd_float_t x)
330 gmx_simd_fbool_t mask;
331 gmx_simd_float_t res;
333 mask = gmx_simd_cmpeq_f(x, gmx_simd_setzero_f());
334 res = gmx_simd_blendnotzero_f(gmx_simd_invsqrt_notmaskfpe_f(x, mask), mask);
335 return gmx_simd_mul_f(res, x);
338 /*! \brief SIMD float log(x). This is the natural logarithm.
340 * You should normally call the real-precision routine \ref gmx_simd_log_r.
342 * \param x Argument, should be >0.
343 * \result The natural logarithm of x. Undefined if argument is invalid.
345 #ifndef gmx_simd_log_f
346 static gmx_inline gmx_simd_float_t gmx_simdcall
347 gmx_simd_log_f(gmx_simd_float_t x)
349 const gmx_simd_float_t half = gmx_simd_set1_f(0.5f);
350 const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
351 const gmx_simd_float_t sqrt2 = gmx_simd_set1_f(sqrt(2.0f));
352 const gmx_simd_float_t corr = gmx_simd_set1_f(0.693147180559945286226764f);
353 const gmx_simd_float_t CL9 = gmx_simd_set1_f(0.2371599674224853515625f);
354 const gmx_simd_float_t CL7 = gmx_simd_set1_f(0.285279005765914916992188f);
355 const gmx_simd_float_t CL5 = gmx_simd_set1_f(0.400005519390106201171875f);
356 const gmx_simd_float_t CL3 = gmx_simd_set1_f(0.666666567325592041015625f);
357 const gmx_simd_float_t CL1 = gmx_simd_set1_f(2.0f);
358 gmx_simd_float_t fexp, x2, p;
359 gmx_simd_fbool_t mask;
361 fexp = gmx_simd_get_exponent_f(x);
362 x = gmx_simd_get_mantissa_f(x);
364 mask = gmx_simd_cmplt_f(sqrt2, x);
365 /* Adjust to non-IEEE format for x>sqrt(2): exponent += 1, mantissa *= 0.5 */
366 fexp = gmx_simd_add_f(fexp, gmx_simd_blendzero_f(one, mask));
367 x = gmx_simd_mul_f(x, gmx_simd_blendv_f(one, half, mask));
369 x = gmx_simd_mul_f( gmx_simd_sub_f(x, one), gmx_simd_inv_f( gmx_simd_add_f(x, one) ) );
370 x2 = gmx_simd_mul_f(x, x);
372 p = gmx_simd_fmadd_f(CL9, x2, CL7);
373 p = gmx_simd_fmadd_f(p, x2, CL5);
374 p = gmx_simd_fmadd_f(p, x2, CL3);
375 p = gmx_simd_fmadd_f(p, x2, CL1);
376 p = gmx_simd_fmadd_f(p, x, gmx_simd_mul_f(corr, fexp));
382 #ifndef gmx_simd_exp2_f
383 /*! \brief SIMD float 2^x.
385 * You should normally call the real-precision routine \ref gmx_simd_exp2_r.
388 * \result 2^x. Undefined if input argument caused overflow.
390 static gmx_inline gmx_simd_float_t gmx_simdcall
391 gmx_simd_exp2_f(gmx_simd_float_t x)
393 /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
394 const gmx_simd_float_t arglimit = gmx_simd_set1_f(126.0f);
395 const gmx_simd_float_t CC6 = gmx_simd_set1_f(0.0001534581200287996416911311);
396 const gmx_simd_float_t CC5 = gmx_simd_set1_f(0.001339993121934088894618990);
397 const gmx_simd_float_t CC4 = gmx_simd_set1_f(0.009618488957115180159497841);
398 const gmx_simd_float_t CC3 = gmx_simd_set1_f(0.05550328776964726865751735);
399 const gmx_simd_float_t CC2 = gmx_simd_set1_f(0.2402264689063408646490722);
400 const gmx_simd_float_t CC1 = gmx_simd_set1_f(0.6931472057372680777553816);
401 const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
403 gmx_simd_float_t fexppart;
404 gmx_simd_float_t intpart;
406 gmx_simd_fbool_t valuemask;
408 fexppart = gmx_simd_set_exponent_f(x);
409 intpart = gmx_simd_round_f(x);
410 valuemask = gmx_simd_cmple_f(gmx_simd_fabs_f(x), arglimit);
411 fexppart = gmx_simd_blendzero_f(fexppart, valuemask);
412 x = gmx_simd_sub_f(x, intpart);
414 p = gmx_simd_fmadd_f(CC6, x, CC5);
415 p = gmx_simd_fmadd_f(p, x, CC4);
416 p = gmx_simd_fmadd_f(p, x, CC3);
417 p = gmx_simd_fmadd_f(p, x, CC2);
418 p = gmx_simd_fmadd_f(p, x, CC1);
419 p = gmx_simd_fmadd_f(p, x, one);
420 x = gmx_simd_mul_f(p, fexppart);
425 #ifndef gmx_simd_exp_f
426 /*! \brief SIMD float exp(x).
428 * You should normally call the real-precision routine \ref gmx_simd_exp_r.
430 * In addition to scaling the argument for 2^x this routine correctly does
431 * extended precision arithmetics to improve accuracy.
434 * \result exp(x). Undefined if input argument caused overflow,
435 * which can happen if abs(x) \> 7e13.
437 static gmx_inline gmx_simd_float_t gmx_simdcall
438 gmx_simd_exp_f(gmx_simd_float_t x)
440 const gmx_simd_float_t argscale = gmx_simd_set1_f(1.44269504088896341f);
441 /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
442 const gmx_simd_float_t arglimit = gmx_simd_set1_f(126.0f);
443 const gmx_simd_float_t invargscale0 = gmx_simd_set1_f(-0.693145751953125f);
444 const gmx_simd_float_t invargscale1 = gmx_simd_set1_f(-1.428606765330187045e-06f);
445 const gmx_simd_float_t CC4 = gmx_simd_set1_f(0.00136324646882712841033936f);
446 const gmx_simd_float_t CC3 = gmx_simd_set1_f(0.00836596917361021041870117f);
447 const gmx_simd_float_t CC2 = gmx_simd_set1_f(0.0416710823774337768554688f);
448 const gmx_simd_float_t CC1 = gmx_simd_set1_f(0.166665524244308471679688f);
449 const gmx_simd_float_t CC0 = gmx_simd_set1_f(0.499999850988388061523438f);
450 const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
451 gmx_simd_float_t fexppart;
452 gmx_simd_float_t intpart;
453 gmx_simd_float_t y, p;
454 gmx_simd_fbool_t valuemask;
456 y = gmx_simd_mul_f(x, argscale);
457 fexppart = gmx_simd_set_exponent_f(y); /* rounds to nearest int internally */
458 intpart = gmx_simd_round_f(y); /* use same rounding algorithm here */
459 valuemask = gmx_simd_cmple_f(gmx_simd_fabs_f(y), arglimit);
460 fexppart = gmx_simd_blendzero_f(fexppart, valuemask);
462 /* Extended precision arithmetics */
463 x = gmx_simd_fmadd_f(invargscale0, intpart, x);
464 x = gmx_simd_fmadd_f(invargscale1, intpart, x);
466 p = gmx_simd_fmadd_f(CC4, x, CC3);
467 p = gmx_simd_fmadd_f(p, x, CC2);
468 p = gmx_simd_fmadd_f(p, x, CC1);
469 p = gmx_simd_fmadd_f(p, x, CC0);
470 p = gmx_simd_fmadd_f(gmx_simd_mul_f(x, x), p, x);
471 p = gmx_simd_add_f(p, one);
472 x = gmx_simd_mul_f(p, fexppart);
477 /*! \brief SIMD float erf(x).
479 * You should normally call the real-precision routine \ref gmx_simd_erf_r.
481 * \param x The value to calculate erf(x) for.
484 * This routine achieves very close to full precision, but we do not care about
485 * the last bit or the subnormal result range.
487 static gmx_inline gmx_simd_float_t gmx_simdcall
488 gmx_simd_erf_f(gmx_simd_float_t x)
490 /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
491 const gmx_simd_float_t CA6 = gmx_simd_set1_f(7.853861353153693e-5f);
492 const gmx_simd_float_t CA5 = gmx_simd_set1_f(-8.010193625184903e-4f);
493 const gmx_simd_float_t CA4 = gmx_simd_set1_f(5.188327685732524e-3f);
494 const gmx_simd_float_t CA3 = gmx_simd_set1_f(-2.685381193529856e-2f);
495 const gmx_simd_float_t CA2 = gmx_simd_set1_f(1.128358514861418e-1f);
496 const gmx_simd_float_t CA1 = gmx_simd_set1_f(-3.761262582423300e-1f);
497 const gmx_simd_float_t CA0 = gmx_simd_set1_f(1.128379165726710f);
498 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
499 const gmx_simd_float_t CB9 = gmx_simd_set1_f(-0.0018629930017603923f);
500 const gmx_simd_float_t CB8 = gmx_simd_set1_f(0.003909821287598495f);
501 const gmx_simd_float_t CB7 = gmx_simd_set1_f(-0.0052094582210355615f);
502 const gmx_simd_float_t CB6 = gmx_simd_set1_f(0.005685614362160572f);
503 const gmx_simd_float_t CB5 = gmx_simd_set1_f(-0.0025367682853477272f);
504 const gmx_simd_float_t CB4 = gmx_simd_set1_f(-0.010199799682318782f);
505 const gmx_simd_float_t CB3 = gmx_simd_set1_f(0.04369575504816542f);
506 const gmx_simd_float_t CB2 = gmx_simd_set1_f(-0.11884063474674492f);
507 const gmx_simd_float_t CB1 = gmx_simd_set1_f(0.2732120154030589f);
508 const gmx_simd_float_t CB0 = gmx_simd_set1_f(0.42758357702025784f);
509 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
510 const gmx_simd_float_t CC10 = gmx_simd_set1_f(-0.0445555913112064f);
511 const gmx_simd_float_t CC9 = gmx_simd_set1_f(0.21376355144663348f);
512 const gmx_simd_float_t CC8 = gmx_simd_set1_f(-0.3473187200259257f);
513 const gmx_simd_float_t CC7 = gmx_simd_set1_f(0.016690861551248114f);
514 const gmx_simd_float_t CC6 = gmx_simd_set1_f(0.7560973182491192f);
515 const gmx_simd_float_t CC5 = gmx_simd_set1_f(-1.2137903600145787f);
516 const gmx_simd_float_t CC4 = gmx_simd_set1_f(0.8411872321232948f);
517 const gmx_simd_float_t CC3 = gmx_simd_set1_f(-0.08670413896296343f);
518 const gmx_simd_float_t CC2 = gmx_simd_set1_f(-0.27124782687240334f);
519 const gmx_simd_float_t CC1 = gmx_simd_set1_f(-0.0007502488047806069f);
520 const gmx_simd_float_t CC0 = gmx_simd_set1_f(0.5642114853803148f);
521 const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
522 const gmx_simd_float_t two = gmx_simd_set1_f(2.0f);
524 gmx_simd_float_t x2, x4, y;
525 gmx_simd_float_t t, t2, w, w2;
526 gmx_simd_float_t pA0, pA1, pB0, pB1, pC0, pC1;
527 gmx_simd_float_t expmx2;
528 gmx_simd_float_t res_erf, res_erfc, res;
529 gmx_simd_fbool_t mask, msk_erf;
531 /* Calculate erf() */
532 x2 = gmx_simd_mul_f(x, x);
533 x4 = gmx_simd_mul_f(x2, x2);
535 pA0 = gmx_simd_fmadd_f(CA6, x4, CA4);
536 pA1 = gmx_simd_fmadd_f(CA5, x4, CA3);
537 pA0 = gmx_simd_fmadd_f(pA0, x4, CA2);
538 pA1 = gmx_simd_fmadd_f(pA1, x4, CA1);
539 pA0 = gmx_simd_mul_f(pA0, x4);
540 pA0 = gmx_simd_fmadd_f(pA1, x2, pA0);
541 /* Constant term must come last for precision reasons */
542 pA0 = gmx_simd_add_f(pA0, CA0);
544 res_erf = gmx_simd_mul_f(x, pA0);
547 y = gmx_simd_fabs_f(x);
548 msk_erf = gmx_simd_cmplt_f(y, gmx_simd_set1_f(0.75f));
549 t = gmx_simd_inv_notmaskfpe_f(y, msk_erf);
550 w = gmx_simd_sub_f(t, one);
551 t2 = gmx_simd_mul_f(t, t);
552 w2 = gmx_simd_mul_f(w, w);
554 /* No need for a floating-point sieve here (as in erfc), since erf()
555 * will never return values that are extremely small for large args.
557 expmx2 = gmx_simd_exp_f( gmx_simd_fneg_f( gmx_simd_mul_f(y, y)));
559 pB1 = gmx_simd_fmadd_f(CB9, w2, CB7);
560 pB0 = gmx_simd_fmadd_f(CB8, w2, CB6);
561 pB1 = gmx_simd_fmadd_f(pB1, w2, CB5);
562 pB0 = gmx_simd_fmadd_f(pB0, w2, CB4);
563 pB1 = gmx_simd_fmadd_f(pB1, w2, CB3);
564 pB0 = gmx_simd_fmadd_f(pB0, w2, CB2);
565 pB1 = gmx_simd_fmadd_f(pB1, w2, CB1);
566 pB0 = gmx_simd_fmadd_f(pB0, w2, CB0);
567 pB0 = gmx_simd_fmadd_f(pB1, w, pB0);
569 pC0 = gmx_simd_fmadd_f(CC10, t2, CC8);
570 pC1 = gmx_simd_fmadd_f(CC9, t2, CC7);
571 pC0 = gmx_simd_fmadd_f(pC0, t2, CC6);
572 pC1 = gmx_simd_fmadd_f(pC1, t2, CC5);
573 pC0 = gmx_simd_fmadd_f(pC0, t2, CC4);
574 pC1 = gmx_simd_fmadd_f(pC1, t2, CC3);
575 pC0 = gmx_simd_fmadd_f(pC0, t2, CC2);
576 pC1 = gmx_simd_fmadd_f(pC1, t2, CC1);
578 pC0 = gmx_simd_fmadd_f(pC0, t2, CC0);
579 pC0 = gmx_simd_fmadd_f(pC1, t, pC0);
580 pC0 = gmx_simd_mul_f(pC0, t);
582 /* SELECT pB0 or pC0 for erfc() */
583 mask = gmx_simd_cmplt_f(two, y);
584 res_erfc = gmx_simd_blendv_f(pB0, pC0, mask);
585 res_erfc = gmx_simd_mul_f(res_erfc, expmx2);
587 /* erfc(x<0) = 2-erfc(|x|) */
588 mask = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
589 res_erfc = gmx_simd_blendv_f(res_erfc, gmx_simd_sub_f(two, res_erfc), mask);
591 /* Select erf() or erfc() */
592 res = gmx_simd_blendv_f(gmx_simd_sub_f(one, res_erfc), res_erf, msk_erf);
597 /*! \brief SIMD float erfc(x).
599 * You should normally call the real-precision routine \ref gmx_simd_erfc_r.
601 * \param x The value to calculate erfc(x) for.
604 * This routine achieves full precision (bar the last bit) over most of the
605 * input range, but for large arguments where the result is getting close
606 * to the minimum representable numbers we accept slightly larger errors
607 * (think results that are in the ballpark of 10^-30 for single precision,
608 * or 10^-200 for double) since that is not relevant for MD.
610 static gmx_inline gmx_simd_float_t gmx_simdcall
611 gmx_simd_erfc_f(gmx_simd_float_t x)
613 /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
614 const gmx_simd_float_t CA6 = gmx_simd_set1_f(7.853861353153693e-5f);
615 const gmx_simd_float_t CA5 = gmx_simd_set1_f(-8.010193625184903e-4f);
616 const gmx_simd_float_t CA4 = gmx_simd_set1_f(5.188327685732524e-3f);
617 const gmx_simd_float_t CA3 = gmx_simd_set1_f(-2.685381193529856e-2f);
618 const gmx_simd_float_t CA2 = gmx_simd_set1_f(1.128358514861418e-1f);
619 const gmx_simd_float_t CA1 = gmx_simd_set1_f(-3.761262582423300e-1f);
620 const gmx_simd_float_t CA0 = gmx_simd_set1_f(1.128379165726710f);
621 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
622 const gmx_simd_float_t CB9 = gmx_simd_set1_f(-0.0018629930017603923f);
623 const gmx_simd_float_t CB8 = gmx_simd_set1_f(0.003909821287598495f);
624 const gmx_simd_float_t CB7 = gmx_simd_set1_f(-0.0052094582210355615f);
625 const gmx_simd_float_t CB6 = gmx_simd_set1_f(0.005685614362160572f);
626 const gmx_simd_float_t CB5 = gmx_simd_set1_f(-0.0025367682853477272f);
627 const gmx_simd_float_t CB4 = gmx_simd_set1_f(-0.010199799682318782f);
628 const gmx_simd_float_t CB3 = gmx_simd_set1_f(0.04369575504816542f);
629 const gmx_simd_float_t CB2 = gmx_simd_set1_f(-0.11884063474674492f);
630 const gmx_simd_float_t CB1 = gmx_simd_set1_f(0.2732120154030589f);
631 const gmx_simd_float_t CB0 = gmx_simd_set1_f(0.42758357702025784f);
632 /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
633 const gmx_simd_float_t CC10 = gmx_simd_set1_f(-0.0445555913112064f);
634 const gmx_simd_float_t CC9 = gmx_simd_set1_f(0.21376355144663348f);
635 const gmx_simd_float_t CC8 = gmx_simd_set1_f(-0.3473187200259257f);
636 const gmx_simd_float_t CC7 = gmx_simd_set1_f(0.016690861551248114f);
637 const gmx_simd_float_t CC6 = gmx_simd_set1_f(0.7560973182491192f);
638 const gmx_simd_float_t CC5 = gmx_simd_set1_f(-1.2137903600145787f);
639 const gmx_simd_float_t CC4 = gmx_simd_set1_f(0.8411872321232948f);
640 const gmx_simd_float_t CC3 = gmx_simd_set1_f(-0.08670413896296343f);
641 const gmx_simd_float_t CC2 = gmx_simd_set1_f(-0.27124782687240334f);
642 const gmx_simd_float_t CC1 = gmx_simd_set1_f(-0.0007502488047806069f);
643 const gmx_simd_float_t CC0 = gmx_simd_set1_f(0.5642114853803148f);
644 /* Coefficients for expansion of exp(x) in [0,0.1] */
645 /* CD0 and CD1 are both 1.0, so no need to declare them separately */
646 const gmx_simd_float_t CD2 = gmx_simd_set1_f(0.5000066608081202f);
647 const gmx_simd_float_t CD3 = gmx_simd_set1_f(0.1664795422874624f);
648 const gmx_simd_float_t CD4 = gmx_simd_set1_f(0.04379839977652482f);
649 const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
650 const gmx_simd_float_t two = gmx_simd_set1_f(2.0f);
652 /* We need to use a small trick here, since we cannot assume all SIMD
653 * architectures support integers, and the flag we want (0xfffff000) would
654 * evaluate to NaN (i.e., it cannot be expressed as a floating-point num).
655 * Instead, we represent the flags 0xf0f0f000 and 0x0f0f0000 as valid
656 * fp numbers, and perform a logical or. Since the expression is constant,
657 * we can at least hope it is evaluated at compile-time.
659 #ifdef GMX_SIMD_HAVE_LOGICAL
660 const gmx_simd_float_t sieve = gmx_simd_or_f(gmx_simd_set1_f(-5.965323564e+29f), gmx_simd_set1_f(7.05044434e-30f));
662 const int isieve = 0xFFFFF000;
663 float mem[GMX_SIMD_REAL_WIDTH*2];
664 float * pmem = gmx_simd_align_f(mem);
671 gmx_simd_float_t x2, x4, y;
672 gmx_simd_float_t q, z, t, t2, w, w2;
673 gmx_simd_float_t pA0, pA1, pB0, pB1, pC0, pC1;
674 gmx_simd_float_t expmx2, corr;
675 gmx_simd_float_t res_erf, res_erfc, res;
676 gmx_simd_fbool_t mask, msk_erf;
678 /* Calculate erf() */
679 x2 = gmx_simd_mul_f(x, x);
680 x4 = gmx_simd_mul_f(x2, x2);
682 pA0 = gmx_simd_fmadd_f(CA6, x4, CA4);
683 pA1 = gmx_simd_fmadd_f(CA5, x4, CA3);
684 pA0 = gmx_simd_fmadd_f(pA0, x4, CA2);
685 pA1 = gmx_simd_fmadd_f(pA1, x4, CA1);
686 pA1 = gmx_simd_mul_f(pA1, x2);
687 pA0 = gmx_simd_fmadd_f(pA0, x4, pA1);
688 /* Constant term must come last for precision reasons */
689 pA0 = gmx_simd_add_f(pA0, CA0);
691 res_erf = gmx_simd_mul_f(x, pA0);
694 y = gmx_simd_fabs_f(x);
695 msk_erf = gmx_simd_cmplt_f(y, gmx_simd_set1_f(0.75f));
696 t = gmx_simd_inv_notmaskfpe_f(y, msk_erf);
697 w = gmx_simd_sub_f(t, one);
698 t2 = gmx_simd_mul_f(t, t);
699 w2 = gmx_simd_mul_f(w, w);
701 * We cannot simply calculate exp(-y2) directly in single precision, since
702 * that will lose a couple of bits of precision due to the multiplication.
703 * Instead, we introduce y=z+w, where the last 12 bits of precision are in w.
704 * Then we get exp(-y2) = exp(-z2)*exp((z-y)*(z+y)).
706 * The only drawback with this is that it requires TWO separate exponential
707 * evaluations, which would be horrible performance-wise. However, the argument
708 * for the second exp() call is always small, so there we simply use a
709 * low-order minimax expansion on [0,0.1].
711 * However, this neat idea requires support for logical ops (and) on
712 * FP numbers, which some vendors decided isn't necessary in their SIMD
713 * instruction sets (Hi, IBM VSX!). In principle we could use some tricks
714 * in double, but we still need memory as a backup when that is not available,
715 * and this case is rare enough that we go directly there...
717 #ifdef GMX_SIMD_HAVE_LOGICAL
718 z = gmx_simd_and_f(y, sieve);
720 gmx_simd_store_f(pmem, y);
721 for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
724 conv.i = conv.i & isieve;
727 z = gmx_simd_load_f(pmem);
729 q = gmx_simd_mul_f( gmx_simd_sub_f(z, y), gmx_simd_add_f(z, y) );
730 corr = gmx_simd_fmadd_f(CD4, q, CD3);
731 corr = gmx_simd_fmadd_f(corr, q, CD2);
732 corr = gmx_simd_fmadd_f(corr, q, one);
733 corr = gmx_simd_fmadd_f(corr, q, one);
735 expmx2 = gmx_simd_exp_f( gmx_simd_fneg_f( gmx_simd_mul_f(z, z) ) );
736 expmx2 = gmx_simd_mul_f(expmx2, corr);
738 pB1 = gmx_simd_fmadd_f(CB9, w2, CB7);
739 pB0 = gmx_simd_fmadd_f(CB8, w2, CB6);
740 pB1 = gmx_simd_fmadd_f(pB1, w2, CB5);
741 pB0 = gmx_simd_fmadd_f(pB0, w2, CB4);
742 pB1 = gmx_simd_fmadd_f(pB1, w2, CB3);
743 pB0 = gmx_simd_fmadd_f(pB0, w2, CB2);
744 pB1 = gmx_simd_fmadd_f(pB1, w2, CB1);
745 pB0 = gmx_simd_fmadd_f(pB0, w2, CB0);
746 pB0 = gmx_simd_fmadd_f(pB1, w, pB0);
748 pC0 = gmx_simd_fmadd_f(CC10, t2, CC8);
749 pC1 = gmx_simd_fmadd_f(CC9, t2, CC7);
750 pC0 = gmx_simd_fmadd_f(pC0, t2, CC6);
751 pC1 = gmx_simd_fmadd_f(pC1, t2, CC5);
752 pC0 = gmx_simd_fmadd_f(pC0, t2, CC4);
753 pC1 = gmx_simd_fmadd_f(pC1, t2, CC3);
754 pC0 = gmx_simd_fmadd_f(pC0, t2, CC2);
755 pC1 = gmx_simd_fmadd_f(pC1, t2, CC1);
757 pC0 = gmx_simd_fmadd_f(pC0, t2, CC0);
758 pC0 = gmx_simd_fmadd_f(pC1, t, pC0);
759 pC0 = gmx_simd_mul_f(pC0, t);
761 /* SELECT pB0 or pC0 for erfc() */
762 mask = gmx_simd_cmplt_f(two, y);
763 res_erfc = gmx_simd_blendv_f(pB0, pC0, mask);
764 res_erfc = gmx_simd_mul_f(res_erfc, expmx2);
766 /* erfc(x<0) = 2-erfc(|x|) */
767 mask = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
768 res_erfc = gmx_simd_blendv_f(res_erfc, gmx_simd_sub_f(two, res_erfc), mask);
770 /* Select erf() or erfc() */
771 res = gmx_simd_blendv_f(res_erfc, gmx_simd_sub_f(one, res_erf), msk_erf);
776 /*! \brief SIMD float sin \& cos.
778 * You should normally call the real-precision routine \ref gmx_simd_sincos_r.
780 * \param x The argument to evaluate sin/cos for
781 * \param[out] sinval Sin(x)
782 * \param[out] cosval Cos(x)
784 * This version achieves close to machine precision, but for very large
785 * magnitudes of the argument we inherently begin to lose accuracy due to the
786 * argument reduction, despite using extended precision arithmetics internally.
788 static gmx_inline void gmx_simdcall
789 gmx_simd_sincos_f(gmx_simd_float_t x, gmx_simd_float_t *sinval, gmx_simd_float_t *cosval)
791 /* Constants to subtract Pi/4*x from y while minimizing precision loss */
792 const gmx_simd_float_t argred0 = gmx_simd_set1_f(-1.5703125);
793 const gmx_simd_float_t argred1 = gmx_simd_set1_f(-4.83751296997070312500e-04f);
794 const gmx_simd_float_t argred2 = gmx_simd_set1_f(-7.54953362047672271729e-08f);
795 const gmx_simd_float_t argred3 = gmx_simd_set1_f(-2.56334406825708960298e-12f);
796 const gmx_simd_float_t two_over_pi = gmx_simd_set1_f(2.0f/M_PI);
797 const gmx_simd_float_t const_sin2 = gmx_simd_set1_f(-1.9515295891e-4f);
798 const gmx_simd_float_t const_sin1 = gmx_simd_set1_f( 8.3321608736e-3f);
799 const gmx_simd_float_t const_sin0 = gmx_simd_set1_f(-1.6666654611e-1f);
800 const gmx_simd_float_t const_cos2 = gmx_simd_set1_f( 2.443315711809948e-5f);
801 const gmx_simd_float_t const_cos1 = gmx_simd_set1_f(-1.388731625493765e-3f);
802 const gmx_simd_float_t const_cos0 = gmx_simd_set1_f( 4.166664568298827e-2f);
803 const gmx_simd_float_t half = gmx_simd_set1_f(0.5f);
804 const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
805 gmx_simd_float_t ssign, csign;
806 gmx_simd_float_t x2, y, z, psin, pcos, sss, ccc;
807 gmx_simd_fbool_t mask;
808 #if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
809 const gmx_simd_fint32_t ione = gmx_simd_set1_fi(1);
810 const gmx_simd_fint32_t itwo = gmx_simd_set1_fi(2);
811 gmx_simd_fint32_t iy;
813 z = gmx_simd_mul_f(x, two_over_pi);
814 iy = gmx_simd_cvt_f2i(z);
815 y = gmx_simd_round_f(z);
817 mask = gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, ione), gmx_simd_setzero_fi()));
818 ssign = gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, itwo), itwo)));
819 csign = gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(gmx_simd_add_fi(iy, ione), itwo), itwo)));
821 const gmx_simd_float_t quarter = gmx_simd_set1_f(0.25f);
822 const gmx_simd_float_t minusquarter = gmx_simd_set1_f(-0.25f);
824 gmx_simd_fbool_t m1, m2, m3;
826 /* The most obvious way to find the arguments quadrant in the unit circle
827 * to calculate the sign is to use integer arithmetic, but that is not
828 * present in all SIMD implementations. As an alternative, we have devised a
829 * pure floating-point algorithm that uses truncation for argument reduction
830 * so that we get a new value 0<=q<1 over the unit circle, and then
831 * do floating-point comparisons with fractions. This is likely to be
832 * slightly slower (~10%) due to the longer latencies of floating-point, so
833 * we only use it when integer SIMD arithmetic is not present.
836 x = gmx_simd_fabs_f(x);
837 /* It is critical that half-way cases are rounded down */
838 z = gmx_simd_fmadd_f(x, two_over_pi, half);
839 y = gmx_simd_trunc_f(z);
840 q = gmx_simd_mul_f(z, quarter);
841 q = gmx_simd_sub_f(q, gmx_simd_trunc_f(q));
842 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
843 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
844 * This removes the 2*Pi periodicity without using any integer arithmetic.
845 * First check if y had the value 2 or 3, set csign if true.
847 q = gmx_simd_sub_f(q, half);
848 /* If we have logical operations we can work directly on the signbit, which
849 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
850 * Thus, if you are altering defines to debug alternative code paths, the
851 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
852 * active or inactive - you will get errors if only one is used.
854 # ifdef GMX_SIMD_HAVE_LOGICAL
855 ssign = gmx_simd_and_f(ssign, gmx_simd_set1_f(GMX_FLOAT_NEGZERO));
856 csign = gmx_simd_andnot_f(q, gmx_simd_set1_f(GMX_FLOAT_NEGZERO));
857 ssign = gmx_simd_xor_f(ssign, csign);
859 csign = gmx_simd_xor_sign_f(gmx_simd_set1_f(-1.0f), q);
860 // ALT: csign = gmx_simd_fneg_f(gmx_simd_copysign(gmx_simd_set1_f(1.0),q));
862 ssign = gmx_simd_xor_sign_f(ssign, csign); /* swap ssign if csign was set. */
864 /* Check if y had value 1 or 3 (remember we subtracted 0.5 from q) */
865 m1 = gmx_simd_cmplt_f(q, minusquarter);
866 m2 = gmx_simd_cmple_f(gmx_simd_setzero_f(), q);
867 m3 = gmx_simd_cmplt_f(q, quarter);
868 m2 = gmx_simd_and_fb(m2, m3);
869 mask = gmx_simd_or_fb(m1, m2);
870 /* where mask is FALSE, set sign. */
871 csign = gmx_simd_xor_sign_f(csign, gmx_simd_blendv_f(gmx_simd_set1_f(-1.0f), one, mask));
873 x = gmx_simd_fmadd_f(y, argred0, x);
874 x = gmx_simd_fmadd_f(y, argred1, x);
875 x = gmx_simd_fmadd_f(y, argred2, x);
876 x = gmx_simd_fmadd_f(y, argred3, x);
877 x2 = gmx_simd_mul_f(x, x);
879 psin = gmx_simd_fmadd_f(const_sin2, x2, const_sin1);
880 psin = gmx_simd_fmadd_f(psin, x2, const_sin0);
881 psin = gmx_simd_fmadd_f(psin, gmx_simd_mul_f(x, x2), x);
882 pcos = gmx_simd_fmadd_f(const_cos2, x2, const_cos1);
883 pcos = gmx_simd_fmadd_f(pcos, x2, const_cos0);
884 pcos = gmx_simd_fmsub_f(pcos, x2, half);
885 pcos = gmx_simd_fmadd_f(pcos, x2, one);
887 sss = gmx_simd_blendv_f(pcos, psin, mask);
888 ccc = gmx_simd_blendv_f(psin, pcos, mask);
889 /* See comment for GMX_SIMD_HAVE_LOGICAL section above. */
890 #ifdef GMX_SIMD_HAVE_LOGICAL
891 *sinval = gmx_simd_xor_f(sss, ssign);
892 *cosval = gmx_simd_xor_f(ccc, csign);
894 *sinval = gmx_simd_xor_sign_f(sss, ssign);
895 *cosval = gmx_simd_xor_sign_f(ccc, csign);
899 /*! \brief SIMD float sin(x).
901 * You should normally call the real-precision routine \ref gmx_simd_sin_r.
903 * \param x The argument to evaluate sin for
906 * \attention Do NOT call both sin & cos if you need both results, since each of them
907 * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
909 static gmx_inline gmx_simd_float_t gmx_simdcall
910 gmx_simd_sin_f(gmx_simd_float_t x)
912 gmx_simd_float_t s, c;
913 gmx_simd_sincos_f(x, &s, &c);
917 /*! \brief SIMD float cos(x).
919 * You should normally call the real-precision routine \ref gmx_simd_cos_r.
921 * \param x The argument to evaluate cos for
924 * \attention Do NOT call both sin & cos if you need both results, since each of them
925 * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
927 static gmx_inline gmx_simd_float_t gmx_simdcall
928 gmx_simd_cos_f(gmx_simd_float_t x)
930 gmx_simd_float_t s, c;
931 gmx_simd_sincos_f(x, &s, &c);
935 /*! \brief SIMD float tan(x).
937 * You should normally call the real-precision routine \ref gmx_simd_tan_r.
939 * \param x The argument to evaluate tan for
942 static gmx_inline gmx_simd_float_t gmx_simdcall
943 gmx_simd_tan_f(gmx_simd_float_t x)
945 const gmx_simd_float_t argred0 = gmx_simd_set1_f(-1.5703125);
946 const gmx_simd_float_t argred1 = gmx_simd_set1_f(-4.83751296997070312500e-04f);
947 const gmx_simd_float_t argred2 = gmx_simd_set1_f(-7.54953362047672271729e-08f);
948 const gmx_simd_float_t argred3 = gmx_simd_set1_f(-2.56334406825708960298e-12f);
949 const gmx_simd_float_t two_over_pi = gmx_simd_set1_f(2.0f/M_PI);
950 const gmx_simd_float_t CT6 = gmx_simd_set1_f(0.009498288995810566122993911);
951 const gmx_simd_float_t CT5 = gmx_simd_set1_f(0.002895755790837379295226923);
952 const gmx_simd_float_t CT4 = gmx_simd_set1_f(0.02460087336161924491836265);
953 const gmx_simd_float_t CT3 = gmx_simd_set1_f(0.05334912882656359828045988);
954 const gmx_simd_float_t CT2 = gmx_simd_set1_f(0.1333989091464957704418495);
955 const gmx_simd_float_t CT1 = gmx_simd_set1_f(0.3333307599244198227797507);
957 gmx_simd_float_t x2, p, y, z;
958 gmx_simd_fbool_t mask;
960 #if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
961 gmx_simd_fint32_t iy;
962 gmx_simd_fint32_t ione = gmx_simd_set1_fi(1);
964 z = gmx_simd_mul_f(x, two_over_pi);
965 iy = gmx_simd_cvt_f2i(z);
966 y = gmx_simd_round_f(z);
967 mask = gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, ione), ione));
969 x = gmx_simd_fmadd_f(y, argred0, x);
970 x = gmx_simd_fmadd_f(y, argred1, x);
971 x = gmx_simd_fmadd_f(y, argred2, x);
972 x = gmx_simd_fmadd_f(y, argred3, x);
973 x = gmx_simd_xor_f(gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), mask), x);
975 const gmx_simd_float_t quarter = gmx_simd_set1_f(0.25f);
976 const gmx_simd_float_t half = gmx_simd_set1_f(0.5f);
977 const gmx_simd_float_t threequarter = gmx_simd_set1_f(0.75f);
978 gmx_simd_float_t w, q;
979 gmx_simd_fbool_t m1, m2, m3;
981 w = gmx_simd_fabs_f(x);
982 z = gmx_simd_fmadd_f(w, two_over_pi, half);
983 y = gmx_simd_trunc_f(z);
984 q = gmx_simd_mul_f(z, quarter);
985 q = gmx_simd_sub_f(q, gmx_simd_trunc_f(q));
986 m1 = gmx_simd_cmple_f(quarter, q);
987 m2 = gmx_simd_cmplt_f(q, half);
988 m3 = gmx_simd_cmple_f(threequarter, q);
989 m1 = gmx_simd_and_fb(m1, m2);
990 mask = gmx_simd_or_fb(m1, m3);
991 w = gmx_simd_fmadd_f(y, argred0, w);
992 w = gmx_simd_fmadd_f(y, argred1, w);
993 w = gmx_simd_fmadd_f(y, argred2, w);
994 w = gmx_simd_fmadd_f(y, argred3, w);
996 w = gmx_simd_blendv_f(w, gmx_simd_fneg_f(w), mask);
997 x = gmx_simd_xor_sign_f(w, x);
999 x2 = gmx_simd_mul_f(x, x);
1000 p = gmx_simd_fmadd_f(CT6, x2, CT5);
1001 p = gmx_simd_fmadd_f(p, x2, CT4);
1002 p = gmx_simd_fmadd_f(p, x2, CT3);
1003 p = gmx_simd_fmadd_f(p, x2, CT2);
1004 p = gmx_simd_fmadd_f(p, x2, CT1);
1005 p = gmx_simd_fmadd_f(x2, gmx_simd_mul_f(p, x), x);
1007 p = gmx_simd_blendv_f( p, gmx_simd_inv_maskfpe_f(p, mask), mask);
1011 /*! \brief SIMD float asin(x).
1013 * You should normally call the real-precision routine \ref gmx_simd_asin_r.
1015 * \param x The argument to evaluate asin for
1018 static gmx_inline gmx_simd_float_t gmx_simdcall
1019 gmx_simd_asin_f(gmx_simd_float_t x)
1021 const gmx_simd_float_t limitlow = gmx_simd_set1_f(1e-4f);
1022 const gmx_simd_float_t half = gmx_simd_set1_f(0.5f);
1023 const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
1024 const gmx_simd_float_t halfpi = gmx_simd_set1_f((float)M_PI/2.0f);
1025 const gmx_simd_float_t CC5 = gmx_simd_set1_f(4.2163199048E-2f);
1026 const gmx_simd_float_t CC4 = gmx_simd_set1_f(2.4181311049E-2f);
1027 const gmx_simd_float_t CC3 = gmx_simd_set1_f(4.5470025998E-2f);
1028 const gmx_simd_float_t CC2 = gmx_simd_set1_f(7.4953002686E-2f);
1029 const gmx_simd_float_t CC1 = gmx_simd_set1_f(1.6666752422E-1f);
1030 gmx_simd_float_t xabs;
1031 gmx_simd_float_t z, z1, z2, q, q1, q2;
1032 gmx_simd_float_t pA, pB;
1033 gmx_simd_fbool_t mask, mask2;
1035 xabs = gmx_simd_fabs_f(x);
1036 mask = gmx_simd_cmplt_f(half, xabs);
1037 z1 = gmx_simd_mul_f(half, gmx_simd_sub_f(one, xabs));
1038 mask2 = gmx_simd_cmpeq_f(xabs, one);
1039 q1 = gmx_simd_mul_f(z1, gmx_simd_invsqrt_notmaskfpe_f(z1, mask2));
1040 q1 = gmx_simd_blendnotzero_f(q1, mask2);
1042 z2 = gmx_simd_mul_f(q2, q2);
1043 z = gmx_simd_blendv_f(z2, z1, mask);
1044 q = gmx_simd_blendv_f(q2, q1, mask);
1046 z2 = gmx_simd_mul_f(z, z);
1047 pA = gmx_simd_fmadd_f(CC5, z2, CC3);
1048 pB = gmx_simd_fmadd_f(CC4, z2, CC2);
1049 pA = gmx_simd_fmadd_f(pA, z2, CC1);
1050 pA = gmx_simd_mul_f(pA, z);
1051 z = gmx_simd_fmadd_f(pB, z2, pA);
1052 z = gmx_simd_fmadd_f(z, q, q);
1053 q2 = gmx_simd_sub_f(halfpi, z);
1054 q2 = gmx_simd_sub_f(q2, z);
1055 z = gmx_simd_blendv_f(z, q2, mask);
1057 mask = gmx_simd_cmplt_f(limitlow, xabs);
1058 z = gmx_simd_blendv_f( xabs, z, mask );
1059 z = gmx_simd_xor_sign_f(z, x);
1064 /*! \brief SIMD float acos(x).
1066 * You should normally call the real-precision routine \ref gmx_simd_acos_r.
1068 * \param x The argument to evaluate acos for
1071 static gmx_inline gmx_simd_float_t gmx_simdcall
1072 gmx_simd_acos_f(gmx_simd_float_t x)
1074 const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
1075 const gmx_simd_float_t half = gmx_simd_set1_f(0.5f);
1076 const gmx_simd_float_t pi = gmx_simd_set1_f((float)M_PI);
1077 const gmx_simd_float_t halfpi = gmx_simd_set1_f((float)M_PI/2.0f);
1078 gmx_simd_float_t xabs;
1079 gmx_simd_float_t z, z1, z2, z3;
1080 gmx_simd_fbool_t mask1, mask2, mask3;
1082 xabs = gmx_simd_fabs_f(x);
1083 mask1 = gmx_simd_cmplt_f(half, xabs);
1084 mask2 = gmx_simd_cmplt_f(gmx_simd_setzero_f(), x);
1086 z = gmx_simd_mul_f(half, gmx_simd_sub_f(one, xabs));
1087 mask3 = gmx_simd_cmpeq_f(xabs, one);
1088 z = gmx_simd_mul_f(z, gmx_simd_invsqrt_notmaskfpe_f(z, mask3));
1089 z = gmx_simd_blendnotzero_f(z, mask3);
1090 z = gmx_simd_blendv_f(x, z, mask1);
1091 z = gmx_simd_asin_f(z);
1093 z2 = gmx_simd_add_f(z, z);
1094 z1 = gmx_simd_sub_f(pi, z2);
1095 z3 = gmx_simd_sub_f(halfpi, z);
1096 z = gmx_simd_blendv_f(z1, z2, mask2);
1097 z = gmx_simd_blendv_f(z3, z, mask1);
1102 /*! \brief SIMD float asin(x).
1104 * You should normally call the real-precision routine \ref gmx_simd_atan_r.
1106 * \param x The argument to evaluate atan for
1107 * \result Atan(x), same argument/value range as standard math library.
1109 static gmx_inline gmx_simd_float_t gmx_simdcall
1110 gmx_simd_atan_f(gmx_simd_float_t x)
1112 const gmx_simd_float_t halfpi = gmx_simd_set1_f(M_PI/2);
1113 const gmx_simd_float_t CA17 = gmx_simd_set1_f(0.002823638962581753730774f);
1114 const gmx_simd_float_t CA15 = gmx_simd_set1_f(-0.01595690287649631500244f);
1115 const gmx_simd_float_t CA13 = gmx_simd_set1_f(0.04250498861074447631836f);
1116 const gmx_simd_float_t CA11 = gmx_simd_set1_f(-0.07489009201526641845703f);
1117 const gmx_simd_float_t CA9 = gmx_simd_set1_f(0.1063479334115982055664f);
1118 const gmx_simd_float_t CA7 = gmx_simd_set1_f(-0.1420273631811141967773f);
1119 const gmx_simd_float_t CA5 = gmx_simd_set1_f(0.1999269574880599975585f);
1120 const gmx_simd_float_t CA3 = gmx_simd_set1_f(-0.3333310186862945556640f);
1121 const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
1122 gmx_simd_float_t x2, x3, x4, pA, pB;
1123 gmx_simd_fbool_t mask, mask2;
1125 mask = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
1126 x = gmx_simd_fabs_f(x);
1127 mask2 = gmx_simd_cmplt_f(one, x);
1128 x = gmx_simd_blendv_f(x, gmx_simd_inv_maskfpe_f(x, mask2), mask2);
1130 x2 = gmx_simd_mul_f(x, x);
1131 x3 = gmx_simd_mul_f(x2, x);
1132 x4 = gmx_simd_mul_f(x2, x2);
1133 pA = gmx_simd_fmadd_f(CA17, x4, CA13);
1134 pB = gmx_simd_fmadd_f(CA15, x4, CA11);
1135 pA = gmx_simd_fmadd_f(pA, x4, CA9);
1136 pB = gmx_simd_fmadd_f(pB, x4, CA7);
1137 pA = gmx_simd_fmadd_f(pA, x4, CA5);
1138 pB = gmx_simd_fmadd_f(pB, x4, CA3);
1139 pA = gmx_simd_fmadd_f(pA, x2, pB);
1140 pA = gmx_simd_fmadd_f(pA, x3, x);
1142 pA = gmx_simd_blendv_f(pA, gmx_simd_sub_f(halfpi, pA), mask2);
1143 pA = gmx_simd_blendv_f(pA, gmx_simd_fneg_f(pA), mask);
1148 /*! \brief SIMD float atan2(y,x).
1150 * You should normally call the real-precision routine \ref gmx_simd_atan2_r.
1152 * \param y Y component of vector, any quartile
1153 * \param x X component of vector, any quartile
1154 * \result Atan(y,x), same argument/value range as standard math library.
1156 * \note This routine should provide correct results for all finite
1157 * non-zero or positive-zero arguments. However, negative zero arguments will
1158 * be treated as positive zero, which means the return value will deviate from
1159 * the standard math library atan2(y,x) for those cases. That should not be
1160 * of any concern in Gromacs, and in particular it will not affect calculations
1161 * of angles from vectors.
1163 static gmx_inline gmx_simd_float_t gmx_simdcall
1164 gmx_simd_atan2_f(gmx_simd_float_t y, gmx_simd_float_t x)
1166 const gmx_simd_float_t pi = gmx_simd_set1_f(M_PI);
1167 const gmx_simd_float_t halfpi = gmx_simd_set1_f(M_PI/2.0);
1168 gmx_simd_float_t xinv, p, aoffset;
1169 gmx_simd_fbool_t mask_x0, mask_y0, mask_xlt0, mask_ylt0;
1171 mask_x0 = gmx_simd_cmpeq_f(x, gmx_simd_setzero_f());
1172 mask_y0 = gmx_simd_cmpeq_f(y, gmx_simd_setzero_f());
1173 mask_xlt0 = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
1174 mask_ylt0 = gmx_simd_cmplt_f(y, gmx_simd_setzero_f());
1176 aoffset = gmx_simd_blendzero_f(halfpi, mask_x0);
1177 aoffset = gmx_simd_blendnotzero_f(aoffset, mask_y0);
1179 aoffset = gmx_simd_blendv_f(aoffset, pi, mask_xlt0);
1180 aoffset = gmx_simd_blendv_f(aoffset, gmx_simd_fneg_f(aoffset), mask_ylt0);
1182 xinv = gmx_simd_blendnotzero_f(gmx_simd_inv_notmaskfpe_f(x, mask_x0), mask_x0);
1183 p = gmx_simd_mul_f(y, xinv);
1184 p = gmx_simd_atan_f(p);
1185 p = gmx_simd_add_f(p, aoffset);
1190 /*! \brief Calculate the force correction due to PME analytically in SIMD float.
1192 * You should normally call the real-precision routine \ref gmx_simd_pmecorrF_r.
1194 * \param z2 \f$(r \beta)^2\f$ - see below for details.
1195 * \result Correction factor to coulomb force - see below for details.
1197 * This routine is meant to enable analytical evaluation of the
1198 * direct-space PME electrostatic force to avoid tables.
1200 * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
1201 * are some problems evaluating that:
1203 * First, the error function is difficult (read: expensive) to
1204 * approxmiate accurately for intermediate to large arguments, and
1205 * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
1206 * Second, we now try to avoid calculating potentials in Gromacs but
1207 * use forces directly.
1209 * We can simply things slight by noting that the PME part is really
1210 * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
1212 * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
1214 * The first term we already have from the inverse square root, so
1215 * that we can leave out of this routine.
1217 * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
1218 * the argument \f$beta r\f$ will be in the range 0.15 to ~4, which is
1219 * the range used for the minimax fit. Use your favorite plotting program
1220 * to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is in this range!
1222 * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
1223 * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
1224 * then only use even powers. This is another minor optimization, since
1225 * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
1226 * the vector between the two atoms to get the vectorial force. The
1227 * fastest flops are the ones we can avoid calculating!
1229 * So, here's how it should be used:
1231 * 1. Calculate \f$r^2\f$.
1232 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
1233 * 3. Evaluate this routine with \f$z^2\f$ as the argument.
1234 * 4. The return value is the expression:
1237 * \frac{2 \exp{-z^2}}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
1240 * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
1243 * \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
1246 * or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
1249 * \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
1252 * With a bit of math exercise you should be able to confirm that
1256 * \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
1259 * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
1260 * and you have your force (divided by \f$r\f$). A final multiplication
1261 * with the vector connecting the two particles and you have your
1262 * vectorial force to add to the particles.
1264 * This approximation achieves an error slightly lower than 1e-6
1265 * in single precision and 1e-11 in double precision
1266 * for arguments smaller than 16 (\f$\beta r \leq 4 \f$);
1267 * when added to \f$1/r\f$ the error will be insignificant.
1268 * For \f$\beta r \geq 7206\f$ the return value can be inf or NaN.
1271 static gmx_inline gmx_simd_float_t gmx_simdcall
1272 gmx_simd_pmecorrF_f(gmx_simd_float_t z2)
1274 const gmx_simd_float_t FN6 = gmx_simd_set1_f(-1.7357322914161492954e-8f);
1275 const gmx_simd_float_t FN5 = gmx_simd_set1_f(1.4703624142580877519e-6f);
1276 const gmx_simd_float_t FN4 = gmx_simd_set1_f(-0.000053401640219807709149f);
1277 const gmx_simd_float_t FN3 = gmx_simd_set1_f(0.0010054721316683106153f);
1278 const gmx_simd_float_t FN2 = gmx_simd_set1_f(-0.019278317264888380590f);
1279 const gmx_simd_float_t FN1 = gmx_simd_set1_f(0.069670166153766424023f);
1280 const gmx_simd_float_t FN0 = gmx_simd_set1_f(-0.75225204789749321333f);
1282 const gmx_simd_float_t FD4 = gmx_simd_set1_f(0.0011193462567257629232f);
1283 const gmx_simd_float_t FD3 = gmx_simd_set1_f(0.014866955030185295499f);
1284 const gmx_simd_float_t FD2 = gmx_simd_set1_f(0.11583842382862377919f);
1285 const gmx_simd_float_t FD1 = gmx_simd_set1_f(0.50736591960530292870f);
1286 const gmx_simd_float_t FD0 = gmx_simd_set1_f(1.0f);
1288 gmx_simd_float_t z4;
1289 gmx_simd_float_t polyFN0, polyFN1, polyFD0, polyFD1;
1291 z4 = gmx_simd_mul_f(z2, z2);
1293 polyFD0 = gmx_simd_fmadd_f(FD4, z4, FD2);
1294 polyFD1 = gmx_simd_fmadd_f(FD3, z4, FD1);
1295 polyFD0 = gmx_simd_fmadd_f(polyFD0, z4, FD0);
1296 polyFD0 = gmx_simd_fmadd_f(polyFD1, z2, polyFD0);
1298 polyFD0 = gmx_simd_inv_f(polyFD0);
1300 polyFN0 = gmx_simd_fmadd_f(FN6, z4, FN4);
1301 polyFN1 = gmx_simd_fmadd_f(FN5, z4, FN3);
1302 polyFN0 = gmx_simd_fmadd_f(polyFN0, z4, FN2);
1303 polyFN1 = gmx_simd_fmadd_f(polyFN1, z4, FN1);
1304 polyFN0 = gmx_simd_fmadd_f(polyFN0, z4, FN0);
1305 polyFN0 = gmx_simd_fmadd_f(polyFN1, z2, polyFN0);
1307 return gmx_simd_mul_f(polyFN0, polyFD0);
1312 /*! \brief Calculate the potential correction due to PME analytically in SIMD float.
1314 * You should normally call the real-precision routine \ref gmx_simd_pmecorrV_r.
1316 * \param z2 \f$(r \beta)^2\f$ - see below for details.
1317 * \result Correction factor to coulomb potential - see below for details.
1319 * See \ref gmx_simd_pmecorrF_f for details about the approximation.
1321 * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
1322 * as the input argument.
1324 * Here's how it should be used:
1326 * 1. Calculate \f$r^2\f$.
1327 * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
1328 * 3. Evaluate this routine with z^2 as the argument.
1329 * 4. The return value is the expression:
1332 * \frac{\mbox{erf}(z)}{z}
1335 * 5. Multiply the entire expression by beta and switching back to \f$r\f$ (since \f$z=r \beta\f$):
1338 * \frac{\mbox{erf}(r \beta)}{r}
1341 * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
1342 * and you have your potential.
1344 * This approximation achieves an error slightly lower than 1e-6
1345 * in single precision and 4e-11 in double precision
1346 * for arguments smaller than 16 (\f$ 0.15 \leq \beta r \leq 4 \f$);
1347 * for \f$ \beta r \leq 0.15\f$ the error can be twice as high;
1348 * when added to \f$1/r\f$ the error will be insignificant.
1349 * For \f$\beta r \geq 7142\f$ the return value can be inf or NaN.
1351 static gmx_inline gmx_simd_float_t gmx_simdcall
1352 gmx_simd_pmecorrV_f(gmx_simd_float_t z2)
1354 const gmx_simd_float_t VN6 = gmx_simd_set1_f(1.9296833005951166339e-8f);
1355 const gmx_simd_float_t VN5 = gmx_simd_set1_f(-1.4213390571557850962e-6f);
1356 const gmx_simd_float_t VN4 = gmx_simd_set1_f(0.000041603292906656984871f);
1357 const gmx_simd_float_t VN3 = gmx_simd_set1_f(-0.00013134036773265025626f);
1358 const gmx_simd_float_t VN2 = gmx_simd_set1_f(0.038657983986041781264f);
1359 const gmx_simd_float_t VN1 = gmx_simd_set1_f(0.11285044772717598220f);
1360 const gmx_simd_float_t VN0 = gmx_simd_set1_f(1.1283802385263030286f);
1362 const gmx_simd_float_t VD3 = gmx_simd_set1_f(0.0066752224023576045451f);
1363 const gmx_simd_float_t VD2 = gmx_simd_set1_f(0.078647795836373922256f);
1364 const gmx_simd_float_t VD1 = gmx_simd_set1_f(0.43336185284710920150f);
1365 const gmx_simd_float_t VD0 = gmx_simd_set1_f(1.0f);
1367 gmx_simd_float_t z4;
1368 gmx_simd_float_t polyVN0, polyVN1, polyVD0, polyVD1;
1370 z4 = gmx_simd_mul_f(z2, z2);
1372 polyVD1 = gmx_simd_fmadd_f(VD3, z4, VD1);
1373 polyVD0 = gmx_simd_fmadd_f(VD2, z4, VD0);
1374 polyVD0 = gmx_simd_fmadd_f(polyVD1, z2, polyVD0);
1376 polyVD0 = gmx_simd_inv_f(polyVD0);
1378 polyVN0 = gmx_simd_fmadd_f(VN6, z4, VN4);
1379 polyVN1 = gmx_simd_fmadd_f(VN5, z4, VN3);
1380 polyVN0 = gmx_simd_fmadd_f(polyVN0, z4, VN2);
1381 polyVN1 = gmx_simd_fmadd_f(polyVN1, z4, VN1);
1382 polyVN0 = gmx_simd_fmadd_f(polyVN0, z4, VN0);
1383 polyVN0 = gmx_simd_fmadd_f(polyVN1, z2, polyVN0);
1385 return gmx_simd_mul_f(polyVN0, polyVD0);
1391 #ifdef GMX_SIMD_HAVE_DOUBLE
1393 /*! \name Double precision SIMD math functions
1395 * \note In most cases you should use the real-precision functions instead.
1399 /****************************************
1400 * DOUBLE PRECISION SIMD MATH FUNCTIONS *
1401 ****************************************/
1403 /*! \brief SIMD utility function to sum a+b+c+d for SIMD doubles.
1405 * \copydetails gmx_simd_sum4_f
1407 static gmx_inline gmx_simd_double_t gmx_simdcall
1408 gmx_simd_sum4_d(gmx_simd_double_t a, gmx_simd_double_t b,
1409 gmx_simd_double_t c, gmx_simd_double_t d)
1411 return gmx_simd_add_d(gmx_simd_add_d(a, b), gmx_simd_add_d(c, d));
1414 /*! \brief Return -a if b is negative, SIMD double.
1416 * You should normally call the real-precision routine \ref gmx_simd_xor_sign_r.
1418 * \param a Values to set sign for
1419 * \param b Values used to set sign
1420 * \return if b is negative, the sign of a will be changed.
1422 * This is equivalent to doing an xor operation on a with the sign bit of b,
1423 * with the exception that negative zero is not considered to be negative
1424 * on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
1426 static gmx_inline gmx_simd_double_t gmx_simdcall
1427 gmx_simd_xor_sign_d(gmx_simd_double_t a, gmx_simd_double_t b)
1429 #ifdef GMX_SIMD_HAVE_LOGICAL
1430 return gmx_simd_xor_d(a, gmx_simd_and_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), b));
1432 return gmx_simd_blendv_d(a, gmx_simd_fneg_d(a), gmx_simd_cmplt_d(b, gmx_simd_setzero_d()));
1436 #ifndef gmx_simd_rsqrt_iter_d
1437 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD double.
1439 * \copydetails gmx_simd_rsqrt_iter_f
1441 static gmx_inline gmx_simd_double_t gmx_simdcall
1442 gmx_simd_rsqrt_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
1444 #ifdef GMX_SIMD_HAVE_FMA
1445 return gmx_simd_fmadd_d(gmx_simd_fnmadd_d(x, gmx_simd_mul_d(lu, lu), gmx_simd_set1_d(1.0)), gmx_simd_mul_d(lu, gmx_simd_set1_d(0.5)), lu);
1447 return gmx_simd_mul_d(gmx_simd_set1_d(0.5), gmx_simd_mul_d(gmx_simd_sub_d(gmx_simd_set1_d(3.0), gmx_simd_mul_d(gmx_simd_mul_d(lu, lu), x)), lu));
1452 /*! \brief Calculate 1/sqrt(x) for SIMD double
1454 * \copydetails gmx_simd_invsqrt_f
1456 static gmx_inline gmx_simd_double_t gmx_simdcall
1457 gmx_simd_invsqrt_d(gmx_simd_double_t x)
1459 gmx_simd_double_t lu = gmx_simd_rsqrt_d(x);
1460 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1461 lu = gmx_simd_rsqrt_iter_d(lu, x);
1463 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1464 lu = gmx_simd_rsqrt_iter_d(lu, x);
1466 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1467 lu = gmx_simd_rsqrt_iter_d(lu, x);
1469 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1470 lu = gmx_simd_rsqrt_iter_d(lu, x);
1475 /*! \brief Calculate 1/sqrt(x) for masked entries of SIMD double.
1477 * \copydetails gmx_simd_invsqrt_maskfpe_f
1480 #define gmx_simd_invsqrt_maskfpe_d(x, m) gmx_simd_invsqrt_d(x)
1482 static gmx_inline gmx_simd_double_t
1483 gmx_simd_invsqrt_maskfpe_d(gmx_simd_double_t x, gmx_simd_dbool_t m)
1485 return gmx_simd_invsqrt_d(gmx_simd_blendv_d(gmx_simd_set1_d(1.0f), x, m));
1489 /*! \brief Calculate 1/sqrt(x) for non-masked entries of SIMD double.
1491 * \copydetails gmx_simd_invsqrt_notmaskfpe_f
1494 #define gmx_simd_invsqrt_notmaskfpe_d(x, m) gmx_simd_invsqrt_d(x)
1496 static gmx_inline gmx_simd_double_t
1497 gmx_simd_invsqrt_notmaskfpe_d(gmx_simd_double_t x, gmx_simd_dbool_t m)
1499 return gmx_simd_invsqrt_d(gmx_simd_blendv_d(x, gmx_simd_set1_d(1.0f), m));
1503 /*! \brief Calculate 1/sqrt(x) for two SIMD doubles.
1505 * \copydetails gmx_simd_invsqrt_pair_f
1507 static gmx_inline void gmx_simdcall
1508 gmx_simd_invsqrt_pair_d(gmx_simd_double_t x0, gmx_simd_double_t x1,
1509 gmx_simd_double_t *out0, gmx_simd_double_t *out1)
1511 #if (defined GMX_SIMD_HAVE_FLOAT) && (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH) && (GMX_SIMD_RSQRT_BITS < 22)
1512 gmx_simd_float_t xf = gmx_simd_cvt_dd2f(x0, x1);
1513 gmx_simd_float_t luf = gmx_simd_rsqrt_f(xf);
1514 gmx_simd_double_t lu0, lu1;
1515 /* Intermediate target is single - mantissa+1 bits */
1516 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
1517 luf = gmx_simd_rsqrt_iter_f(luf, xf);
1519 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
1520 luf = gmx_simd_rsqrt_iter_f(luf, xf);
1522 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
1523 luf = gmx_simd_rsqrt_iter_f(luf, xf);
1525 gmx_simd_cvt_f2dd(luf, &lu0, &lu1);
1526 /* Last iteration(s) performed in double - if we had 22 bits, this gets us to 44 (~1e-15) */
1527 #if (GMX_SIMD_MATH_TARGET_SINGLE_BITS < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1528 lu0 = gmx_simd_rsqrt_iter_d(lu0, x0);
1529 lu1 = gmx_simd_rsqrt_iter_d(lu1, x1);
1531 #if (GMX_SIMD_MATH_TARGET_SINGLE_BITS*2 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1532 lu0 = gmx_simd_rsqrt_iter_d(lu0, x0);
1533 lu1 = gmx_simd_rsqrt_iter_d(lu1, x1);
1538 *out0 = gmx_simd_invsqrt_d(x0);
1539 *out1 = gmx_simd_invsqrt_d(x1);
1543 #ifndef gmx_simd_rcp_iter_d
1544 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD double.
1546 * \copydetails gmx_simd_rcp_iter_f
1548 static gmx_inline gmx_simd_double_t gmx_simdcall
1549 gmx_simd_rcp_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
1551 return gmx_simd_mul_d(lu, gmx_simd_fnmadd_d(lu, x, gmx_simd_set1_d(2.0)));
1555 /*! \brief Calculate 1/x for SIMD double.
1557 * \copydetails gmx_simd_inv_f
1559 static gmx_inline gmx_simd_double_t gmx_simdcall
1560 gmx_simd_inv_d(gmx_simd_double_t x)
1562 gmx_simd_double_t lu = gmx_simd_rcp_d(x);
1563 #if (GMX_SIMD_RCP_BITS < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1564 lu = gmx_simd_rcp_iter_d(lu, x);
1566 #if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1567 lu = gmx_simd_rcp_iter_d(lu, x);
1569 #if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1570 lu = gmx_simd_rcp_iter_d(lu, x);
1572 #if (GMX_SIMD_RCP_BITS*8 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
1573 lu = gmx_simd_rcp_iter_d(lu, x);
1578 /*! \brief Calculate 1/x for masked entries of SIMD double.
1580 * \copydetails gmx_simd_inv_maskfpe_f
1583 #define gmx_simd_inv_maskfpe_d(x, m) gmx_simd_inv_d(x)
1585 static gmx_inline gmx_simd_double_t
1586 gmx_simd_inv_maskfpe_d(gmx_simd_double_t x, gmx_simd_dbool_t m)
1588 return gmx_simd_inv_d(gmx_simd_blendv_d(gmx_simd_set1_d(1.0f), x, m));
1592 /*! \brief Calculate 1/x for non-masked entries of SIMD double.
1594 * \copydetails gmx_simd_inv_notmaskfpe_f
1597 #define gmx_simd_inv_notmaskfpe_d(x, m) gmx_simd_inv_d(x)
1599 static gmx_inline gmx_simd_double_t
1600 gmx_simd_inv_notmaskfpe_d(gmx_simd_double_t x, gmx_simd_dbool_t m)
1602 return gmx_simd_inv_d(gmx_simd_blendv_d(x, gmx_simd_set1_d(1.0f), m));
1606 /*! \brief Calculate sqrt(x) correctly for SIMD doubles, including argument 0.0.
1608 * \copydetails gmx_simd_sqrt_f
1610 static gmx_inline gmx_simd_double_t gmx_simdcall
1611 gmx_simd_sqrt_d(gmx_simd_double_t x)
1613 gmx_simd_dbool_t mask;
1614 gmx_simd_double_t res;
1616 mask = gmx_simd_cmpeq_d(x, gmx_simd_setzero_d());
1617 res = gmx_simd_blendnotzero_d(gmx_simd_invsqrt_notmaskfpe_d(x, mask), mask);
1618 return gmx_simd_mul_d(res, x);
1621 /*! \brief SIMD double log(x). This is the natural logarithm.
1623 * \copydetails gmx_simd_log_f
1625 static gmx_inline gmx_simd_double_t gmx_simdcall
1626 gmx_simd_log_d(gmx_simd_double_t x)
1628 const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
1629 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
1630 const gmx_simd_double_t sqrt2 = gmx_simd_set1_d(sqrt(2.0));
1631 const gmx_simd_double_t corr = gmx_simd_set1_d(0.693147180559945286226764);
1632 const gmx_simd_double_t CL15 = gmx_simd_set1_d(0.148197055177935105296783);
1633 const gmx_simd_double_t CL13 = gmx_simd_set1_d(0.153108178020442575739679);
1634 const gmx_simd_double_t CL11 = gmx_simd_set1_d(0.181837339521549679055568);
1635 const gmx_simd_double_t CL9 = gmx_simd_set1_d(0.22222194152736701733275);
1636 const gmx_simd_double_t CL7 = gmx_simd_set1_d(0.285714288030134544449368);
1637 const gmx_simd_double_t CL5 = gmx_simd_set1_d(0.399999999989941956712869);
1638 const gmx_simd_double_t CL3 = gmx_simd_set1_d(0.666666666666685503450651);
1639 const gmx_simd_double_t CL1 = gmx_simd_set1_d(2.0);
1640 gmx_simd_double_t fexp, x2, p;
1641 gmx_simd_dbool_t mask;
1643 fexp = gmx_simd_get_exponent_d(x);
1644 x = gmx_simd_get_mantissa_d(x);
1646 mask = gmx_simd_cmplt_d(sqrt2, x);
1647 /* Adjust to non-IEEE format for x>sqrt(2): exponent += 1, mantissa *= 0.5 */
1648 fexp = gmx_simd_add_d(fexp, gmx_simd_blendzero_d(one, mask));
1649 x = gmx_simd_mul_d(x, gmx_simd_blendv_d(one, half, mask));
1651 x = gmx_simd_mul_d( gmx_simd_sub_d(x, one), gmx_simd_inv_d( gmx_simd_add_d(x, one) ) );
1652 x2 = gmx_simd_mul_d(x, x);
1654 p = gmx_simd_fmadd_d(CL15, x2, CL13);
1655 p = gmx_simd_fmadd_d(p, x2, CL11);
1656 p = gmx_simd_fmadd_d(p, x2, CL9);
1657 p = gmx_simd_fmadd_d(p, x2, CL7);
1658 p = gmx_simd_fmadd_d(p, x2, CL5);
1659 p = gmx_simd_fmadd_d(p, x2, CL3);
1660 p = gmx_simd_fmadd_d(p, x2, CL1);
1661 p = gmx_simd_fmadd_d(p, x, gmx_simd_mul_d(corr, fexp));
1666 /*! \brief SIMD double 2^x.
1668 * \copydetails gmx_simd_exp2_f
1670 static gmx_inline gmx_simd_double_t gmx_simdcall
1671 gmx_simd_exp2_d(gmx_simd_double_t x)
1673 const gmx_simd_double_t arglimit = gmx_simd_set1_d(1022.0);
1674 const gmx_simd_double_t CE11 = gmx_simd_set1_d(4.435280790452730022081181e-10);
1675 const gmx_simd_double_t CE10 = gmx_simd_set1_d(7.074105630863314448024247e-09);
1676 const gmx_simd_double_t CE9 = gmx_simd_set1_d(1.017819803432096698472621e-07);
1677 const gmx_simd_double_t CE8 = gmx_simd_set1_d(1.321543308956718799557863e-06);
1678 const gmx_simd_double_t CE7 = gmx_simd_set1_d(0.00001525273348995851746990884);
1679 const gmx_simd_double_t CE6 = gmx_simd_set1_d(0.0001540353046251466849082632);
1680 const gmx_simd_double_t CE5 = gmx_simd_set1_d(0.001333355814678995257307880);
1681 const gmx_simd_double_t CE4 = gmx_simd_set1_d(0.009618129107588335039176502);
1682 const gmx_simd_double_t CE3 = gmx_simd_set1_d(0.05550410866481992147457793);
1683 const gmx_simd_double_t CE2 = gmx_simd_set1_d(0.2402265069591015620470894);
1684 const gmx_simd_double_t CE1 = gmx_simd_set1_d(0.6931471805599453304615075);
1685 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
1686 gmx_simd_double_t fexppart;
1687 gmx_simd_double_t intpart;
1688 gmx_simd_double_t p;
1689 gmx_simd_dbool_t valuemask;
1691 fexppart = gmx_simd_set_exponent_d(x); /* rounds to nearest int internally */
1692 intpart = gmx_simd_round_d(x); /* use same rounding mode here */
1693 valuemask = gmx_simd_cmple_d(gmx_simd_fabs_d(x), arglimit);
1694 fexppart = gmx_simd_blendzero_d(fexppart, valuemask);
1695 x = gmx_simd_sub_d(x, intpart);
1697 p = gmx_simd_fmadd_d(CE11, x, CE10);
1698 p = gmx_simd_fmadd_d(p, x, CE9);
1699 p = gmx_simd_fmadd_d(p, x, CE8);
1700 p = gmx_simd_fmadd_d(p, x, CE7);
1701 p = gmx_simd_fmadd_d(p, x, CE6);
1702 p = gmx_simd_fmadd_d(p, x, CE5);
1703 p = gmx_simd_fmadd_d(p, x, CE4);
1704 p = gmx_simd_fmadd_d(p, x, CE3);
1705 p = gmx_simd_fmadd_d(p, x, CE2);
1706 p = gmx_simd_fmadd_d(p, x, CE1);
1707 p = gmx_simd_fmadd_d(p, x, one);
1708 x = gmx_simd_mul_d(p, fexppart);
1712 /*! \brief SIMD double exp(x).
1714 * \copydetails gmx_simd_exp_f
1716 static gmx_inline gmx_simd_double_t gmx_simdcall
1717 gmx_simd_exp_d(gmx_simd_double_t x)
1719 const gmx_simd_double_t argscale = gmx_simd_set1_d(1.44269504088896340735992468100);
1720 const gmx_simd_double_t arglimit = gmx_simd_set1_d(1022.0);
1721 const gmx_simd_double_t invargscale0 = gmx_simd_set1_d(-0.69314718055966295651160180568695068359375);
1722 const gmx_simd_double_t invargscale1 = gmx_simd_set1_d(-2.8235290563031577122588448175013436025525412068e-13);
1723 const gmx_simd_double_t CE12 = gmx_simd_set1_d(2.078375306791423699350304e-09);
1724 const gmx_simd_double_t CE11 = gmx_simd_set1_d(2.518173854179933105218635e-08);
1725 const gmx_simd_double_t CE10 = gmx_simd_set1_d(2.755842049600488770111608e-07);
1726 const gmx_simd_double_t CE9 = gmx_simd_set1_d(2.755691815216689746619849e-06);
1727 const gmx_simd_double_t CE8 = gmx_simd_set1_d(2.480158383706245033920920e-05);
1728 const gmx_simd_double_t CE7 = gmx_simd_set1_d(0.0001984127043518048611841321);
1729 const gmx_simd_double_t CE6 = gmx_simd_set1_d(0.001388888889360258341755930);
1730 const gmx_simd_double_t CE5 = gmx_simd_set1_d(0.008333333332907368102819109);
1731 const gmx_simd_double_t CE4 = gmx_simd_set1_d(0.04166666666663836745814631);
1732 const gmx_simd_double_t CE3 = gmx_simd_set1_d(0.1666666666666796929434570);
1733 const gmx_simd_double_t CE2 = gmx_simd_set1_d(0.5);
1734 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
1735 gmx_simd_double_t fexppart;
1736 gmx_simd_double_t intpart;
1737 gmx_simd_double_t y, p;
1738 gmx_simd_dbool_t valuemask;
1740 y = gmx_simd_mul_d(x, argscale);
1741 fexppart = gmx_simd_set_exponent_d(y); /* rounds to nearest int internally */
1742 intpart = gmx_simd_round_d(y); /* use same rounding mode here */
1743 valuemask = gmx_simd_cmple_d(gmx_simd_fabs_d(y), arglimit);
1744 fexppart = gmx_simd_blendzero_d(fexppart, valuemask);
1746 /* Extended precision arithmetics */
1747 x = gmx_simd_fmadd_d(invargscale0, intpart, x);
1748 x = gmx_simd_fmadd_d(invargscale1, intpart, x);
1750 p = gmx_simd_fmadd_d(CE12, x, CE11);
1751 p = gmx_simd_fmadd_d(p, x, CE10);
1752 p = gmx_simd_fmadd_d(p, x, CE9);
1753 p = gmx_simd_fmadd_d(p, x, CE8);
1754 p = gmx_simd_fmadd_d(p, x, CE7);
1755 p = gmx_simd_fmadd_d(p, x, CE6);
1756 p = gmx_simd_fmadd_d(p, x, CE5);
1757 p = gmx_simd_fmadd_d(p, x, CE4);
1758 p = gmx_simd_fmadd_d(p, x, CE3);
1759 p = gmx_simd_fmadd_d(p, x, CE2);
1760 p = gmx_simd_fmadd_d(p, gmx_simd_mul_d(x, x), gmx_simd_add_d(x, one));
1761 x = gmx_simd_mul_d(p, fexppart);
1765 /*! \brief SIMD double erf(x).
1767 * \copydetails gmx_simd_erf_f
1769 static gmx_inline gmx_simd_double_t gmx_simdcall
1770 gmx_simd_erf_d(gmx_simd_double_t x)
1772 /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
1773 const gmx_simd_double_t CAP4 = gmx_simd_set1_d(-0.431780540597889301512e-4);
1774 const gmx_simd_double_t CAP3 = gmx_simd_set1_d(-0.00578562306260059236059);
1775 const gmx_simd_double_t CAP2 = gmx_simd_set1_d(-0.028593586920219752446);
1776 const gmx_simd_double_t CAP1 = gmx_simd_set1_d(-0.315924962948621698209);
1777 const gmx_simd_double_t CAP0 = gmx_simd_set1_d(0.14952975608477029151);
1779 const gmx_simd_double_t CAQ5 = gmx_simd_set1_d(-0.374089300177174709737e-5);
1780 const gmx_simd_double_t CAQ4 = gmx_simd_set1_d(0.00015126584532155383535);
1781 const gmx_simd_double_t CAQ3 = gmx_simd_set1_d(0.00536692680669480725423);
1782 const gmx_simd_double_t CAQ2 = gmx_simd_set1_d(0.0668686825594046122636);
1783 const gmx_simd_double_t CAQ1 = gmx_simd_set1_d(0.402604990869284362773);
1785 const gmx_simd_double_t CAoffset = gmx_simd_set1_d(0.9788494110107421875);
1787 /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
1788 const gmx_simd_double_t CBP6 = gmx_simd_set1_d(2.49650423685462752497647637088e-10);
1789 const gmx_simd_double_t CBP5 = gmx_simd_set1_d(0.00119770193298159629350136085658);
1790 const gmx_simd_double_t CBP4 = gmx_simd_set1_d(0.0164944422378370965881008942733);
1791 const gmx_simd_double_t CBP3 = gmx_simd_set1_d(0.0984581468691775932063932439252);
1792 const gmx_simd_double_t CBP2 = gmx_simd_set1_d(0.317364595806937763843589437418);
1793 const gmx_simd_double_t CBP1 = gmx_simd_set1_d(0.554167062641455850932670067075);
1794 const gmx_simd_double_t CBP0 = gmx_simd_set1_d(0.427583576155807163756925301060);
1795 const gmx_simd_double_t CBQ7 = gmx_simd_set1_d(0.00212288829699830145976198384930);
1796 const gmx_simd_double_t CBQ6 = gmx_simd_set1_d(0.0334810979522685300554606393425);
1797 const gmx_simd_double_t CBQ5 = gmx_simd_set1_d(0.2361713785181450957579508850717);
1798 const gmx_simd_double_t CBQ4 = gmx_simd_set1_d(0.955364736493055670530981883072);
1799 const gmx_simd_double_t CBQ3 = gmx_simd_set1_d(2.36815675631420037315349279199);
1800 const gmx_simd_double_t CBQ2 = gmx_simd_set1_d(3.55261649184083035537184223542);
1801 const gmx_simd_double_t CBQ1 = gmx_simd_set1_d(2.93501136050160872574376997993);
1804 /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
1805 const gmx_simd_double_t CCP6 = gmx_simd_set1_d(-2.8175401114513378771);
1806 const gmx_simd_double_t CCP5 = gmx_simd_set1_d(-3.22729451764143718517);
1807 const gmx_simd_double_t CCP4 = gmx_simd_set1_d(-2.5518551727311523996);
1808 const gmx_simd_double_t CCP3 = gmx_simd_set1_d(-0.687717681153649930619);
1809 const gmx_simd_double_t CCP2 = gmx_simd_set1_d(-0.212652252872804219852);
1810 const gmx_simd_double_t CCP1 = gmx_simd_set1_d(0.0175389834052493308818);
1811 const gmx_simd_double_t CCP0 = gmx_simd_set1_d(0.00628057170626964891937);
1813 const gmx_simd_double_t CCQ6 = gmx_simd_set1_d(5.48409182238641741584);
1814 const gmx_simd_double_t CCQ5 = gmx_simd_set1_d(13.5064170191802889145);
1815 const gmx_simd_double_t CCQ4 = gmx_simd_set1_d(22.9367376522880577224);
1816 const gmx_simd_double_t CCQ3 = gmx_simd_set1_d(15.930646027911794143);
1817 const gmx_simd_double_t CCQ2 = gmx_simd_set1_d(11.0567237927800161565);
1818 const gmx_simd_double_t CCQ1 = gmx_simd_set1_d(2.79257750980575282228);
1820 const gmx_simd_double_t CCoffset = gmx_simd_set1_d(0.5579090118408203125);
1822 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
1823 const gmx_simd_double_t two = gmx_simd_set1_d(2.0);
1825 gmx_simd_double_t xabs, x2, x4, t, t2, w, w2;
1826 gmx_simd_double_t PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
1827 gmx_simd_double_t PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
1828 gmx_simd_double_t PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
1829 gmx_simd_double_t res_erf, res_erfcB, res_erfcC, res_erfc, res;
1830 gmx_simd_double_t expmx2;
1831 gmx_simd_dbool_t mask, mask_erf;
1833 /* Calculate erf() */
1834 xabs = gmx_simd_fabs_d(x);
1835 mask_erf = gmx_simd_cmplt_d(xabs, one);
1836 x2 = gmx_simd_mul_d(x, x);
1837 x4 = gmx_simd_mul_d(x2, x2);
1839 PolyAP0 = gmx_simd_mul_d(CAP4, x4);
1840 PolyAP1 = gmx_simd_mul_d(CAP3, x4);
1841 PolyAP0 = gmx_simd_add_d(PolyAP0, CAP2);
1842 PolyAP1 = gmx_simd_add_d(PolyAP1, CAP1);
1843 PolyAP0 = gmx_simd_mul_d(PolyAP0, x4);
1844 PolyAP1 = gmx_simd_mul_d(PolyAP1, x2);
1845 PolyAP0 = gmx_simd_add_d(PolyAP0, CAP0);
1846 PolyAP0 = gmx_simd_add_d(PolyAP0, PolyAP1);
1848 PolyAQ1 = gmx_simd_mul_d(CAQ5, x4);
1849 PolyAQ0 = gmx_simd_mul_d(CAQ4, x4);
1850 PolyAQ1 = gmx_simd_add_d(PolyAQ1, CAQ3);
1851 PolyAQ0 = gmx_simd_add_d(PolyAQ0, CAQ2);
1852 PolyAQ1 = gmx_simd_mul_d(PolyAQ1, x4);
1853 PolyAQ0 = gmx_simd_mul_d(PolyAQ0, x4);
1854 PolyAQ1 = gmx_simd_add_d(PolyAQ1, CAQ1);
1855 PolyAQ0 = gmx_simd_add_d(PolyAQ0, one);
1856 PolyAQ1 = gmx_simd_mul_d(PolyAQ1, x2);
1857 PolyAQ0 = gmx_simd_add_d(PolyAQ0, PolyAQ1);
1859 res_erf = gmx_simd_mul_d(PolyAP0, gmx_simd_inv_maskfpe_d(PolyAQ0, mask_erf));
1860 res_erf = gmx_simd_add_d(CAoffset, res_erf);
1861 res_erf = gmx_simd_mul_d(x, res_erf);
1863 /* Calculate erfc() in range [1,4.5] */
1864 t = gmx_simd_sub_d(xabs, one);
1865 t2 = gmx_simd_mul_d(t, t);
1867 PolyBP0 = gmx_simd_mul_d(CBP6, t2);
1868 PolyBP1 = gmx_simd_mul_d(CBP5, t2);
1869 PolyBP0 = gmx_simd_add_d(PolyBP0, CBP4);
1870 PolyBP1 = gmx_simd_add_d(PolyBP1, CBP3);
1871 PolyBP0 = gmx_simd_mul_d(PolyBP0, t2);
1872 PolyBP1 = gmx_simd_mul_d(PolyBP1, t2);
1873 PolyBP0 = gmx_simd_add_d(PolyBP0, CBP2);
1874 PolyBP1 = gmx_simd_add_d(PolyBP1, CBP1);
1875 PolyBP0 = gmx_simd_mul_d(PolyBP0, t2);
1876 PolyBP1 = gmx_simd_mul_d(PolyBP1, t);
1877 PolyBP0 = gmx_simd_add_d(PolyBP0, CBP0);
1878 PolyBP0 = gmx_simd_add_d(PolyBP0, PolyBP1);
1880 PolyBQ1 = gmx_simd_mul_d(CBQ7, t2);
1881 PolyBQ0 = gmx_simd_mul_d(CBQ6, t2);
1882 PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ5);
1883 PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ4);
1884 PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
1885 PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
1886 PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ3);
1887 PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ2);
1888 PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
1889 PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
1890 PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ1);
1891 PolyBQ0 = gmx_simd_add_d(PolyBQ0, one);
1892 PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t);
1893 PolyBQ0 = gmx_simd_add_d(PolyBQ0, PolyBQ1);
1895 res_erfcB = gmx_simd_mul_d(PolyBP0, gmx_simd_inv_notmaskfpe_d(PolyBQ0, mask_erf));
1897 res_erfcB = gmx_simd_mul_d(res_erfcB, xabs);
1899 /* Calculate erfc() in range [4.5,inf] */
1900 w = gmx_simd_inv_notmaskfpe_d(xabs, mask_erf);
1901 w2 = gmx_simd_mul_d(w, w);
1903 PolyCP0 = gmx_simd_mul_d(CCP6, w2);
1904 PolyCP1 = gmx_simd_mul_d(CCP5, w2);
1905 PolyCP0 = gmx_simd_add_d(PolyCP0, CCP4);
1906 PolyCP1 = gmx_simd_add_d(PolyCP1, CCP3);
1907 PolyCP0 = gmx_simd_mul_d(PolyCP0, w2);
1908 PolyCP1 = gmx_simd_mul_d(PolyCP1, w2);
1909 PolyCP0 = gmx_simd_add_d(PolyCP0, CCP2);
1910 PolyCP1 = gmx_simd_add_d(PolyCP1, CCP1);
1911 PolyCP0 = gmx_simd_mul_d(PolyCP0, w2);
1912 PolyCP1 = gmx_simd_mul_d(PolyCP1, w);
1913 PolyCP0 = gmx_simd_add_d(PolyCP0, CCP0);
1914 PolyCP0 = gmx_simd_add_d(PolyCP0, PolyCP1);
1916 PolyCQ0 = gmx_simd_mul_d(CCQ6, w2);
1917 PolyCQ1 = gmx_simd_mul_d(CCQ5, w2);
1918 PolyCQ0 = gmx_simd_add_d(PolyCQ0, CCQ4);
1919 PolyCQ1 = gmx_simd_add_d(PolyCQ1, CCQ3);
1920 PolyCQ0 = gmx_simd_mul_d(PolyCQ0, w2);
1921 PolyCQ1 = gmx_simd_mul_d(PolyCQ1, w2);
1922 PolyCQ0 = gmx_simd_add_d(PolyCQ0, CCQ2);
1923 PolyCQ1 = gmx_simd_add_d(PolyCQ1, CCQ1);
1924 PolyCQ0 = gmx_simd_mul_d(PolyCQ0, w2);
1925 PolyCQ1 = gmx_simd_mul_d(PolyCQ1, w);
1926 PolyCQ0 = gmx_simd_add_d(PolyCQ0, one);
1927 PolyCQ0 = gmx_simd_add_d(PolyCQ0, PolyCQ1);
1929 expmx2 = gmx_simd_exp_d( gmx_simd_fneg_d(x2) );
1931 res_erfcC = gmx_simd_mul_d(PolyCP0, gmx_simd_inv_notmaskfpe_d(PolyCQ0, mask_erf));
1932 res_erfcC = gmx_simd_add_d(res_erfcC, CCoffset);
1933 res_erfcC = gmx_simd_mul_d(res_erfcC, w);
1935 mask = gmx_simd_cmplt_d(gmx_simd_set1_d(4.5), xabs);
1936 res_erfc = gmx_simd_blendv_d(res_erfcB, res_erfcC, mask);
1938 res_erfc = gmx_simd_mul_d(res_erfc, expmx2);
1940 /* erfc(x<0) = 2-erfc(|x|) */
1941 mask = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
1942 res_erfc = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(two, res_erfc), mask);
1944 /* Select erf() or erfc() */
1945 res = gmx_simd_blendv_d(gmx_simd_sub_d(one, res_erfc), res_erf, mask_erf);
1950 /*! \brief SIMD double erfc(x).
1952 * \copydetails gmx_simd_erfc_f
1954 static gmx_inline gmx_simd_double_t gmx_simdcall
1955 gmx_simd_erfc_d(gmx_simd_double_t x)
1957 /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
1958 const gmx_simd_double_t CAP4 = gmx_simd_set1_d(-0.431780540597889301512e-4);
1959 const gmx_simd_double_t CAP3 = gmx_simd_set1_d(-0.00578562306260059236059);
1960 const gmx_simd_double_t CAP2 = gmx_simd_set1_d(-0.028593586920219752446);
1961 const gmx_simd_double_t CAP1 = gmx_simd_set1_d(-0.315924962948621698209);
1962 const gmx_simd_double_t CAP0 = gmx_simd_set1_d(0.14952975608477029151);
1964 const gmx_simd_double_t CAQ5 = gmx_simd_set1_d(-0.374089300177174709737e-5);
1965 const gmx_simd_double_t CAQ4 = gmx_simd_set1_d(0.00015126584532155383535);
1966 const gmx_simd_double_t CAQ3 = gmx_simd_set1_d(0.00536692680669480725423);
1967 const gmx_simd_double_t CAQ2 = gmx_simd_set1_d(0.0668686825594046122636);
1968 const gmx_simd_double_t CAQ1 = gmx_simd_set1_d(0.402604990869284362773);
1970 const gmx_simd_double_t CAoffset = gmx_simd_set1_d(0.9788494110107421875);
1972 /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
1973 const gmx_simd_double_t CBP6 = gmx_simd_set1_d(2.49650423685462752497647637088e-10);
1974 const gmx_simd_double_t CBP5 = gmx_simd_set1_d(0.00119770193298159629350136085658);
1975 const gmx_simd_double_t CBP4 = gmx_simd_set1_d(0.0164944422378370965881008942733);
1976 const gmx_simd_double_t CBP3 = gmx_simd_set1_d(0.0984581468691775932063932439252);
1977 const gmx_simd_double_t CBP2 = gmx_simd_set1_d(0.317364595806937763843589437418);
1978 const gmx_simd_double_t CBP1 = gmx_simd_set1_d(0.554167062641455850932670067075);
1979 const gmx_simd_double_t CBP0 = gmx_simd_set1_d(0.427583576155807163756925301060);
1980 const gmx_simd_double_t CBQ7 = gmx_simd_set1_d(0.00212288829699830145976198384930);
1981 const gmx_simd_double_t CBQ6 = gmx_simd_set1_d(0.0334810979522685300554606393425);
1982 const gmx_simd_double_t CBQ5 = gmx_simd_set1_d(0.2361713785181450957579508850717);
1983 const gmx_simd_double_t CBQ4 = gmx_simd_set1_d(0.955364736493055670530981883072);
1984 const gmx_simd_double_t CBQ3 = gmx_simd_set1_d(2.36815675631420037315349279199);
1985 const gmx_simd_double_t CBQ2 = gmx_simd_set1_d(3.55261649184083035537184223542);
1986 const gmx_simd_double_t CBQ1 = gmx_simd_set1_d(2.93501136050160872574376997993);
1989 /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
1990 const gmx_simd_double_t CCP6 = gmx_simd_set1_d(-2.8175401114513378771);
1991 const gmx_simd_double_t CCP5 = gmx_simd_set1_d(-3.22729451764143718517);
1992 const gmx_simd_double_t CCP4 = gmx_simd_set1_d(-2.5518551727311523996);
1993 const gmx_simd_double_t CCP3 = gmx_simd_set1_d(-0.687717681153649930619);
1994 const gmx_simd_double_t CCP2 = gmx_simd_set1_d(-0.212652252872804219852);
1995 const gmx_simd_double_t CCP1 = gmx_simd_set1_d(0.0175389834052493308818);
1996 const gmx_simd_double_t CCP0 = gmx_simd_set1_d(0.00628057170626964891937);
1998 const gmx_simd_double_t CCQ6 = gmx_simd_set1_d(5.48409182238641741584);
1999 const gmx_simd_double_t CCQ5 = gmx_simd_set1_d(13.5064170191802889145);
2000 const gmx_simd_double_t CCQ4 = gmx_simd_set1_d(22.9367376522880577224);
2001 const gmx_simd_double_t CCQ3 = gmx_simd_set1_d(15.930646027911794143);
2002 const gmx_simd_double_t CCQ2 = gmx_simd_set1_d(11.0567237927800161565);
2003 const gmx_simd_double_t CCQ1 = gmx_simd_set1_d(2.79257750980575282228);
2005 const gmx_simd_double_t CCoffset = gmx_simd_set1_d(0.5579090118408203125);
2007 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
2008 const gmx_simd_double_t two = gmx_simd_set1_d(2.0);
2010 gmx_simd_double_t xabs, x2, x4, t, t2, w, w2;
2011 gmx_simd_double_t PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
2012 gmx_simd_double_t PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
2013 gmx_simd_double_t PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
2014 gmx_simd_double_t res_erf, res_erfcB, res_erfcC, res_erfc, res;
2015 gmx_simd_double_t expmx2;
2016 gmx_simd_dbool_t mask, mask_erf;
2018 /* Calculate erf() */
2019 xabs = gmx_simd_fabs_d(x);
2020 mask_erf = gmx_simd_cmplt_d(xabs, one);
2021 x2 = gmx_simd_mul_d(x, x);
2022 x4 = gmx_simd_mul_d(x2, x2);
2024 PolyAP0 = gmx_simd_mul_d(CAP4, x4);
2025 PolyAP1 = gmx_simd_mul_d(CAP3, x4);
2026 PolyAP0 = gmx_simd_add_d(PolyAP0, CAP2);
2027 PolyAP1 = gmx_simd_add_d(PolyAP1, CAP1);
2028 PolyAP0 = gmx_simd_mul_d(PolyAP0, x4);
2029 PolyAP1 = gmx_simd_mul_d(PolyAP1, x2);
2030 PolyAP0 = gmx_simd_add_d(PolyAP0, CAP0);
2031 PolyAP0 = gmx_simd_add_d(PolyAP0, PolyAP1);
2033 PolyAQ1 = gmx_simd_mul_d(CAQ5, x4);
2034 PolyAQ0 = gmx_simd_mul_d(CAQ4, x4);
2035 PolyAQ1 = gmx_simd_add_d(PolyAQ1, CAQ3);
2036 PolyAQ0 = gmx_simd_add_d(PolyAQ0, CAQ2);
2037 PolyAQ1 = gmx_simd_mul_d(PolyAQ1, x4);
2038 PolyAQ0 = gmx_simd_mul_d(PolyAQ0, x4);
2039 PolyAQ1 = gmx_simd_add_d(PolyAQ1, CAQ1);
2040 PolyAQ0 = gmx_simd_add_d(PolyAQ0, one);
2041 PolyAQ1 = gmx_simd_mul_d(PolyAQ1, x2);
2042 PolyAQ0 = gmx_simd_add_d(PolyAQ0, PolyAQ1);
2044 res_erf = gmx_simd_mul_d(PolyAP0, gmx_simd_inv_maskfpe_d(PolyAQ0, mask_erf));
2045 res_erf = gmx_simd_add_d(CAoffset, res_erf);
2046 res_erf = gmx_simd_mul_d(x, res_erf);
2048 /* Calculate erfc() in range [1,4.5] */
2049 t = gmx_simd_sub_d(xabs, one);
2050 t2 = gmx_simd_mul_d(t, t);
2052 PolyBP0 = gmx_simd_mul_d(CBP6, t2);
2053 PolyBP1 = gmx_simd_mul_d(CBP5, t2);
2054 PolyBP0 = gmx_simd_add_d(PolyBP0, CBP4);
2055 PolyBP1 = gmx_simd_add_d(PolyBP1, CBP3);
2056 PolyBP0 = gmx_simd_mul_d(PolyBP0, t2);
2057 PolyBP1 = gmx_simd_mul_d(PolyBP1, t2);
2058 PolyBP0 = gmx_simd_add_d(PolyBP0, CBP2);
2059 PolyBP1 = gmx_simd_add_d(PolyBP1, CBP1);
2060 PolyBP0 = gmx_simd_mul_d(PolyBP0, t2);
2061 PolyBP1 = gmx_simd_mul_d(PolyBP1, t);
2062 PolyBP0 = gmx_simd_add_d(PolyBP0, CBP0);
2063 PolyBP0 = gmx_simd_add_d(PolyBP0, PolyBP1);
2065 PolyBQ1 = gmx_simd_mul_d(CBQ7, t2);
2066 PolyBQ0 = gmx_simd_mul_d(CBQ6, t2);
2067 PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ5);
2068 PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ4);
2069 PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
2070 PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
2071 PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ3);
2072 PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ2);
2073 PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
2074 PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
2075 PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ1);
2076 PolyBQ0 = gmx_simd_add_d(PolyBQ0, one);
2077 PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t);
2078 PolyBQ0 = gmx_simd_add_d(PolyBQ0, PolyBQ1);
2080 res_erfcB = gmx_simd_mul_d(PolyBP0, gmx_simd_inv_notmaskfpe_d(PolyBQ0, mask_erf));
2082 res_erfcB = gmx_simd_mul_d(res_erfcB, xabs);
2084 /* Calculate erfc() in range [4.5,inf] */
2085 w = gmx_simd_inv_notmaskfpe_d(xabs, mask_erf);
2086 w2 = gmx_simd_mul_d(w, w);
2088 PolyCP0 = gmx_simd_mul_d(CCP6, w2);
2089 PolyCP1 = gmx_simd_mul_d(CCP5, w2);
2090 PolyCP0 = gmx_simd_add_d(PolyCP0, CCP4);
2091 PolyCP1 = gmx_simd_add_d(PolyCP1, CCP3);
2092 PolyCP0 = gmx_simd_mul_d(PolyCP0, w2);
2093 PolyCP1 = gmx_simd_mul_d(PolyCP1, w2);
2094 PolyCP0 = gmx_simd_add_d(PolyCP0, CCP2);
2095 PolyCP1 = gmx_simd_add_d(PolyCP1, CCP1);
2096 PolyCP0 = gmx_simd_mul_d(PolyCP0, w2);
2097 PolyCP1 = gmx_simd_mul_d(PolyCP1, w);
2098 PolyCP0 = gmx_simd_add_d(PolyCP0, CCP0);
2099 PolyCP0 = gmx_simd_add_d(PolyCP0, PolyCP1);
2101 PolyCQ0 = gmx_simd_mul_d(CCQ6, w2);
2102 PolyCQ1 = gmx_simd_mul_d(CCQ5, w2);
2103 PolyCQ0 = gmx_simd_add_d(PolyCQ0, CCQ4);
2104 PolyCQ1 = gmx_simd_add_d(PolyCQ1, CCQ3);
2105 PolyCQ0 = gmx_simd_mul_d(PolyCQ0, w2);
2106 PolyCQ1 = gmx_simd_mul_d(PolyCQ1, w2);
2107 PolyCQ0 = gmx_simd_add_d(PolyCQ0, CCQ2);
2108 PolyCQ1 = gmx_simd_add_d(PolyCQ1, CCQ1);
2109 PolyCQ0 = gmx_simd_mul_d(PolyCQ0, w2);
2110 PolyCQ1 = gmx_simd_mul_d(PolyCQ1, w);
2111 PolyCQ0 = gmx_simd_add_d(PolyCQ0, one);
2112 PolyCQ0 = gmx_simd_add_d(PolyCQ0, PolyCQ1);
2114 expmx2 = gmx_simd_exp_d( gmx_simd_fneg_d(x2) );
2116 res_erfcC = gmx_simd_mul_d(PolyCP0, gmx_simd_inv_notmaskfpe_d(PolyCQ0, mask_erf));
2117 res_erfcC = gmx_simd_add_d(res_erfcC, CCoffset);
2118 res_erfcC = gmx_simd_mul_d(res_erfcC, w);
2120 mask = gmx_simd_cmplt_d(gmx_simd_set1_d(4.5), xabs);
2121 res_erfc = gmx_simd_blendv_d(res_erfcB, res_erfcC, mask);
2123 res_erfc = gmx_simd_mul_d(res_erfc, expmx2);
2125 /* erfc(x<0) = 2-erfc(|x|) */
2126 mask = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
2127 res_erfc = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(two, res_erfc), mask);
2129 /* Select erf() or erfc() */
2130 res = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(one, res_erf), mask_erf);
2135 /*! \brief SIMD double sin \& cos.
2137 * \copydetails gmx_simd_sincos_f
2139 static gmx_inline void gmx_simdcall
2140 gmx_simd_sincos_d(gmx_simd_double_t x, gmx_simd_double_t *sinval, gmx_simd_double_t *cosval)
2142 /* Constants to subtract Pi/4*x from y while minimizing precision loss */
2143 const gmx_simd_double_t argred0 = gmx_simd_set1_d(-2*0.78539816290140151978);
2144 const gmx_simd_double_t argred1 = gmx_simd_set1_d(-2*4.9604678871439933374e-10);
2145 const gmx_simd_double_t argred2 = gmx_simd_set1_d(-2*1.1258708853173288931e-18);
2146 const gmx_simd_double_t argred3 = gmx_simd_set1_d(-2*1.7607799325916000908e-27);
2147 const gmx_simd_double_t two_over_pi = gmx_simd_set1_d(2.0/M_PI);
2148 const gmx_simd_double_t const_sin5 = gmx_simd_set1_d( 1.58938307283228937328511e-10);
2149 const gmx_simd_double_t const_sin4 = gmx_simd_set1_d(-2.50506943502539773349318e-08);
2150 const gmx_simd_double_t const_sin3 = gmx_simd_set1_d( 2.75573131776846360512547e-06);
2151 const gmx_simd_double_t const_sin2 = gmx_simd_set1_d(-0.000198412698278911770864914);
2152 const gmx_simd_double_t const_sin1 = gmx_simd_set1_d( 0.0083333333333191845961746);
2153 const gmx_simd_double_t const_sin0 = gmx_simd_set1_d(-0.166666666666666130709393);
2155 const gmx_simd_double_t const_cos7 = gmx_simd_set1_d(-1.13615350239097429531523e-11);
2156 const gmx_simd_double_t const_cos6 = gmx_simd_set1_d( 2.08757471207040055479366e-09);
2157 const gmx_simd_double_t const_cos5 = gmx_simd_set1_d(-2.75573144028847567498567e-07);
2158 const gmx_simd_double_t const_cos4 = gmx_simd_set1_d( 2.48015872890001867311915e-05);
2159 const gmx_simd_double_t const_cos3 = gmx_simd_set1_d(-0.00138888888888714019282329);
2160 const gmx_simd_double_t const_cos2 = gmx_simd_set1_d( 0.0416666666666665519592062);
2161 const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
2162 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
2163 gmx_simd_double_t ssign, csign;
2164 gmx_simd_double_t x2, y, z, psin, pcos, sss, ccc;
2165 gmx_simd_dbool_t mask;
2166 #if (defined GMX_SIMD_HAVE_DINT32) && (defined GMX_SIMD_HAVE_DINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
2167 const gmx_simd_dint32_t ione = gmx_simd_set1_di(1);
2168 const gmx_simd_dint32_t itwo = gmx_simd_set1_di(2);
2169 gmx_simd_dint32_t iy;
2171 z = gmx_simd_mul_d(x, two_over_pi);
2172 iy = gmx_simd_cvt_d2i(z);
2173 y = gmx_simd_round_d(z);
2175 mask = gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, ione), gmx_simd_setzero_di()));
2176 ssign = gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, itwo), itwo)));
2177 csign = gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(gmx_simd_add_di(iy, ione), itwo), itwo)));
2179 const gmx_simd_double_t quarter = gmx_simd_set1_d(0.25);
2180 const gmx_simd_double_t minusquarter = gmx_simd_set1_d(-0.25);
2181 gmx_simd_double_t q;
2182 gmx_simd_dbool_t m1, m2, m3;
2184 /* The most obvious way to find the arguments quadrant in the unit circle
2185 * to calculate the sign is to use integer arithmetic, but that is not
2186 * present in all SIMD implementations. As an alternative, we have devised a
2187 * pure floating-point algorithm that uses truncation for argument reduction
2188 * so that we get a new value 0<=q<1 over the unit circle, and then
2189 * do floating-point comparisons with fractions. This is likely to be
2190 * slightly slower (~10%) due to the longer latencies of floating-point, so
2191 * we only use it when integer SIMD arithmetic is not present.
2194 x = gmx_simd_fabs_d(x);
2195 /* It is critical that half-way cases are rounded down */
2196 z = gmx_simd_fmadd_d(x, two_over_pi, half);
2197 y = gmx_simd_trunc_d(z);
2198 q = gmx_simd_mul_d(z, quarter);
2199 q = gmx_simd_sub_d(q, gmx_simd_trunc_d(q));
2200 /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
2201 * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
2202 * This removes the 2*Pi periodicity without using any integer arithmetic.
2203 * First check if y had the value 2 or 3, set csign if true.
2205 q = gmx_simd_sub_d(q, half);
2206 /* If we have logical operations we can work directly on the signbit, which
2207 * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
2208 * Thus, if you are altering defines to debug alternative code paths, the
2209 * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
2210 * active or inactive - you will get errors if only one is used.
2212 # ifdef GMX_SIMD_HAVE_LOGICAL
2213 ssign = gmx_simd_and_d(ssign, gmx_simd_set1_d(GMX_DOUBLE_NEGZERO));
2214 csign = gmx_simd_andnot_d(q, gmx_simd_set1_d(GMX_DOUBLE_NEGZERO));
2215 ssign = gmx_simd_xor_d(ssign, csign);
2217 csign = gmx_simd_xor_sign_d(gmx_simd_set1_d(-1.0), q);
2218 ssign = gmx_simd_xor_sign_d(ssign, csign); /* swap ssign if csign was set. */
2220 /* Check if y had value 1 or 3 (remember we subtracted 0.5 from q) */
2221 m1 = gmx_simd_cmplt_d(q, minusquarter);
2222 m2 = gmx_simd_cmple_d(gmx_simd_setzero_d(), q);
2223 m3 = gmx_simd_cmplt_d(q, quarter);
2224 m2 = gmx_simd_and_db(m2, m3);
2225 mask = gmx_simd_or_db(m1, m2);
2226 /* where mask is FALSE, set sign. */
2227 csign = gmx_simd_xor_sign_d(csign, gmx_simd_blendv_d(gmx_simd_set1_d(-1.0), one, mask));
2229 x = gmx_simd_fmadd_d(y, argred0, x);
2230 x = gmx_simd_fmadd_d(y, argred1, x);
2231 x = gmx_simd_fmadd_d(y, argred2, x);
2232 x = gmx_simd_fmadd_d(y, argred3, x);
2233 x2 = gmx_simd_mul_d(x, x);
2235 psin = gmx_simd_fmadd_d(const_sin5, x2, const_sin4);
2236 psin = gmx_simd_fmadd_d(psin, x2, const_sin3);
2237 psin = gmx_simd_fmadd_d(psin, x2, const_sin2);
2238 psin = gmx_simd_fmadd_d(psin, x2, const_sin1);
2239 psin = gmx_simd_fmadd_d(psin, x2, const_sin0);
2240 psin = gmx_simd_fmadd_d(psin, gmx_simd_mul_d(x2, x), x);
2242 pcos = gmx_simd_fmadd_d(const_cos7, x2, const_cos6);
2243 pcos = gmx_simd_fmadd_d(pcos, x2, const_cos5);
2244 pcos = gmx_simd_fmadd_d(pcos, x2, const_cos4);
2245 pcos = gmx_simd_fmadd_d(pcos, x2, const_cos3);
2246 pcos = gmx_simd_fmadd_d(pcos, x2, const_cos2);
2247 pcos = gmx_simd_fmsub_d(pcos, x2, half);
2248 pcos = gmx_simd_fmadd_d(pcos, x2, one);
2250 sss = gmx_simd_blendv_d(pcos, psin, mask);
2251 ccc = gmx_simd_blendv_d(psin, pcos, mask);
2252 /* See comment for GMX_SIMD_HAVE_LOGICAL section above. */
2253 #ifdef GMX_SIMD_HAVE_LOGICAL
2254 *sinval = gmx_simd_xor_d(sss, ssign);
2255 *cosval = gmx_simd_xor_d(ccc, csign);
2257 *sinval = gmx_simd_xor_sign_d(sss, ssign);
2258 *cosval = gmx_simd_xor_sign_d(ccc, csign);
2262 /*! \brief SIMD double sin(x).
2264 * \copydetails gmx_simd_sin_f
2266 static gmx_inline gmx_simd_double_t gmx_simdcall
2267 gmx_simd_sin_d(gmx_simd_double_t x)
2269 gmx_simd_double_t s, c;
2270 gmx_simd_sincos_d(x, &s, &c);
2274 /*! \brief SIMD double cos(x).
2276 * \copydetails gmx_simd_cos_f
2278 static gmx_inline gmx_simd_double_t gmx_simdcall
2279 gmx_simd_cos_d(gmx_simd_double_t x)
2281 gmx_simd_double_t s, c;
2282 gmx_simd_sincos_d(x, &s, &c);
2286 /*! \brief SIMD double tan(x).
2288 * \copydetails gmx_simd_tan_f
2290 static gmx_inline gmx_simd_double_t gmx_simdcall
2291 gmx_simd_tan_d(gmx_simd_double_t x)
2293 const gmx_simd_double_t argred0 = gmx_simd_set1_d(-2*0.78539816290140151978);
2294 const gmx_simd_double_t argred1 = gmx_simd_set1_d(-2*4.9604678871439933374e-10);
2295 const gmx_simd_double_t argred2 = gmx_simd_set1_d(-2*1.1258708853173288931e-18);
2296 const gmx_simd_double_t argred3 = gmx_simd_set1_d(-2*1.7607799325916000908e-27);
2297 const gmx_simd_double_t two_over_pi = gmx_simd_set1_d(2.0/M_PI);
2298 const gmx_simd_double_t CT15 = gmx_simd_set1_d(1.01419718511083373224408e-05);
2299 const gmx_simd_double_t CT14 = gmx_simd_set1_d(-2.59519791585924697698614e-05);
2300 const gmx_simd_double_t CT13 = gmx_simd_set1_d(5.23388081915899855325186e-05);
2301 const gmx_simd_double_t CT12 = gmx_simd_set1_d(-3.05033014433946488225616e-05);
2302 const gmx_simd_double_t CT11 = gmx_simd_set1_d(7.14707504084242744267497e-05);
2303 const gmx_simd_double_t CT10 = gmx_simd_set1_d(8.09674518280159187045078e-05);
2304 const gmx_simd_double_t CT9 = gmx_simd_set1_d(0.000244884931879331847054404);
2305 const gmx_simd_double_t CT8 = gmx_simd_set1_d(0.000588505168743587154904506);
2306 const gmx_simd_double_t CT7 = gmx_simd_set1_d(0.00145612788922812427978848);
2307 const gmx_simd_double_t CT6 = gmx_simd_set1_d(0.00359208743836906619142924);
2308 const gmx_simd_double_t CT5 = gmx_simd_set1_d(0.00886323944362401618113356);
2309 const gmx_simd_double_t CT4 = gmx_simd_set1_d(0.0218694882853846389592078);
2310 const gmx_simd_double_t CT3 = gmx_simd_set1_d(0.0539682539781298417636002);
2311 const gmx_simd_double_t CT2 = gmx_simd_set1_d(0.133333333333125941821962);
2312 const gmx_simd_double_t CT1 = gmx_simd_set1_d(0.333333333333334980164153);
2314 gmx_simd_double_t x2, p, y, z;
2315 gmx_simd_dbool_t mask;
2317 #if (defined GMX_SIMD_HAVE_DINT32) && (defined GMX_SIMD_HAVE_DINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
2318 gmx_simd_dint32_t iy;
2319 gmx_simd_dint32_t ione = gmx_simd_set1_di(1);
2321 z = gmx_simd_mul_d(x, two_over_pi);
2322 iy = gmx_simd_cvt_d2i(z);
2323 y = gmx_simd_round_d(z);
2324 mask = gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, ione), ione));
2326 x = gmx_simd_fmadd_d(y, argred0, x);
2327 x = gmx_simd_fmadd_d(y, argred1, x);
2328 x = gmx_simd_fmadd_d(y, argred2, x);
2329 x = gmx_simd_fmadd_d(y, argred3, x);
2330 x = gmx_simd_xor_d(gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), mask), x);
2332 const gmx_simd_double_t quarter = gmx_simd_set1_d(0.25);
2333 const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
2334 const gmx_simd_double_t threequarter = gmx_simd_set1_d(0.75);
2335 gmx_simd_double_t w, q;
2336 gmx_simd_dbool_t m1, m2, m3;
2338 w = gmx_simd_fabs_d(x);
2339 z = gmx_simd_fmadd_d(w, two_over_pi, half);
2340 y = gmx_simd_trunc_d(z);
2341 q = gmx_simd_mul_d(z, quarter);
2342 q = gmx_simd_sub_d(q, gmx_simd_trunc_d(q));
2343 m1 = gmx_simd_cmple_d(quarter, q);
2344 m2 = gmx_simd_cmplt_d(q, half);
2345 m3 = gmx_simd_cmple_d(threequarter, q);
2346 m1 = gmx_simd_and_db(m1, m2);
2347 mask = gmx_simd_or_db(m1, m3);
2348 w = gmx_simd_fmadd_d(y, argred0, w);
2349 w = gmx_simd_fmadd_d(y, argred1, w);
2350 w = gmx_simd_fmadd_d(y, argred2, w);
2351 w = gmx_simd_fmadd_d(y, argred3, w);
2353 w = gmx_simd_blendv_d(w, gmx_simd_fneg_d(w), mask);
2354 x = gmx_simd_xor_sign_d(w, x);
2356 x2 = gmx_simd_mul_d(x, x);
2357 p = gmx_simd_fmadd_d(CT15, x2, CT14);
2358 p = gmx_simd_fmadd_d(p, x2, CT13);
2359 p = gmx_simd_fmadd_d(p, x2, CT12);
2360 p = gmx_simd_fmadd_d(p, x2, CT11);
2361 p = gmx_simd_fmadd_d(p, x2, CT10);
2362 p = gmx_simd_fmadd_d(p, x2, CT9);
2363 p = gmx_simd_fmadd_d(p, x2, CT8);
2364 p = gmx_simd_fmadd_d(p, x2, CT7);
2365 p = gmx_simd_fmadd_d(p, x2, CT6);
2366 p = gmx_simd_fmadd_d(p, x2, CT5);
2367 p = gmx_simd_fmadd_d(p, x2, CT4);
2368 p = gmx_simd_fmadd_d(p, x2, CT3);
2369 p = gmx_simd_fmadd_d(p, x2, CT2);
2370 p = gmx_simd_fmadd_d(p, x2, CT1);
2371 p = gmx_simd_fmadd_d(x2, gmx_simd_mul_d(p, x), x);
2373 p = gmx_simd_blendv_d( p, gmx_simd_inv_maskfpe_d(p, mask), mask);
2377 /*! \brief SIMD double asin(x).
2379 * \copydetails gmx_simd_asin_f
2381 static gmx_inline gmx_simd_double_t gmx_simdcall
2382 gmx_simd_asin_d(gmx_simd_double_t x)
2384 /* Same algorithm as cephes library */
2385 const gmx_simd_double_t limit1 = gmx_simd_set1_d(0.625);
2386 const gmx_simd_double_t limit2 = gmx_simd_set1_d(1e-8);
2387 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
2388 const gmx_simd_double_t quarterpi = gmx_simd_set1_d(M_PI/4.0);
2389 const gmx_simd_double_t morebits = gmx_simd_set1_d(6.123233995736765886130e-17);
2391 const gmx_simd_double_t P5 = gmx_simd_set1_d(4.253011369004428248960e-3);
2392 const gmx_simd_double_t P4 = gmx_simd_set1_d(-6.019598008014123785661e-1);
2393 const gmx_simd_double_t P3 = gmx_simd_set1_d(5.444622390564711410273e0);
2394 const gmx_simd_double_t P2 = gmx_simd_set1_d(-1.626247967210700244449e1);
2395 const gmx_simd_double_t P1 = gmx_simd_set1_d(1.956261983317594739197e1);
2396 const gmx_simd_double_t P0 = gmx_simd_set1_d(-8.198089802484824371615e0);
2398 const gmx_simd_double_t Q4 = gmx_simd_set1_d(-1.474091372988853791896e1);
2399 const gmx_simd_double_t Q3 = gmx_simd_set1_d(7.049610280856842141659e1);
2400 const gmx_simd_double_t Q2 = gmx_simd_set1_d(-1.471791292232726029859e2);
2401 const gmx_simd_double_t Q1 = gmx_simd_set1_d(1.395105614657485689735e2);
2402 const gmx_simd_double_t Q0 = gmx_simd_set1_d(-4.918853881490881290097e1);
2404 const gmx_simd_double_t R4 = gmx_simd_set1_d(2.967721961301243206100e-3);
2405 const gmx_simd_double_t R3 = gmx_simd_set1_d(-5.634242780008963776856e-1);
2406 const gmx_simd_double_t R2 = gmx_simd_set1_d(6.968710824104713396794e0);
2407 const gmx_simd_double_t R1 = gmx_simd_set1_d(-2.556901049652824852289e1);
2408 const gmx_simd_double_t R0 = gmx_simd_set1_d(2.853665548261061424989e1);
2410 const gmx_simd_double_t S3 = gmx_simd_set1_d(-2.194779531642920639778e1);
2411 const gmx_simd_double_t S2 = gmx_simd_set1_d(1.470656354026814941758e2);
2412 const gmx_simd_double_t S1 = gmx_simd_set1_d(-3.838770957603691357202e2);
2413 const gmx_simd_double_t S0 = gmx_simd_set1_d(3.424398657913078477438e2);
2415 gmx_simd_double_t xabs;
2416 gmx_simd_double_t zz, ww, z, q, w, zz2, ww2;
2417 gmx_simd_double_t PA, PB;
2418 gmx_simd_double_t QA, QB;
2419 gmx_simd_double_t RA, RB;
2420 gmx_simd_double_t SA, SB;
2421 gmx_simd_double_t nom, denom;
2422 gmx_simd_dbool_t mask, mask2;
2424 xabs = gmx_simd_fabs_d(x);
2426 mask = gmx_simd_cmplt_d(limit1, xabs);
2428 zz = gmx_simd_sub_d(one, xabs);
2429 ww = gmx_simd_mul_d(xabs, xabs);
2430 zz2 = gmx_simd_mul_d(zz, zz);
2431 ww2 = gmx_simd_mul_d(ww, ww);
2434 RA = gmx_simd_mul_d(R4, zz2);
2435 RB = gmx_simd_mul_d(R3, zz2);
2436 RA = gmx_simd_add_d(RA, R2);
2437 RB = gmx_simd_add_d(RB, R1);
2438 RA = gmx_simd_mul_d(RA, zz2);
2439 RB = gmx_simd_mul_d(RB, zz);
2440 RA = gmx_simd_add_d(RA, R0);
2441 RA = gmx_simd_add_d(RA, RB);
2444 SB = gmx_simd_mul_d(S3, zz2);
2445 SA = gmx_simd_add_d(zz2, S2);
2446 SB = gmx_simd_add_d(SB, S1);
2447 SA = gmx_simd_mul_d(SA, zz2);
2448 SB = gmx_simd_mul_d(SB, zz);
2449 SA = gmx_simd_add_d(SA, S0);
2450 SA = gmx_simd_add_d(SA, SB);
2453 PA = gmx_simd_mul_d(P5, ww2);
2454 PB = gmx_simd_mul_d(P4, ww2);
2455 PA = gmx_simd_add_d(PA, P3);
2456 PB = gmx_simd_add_d(PB, P2);
2457 PA = gmx_simd_mul_d(PA, ww2);
2458 PB = gmx_simd_mul_d(PB, ww2);
2459 PA = gmx_simd_add_d(PA, P1);
2460 PB = gmx_simd_add_d(PB, P0);
2461 PA = gmx_simd_mul_d(PA, ww);
2462 PA = gmx_simd_add_d(PA, PB);
2465 QB = gmx_simd_mul_d(Q4, ww2);
2466 QA = gmx_simd_add_d(ww2, Q3);
2467 QB = gmx_simd_add_d(QB, Q2);
2468 QA = gmx_simd_mul_d(QA, ww2);
2469 QB = gmx_simd_mul_d(QB, ww2);
2470 QA = gmx_simd_add_d(QA, Q1);
2471 QB = gmx_simd_add_d(QB, Q0);
2472 QA = gmx_simd_mul_d(QA, ww);
2473 QA = gmx_simd_add_d(QA, QB);
2475 RA = gmx_simd_mul_d(RA, zz);
2476 PA = gmx_simd_mul_d(PA, ww);
2478 nom = gmx_simd_blendv_d( PA, RA, mask );
2479 denom = gmx_simd_blendv_d( QA, SA, mask );
2481 mask2 = gmx_simd_cmplt_d(limit2, xabs);
2482 q = gmx_simd_mul_d( nom, gmx_simd_inv_maskfpe_d(denom, mask2) );
2484 zz = gmx_simd_add_d(zz, zz);
2485 zz = gmx_simd_sqrt_d(zz);
2486 z = gmx_simd_sub_d(quarterpi, zz);
2487 zz = gmx_simd_mul_d(zz, q);
2488 zz = gmx_simd_sub_d(zz, morebits);
2489 z = gmx_simd_sub_d(z, zz);
2490 z = gmx_simd_add_d(z, quarterpi);
2492 w = gmx_simd_mul_d(xabs, q);
2493 w = gmx_simd_add_d(w, xabs);
2495 z = gmx_simd_blendv_d( w, z, mask );
2497 z = gmx_simd_blendv_d( xabs, z, mask2 );
2499 z = gmx_simd_xor_sign_d(z, x);
2504 /*! \brief SIMD double acos(x).
2506 * \copydetails gmx_simd_acos_f
2508 static gmx_inline gmx_simd_double_t gmx_simdcall
2509 gmx_simd_acos_d(gmx_simd_double_t x)
2511 const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
2512 const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
2513 const gmx_simd_double_t quarterpi0 = gmx_simd_set1_d(7.85398163397448309616e-1);
2514 const gmx_simd_double_t quarterpi1 = gmx_simd_set1_d(6.123233995736765886130e-17);
2516 gmx_simd_dbool_t mask1;
2517 gmx_simd_double_t z, z1, z2;
2519 mask1 = gmx_simd_cmplt_d(half, x);
2520 z1 = gmx_simd_mul_d(half, gmx_simd_sub_d(one, x));
2521 z1 = gmx_simd_sqrt_d(z1);
2522 z = gmx_simd_blendv_d( x, z1, mask1 );
2524 z = gmx_simd_asin_d(z);
2526 z1 = gmx_simd_add_d(z, z);
2528 z2 = gmx_simd_sub_d(quarterpi0, z);
2529 z2 = gmx_simd_add_d(z2, quarterpi1);
2530 z2 = gmx_simd_add_d(z2, quarterpi0);
2532 z = gmx_simd_blendv_d(z2, z1, mask1);
2537 /*! \brief SIMD double atan(x).
2539 * \copydetails gmx_simd_atan_f
2541 static gmx_inline gmx_simd_double_t gmx_simdcall
2542 gmx_simd_atan_d(gmx_simd_double_t x)
2544 /* Same algorithm as cephes library */
2545 const gmx_simd_double_t limit1 = gmx_simd_set1_d(0.66);
2546 const gmx_simd_double_t limit2 = gmx_simd_set1_d(2.41421356237309504880);
2547 const gmx_simd_double_t quarterpi = gmx_simd_set1_d(M_PI/4.0);
2548 const gmx_simd_double_t halfpi = gmx_simd_set1_d(M_PI/2.0);
2549 const gmx_simd_double_t mone = gmx_simd_set1_d(-1.0);
2550 const gmx_simd_double_t morebits1 = gmx_simd_set1_d(0.5*6.123233995736765886130E-17);
2551 const gmx_simd_double_t morebits2 = gmx_simd_set1_d(6.123233995736765886130E-17);
2553 const gmx_simd_double_t P4 = gmx_simd_set1_d(-8.750608600031904122785E-1);
2554 const gmx_simd_double_t P3 = gmx_simd_set1_d(-1.615753718733365076637E1);
2555 const gmx_simd_double_t P2 = gmx_simd_set1_d(-7.500855792314704667340E1);
2556 const gmx_simd_double_t P1 = gmx_simd_set1_d(-1.228866684490136173410E2);
2557 const gmx_simd_double_t P0 = gmx_simd_set1_d(-6.485021904942025371773E1);
2559 const gmx_simd_double_t Q4 = gmx_simd_set1_d(2.485846490142306297962E1);
2560 const gmx_simd_double_t Q3 = gmx_simd_set1_d(1.650270098316988542046E2);
2561 const gmx_simd_double_t Q2 = gmx_simd_set1_d(4.328810604912902668951E2);
2562 const gmx_simd_double_t Q1 = gmx_simd_set1_d(4.853903996359136964868E2);
2563 const gmx_simd_double_t Q0 = gmx_simd_set1_d(1.945506571482613964425E2);
2565 gmx_simd_double_t y, xabs, t1, t2;
2566 gmx_simd_double_t z, z2;
2567 gmx_simd_double_t P_A, P_B, Q_A, Q_B;
2568 gmx_simd_dbool_t mask1, mask2;
2570 xabs = gmx_simd_fabs_d(x);
2572 mask1 = gmx_simd_cmplt_d(limit1, xabs);
2573 mask2 = gmx_simd_cmplt_d(limit2, xabs);
2575 t1 = gmx_simd_mul_d(gmx_simd_add_d(xabs, mone),
2576 gmx_simd_inv_maskfpe_d(gmx_simd_sub_d(xabs, mone), mask1));
2577 t2 = gmx_simd_mul_d(mone, gmx_simd_inv_maskfpe_d(xabs, mask2));
2579 y = gmx_simd_blendzero_d(quarterpi, mask1);
2580 y = gmx_simd_blendv_d(y, halfpi, mask2);
2581 xabs = gmx_simd_blendv_d(xabs, t1, mask1);
2582 xabs = gmx_simd_blendv_d(xabs, t2, mask2);
2584 z = gmx_simd_mul_d(xabs, xabs);
2585 z2 = gmx_simd_mul_d(z, z);
2587 P_A = gmx_simd_mul_d(P4, z2);
2588 P_B = gmx_simd_mul_d(P3, z2);
2589 P_A = gmx_simd_add_d(P_A, P2);
2590 P_B = gmx_simd_add_d(P_B, P1);
2591 P_A = gmx_simd_mul_d(P_A, z2);
2592 P_B = gmx_simd_mul_d(P_B, z);
2593 P_A = gmx_simd_add_d(P_A, P0);
2594 P_A = gmx_simd_add_d(P_A, P_B);
2597 Q_B = gmx_simd_mul_d(Q4, z2);
2598 Q_A = gmx_simd_add_d(z2, Q3);
2599 Q_B = gmx_simd_add_d(Q_B, Q2);
2600 Q_A = gmx_simd_mul_d(Q_A, z2);
2601 Q_B = gmx_simd_mul_d(Q_B, z2);
2602 Q_A = gmx_simd_add_d(Q_A, Q1);
2603 Q_B = gmx_simd_add_d(Q_B, Q0);
2604 Q_A = gmx_simd_mul_d(Q_A, z);
2605 Q_A = gmx_simd_add_d(Q_A, Q_B);
2607 z = gmx_simd_mul_d(z, P_A);
2608 z = gmx_simd_mul_d(z, gmx_simd_inv_d(Q_A));
2609 z = gmx_simd_mul_d(z, xabs);
2610 z = gmx_simd_add_d(z, xabs);
2612 t1 = gmx_simd_blendzero_d(morebits1, mask1);
2613 t1 = gmx_simd_blendv_d(t1, morebits2, mask2);
2615 z = gmx_simd_add_d(z, t1);
2616 y = gmx_simd_add_d(y, z);
2618 y = gmx_simd_xor_sign_d(y, x);
2623 /*! \brief SIMD double atan2(y,x).
2625 * \copydetails gmx_simd_atan2_f
2627 static gmx_inline gmx_simd_double_t gmx_simdcall
2628 gmx_simd_atan2_d(gmx_simd_double_t y, gmx_simd_double_t x)
2630 const gmx_simd_double_t pi = gmx_simd_set1_d(M_PI);
2631 const gmx_simd_double_t halfpi = gmx_simd_set1_d(M_PI/2.0);
2632 gmx_simd_double_t xinv, p, aoffset;
2633 gmx_simd_dbool_t mask_x0, mask_y0, mask_xlt0, mask_ylt0;
2635 mask_x0 = gmx_simd_cmpeq_d(x, gmx_simd_setzero_d());
2636 mask_y0 = gmx_simd_cmpeq_d(y, gmx_simd_setzero_d());
2637 mask_xlt0 = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
2638 mask_ylt0 = gmx_simd_cmplt_d(y, gmx_simd_setzero_d());
2640 aoffset = gmx_simd_blendzero_d(halfpi, mask_x0);
2641 aoffset = gmx_simd_blendnotzero_d(aoffset, mask_y0);
2643 aoffset = gmx_simd_blendv_d(aoffset, pi, mask_xlt0);
2644 aoffset = gmx_simd_blendv_d(aoffset, gmx_simd_fneg_d(aoffset), mask_ylt0);
2646 xinv = gmx_simd_blendnotzero_d(gmx_simd_inv_notmaskfpe_d(x, mask_x0), mask_x0);
2647 p = gmx_simd_mul_d(y, xinv);
2648 p = gmx_simd_atan_d(p);
2649 p = gmx_simd_add_d(p, aoffset);
2655 /*! \brief Calculate the force correction due to PME analytically for SIMD double.
2657 * \copydetails gmx_simd_pmecorrF_f
2659 static gmx_inline gmx_simd_double_t gmx_simdcall
2660 gmx_simd_pmecorrF_d(gmx_simd_double_t z2)
2662 const gmx_simd_double_t FN10 = gmx_simd_set1_d(-8.0072854618360083154e-14);
2663 const gmx_simd_double_t FN9 = gmx_simd_set1_d(1.1859116242260148027e-11);
2664 const gmx_simd_double_t FN8 = gmx_simd_set1_d(-8.1490406329798423616e-10);
2665 const gmx_simd_double_t FN7 = gmx_simd_set1_d(3.4404793543907847655e-8);
2666 const gmx_simd_double_t FN6 = gmx_simd_set1_d(-9.9471420832602741006e-7);
2667 const gmx_simd_double_t FN5 = gmx_simd_set1_d(0.000020740315999115847456);
2668 const gmx_simd_double_t FN4 = gmx_simd_set1_d(-0.00031991745139313364005);
2669 const gmx_simd_double_t FN3 = gmx_simd_set1_d(0.0035074449373659008203);
2670 const gmx_simd_double_t FN2 = gmx_simd_set1_d(-0.031750380176100813405);
2671 const gmx_simd_double_t FN1 = gmx_simd_set1_d(0.13884101728898463426);
2672 const gmx_simd_double_t FN0 = gmx_simd_set1_d(-0.75225277815249618847);
2674 const gmx_simd_double_t FD5 = gmx_simd_set1_d(0.000016009278224355026701);
2675 const gmx_simd_double_t FD4 = gmx_simd_set1_d(0.00051055686934806966046);
2676 const gmx_simd_double_t FD3 = gmx_simd_set1_d(0.0081803507497974289008);
2677 const gmx_simd_double_t FD2 = gmx_simd_set1_d(0.077181146026670287235);
2678 const gmx_simd_double_t FD1 = gmx_simd_set1_d(0.41543303143712535988);
2679 const gmx_simd_double_t FD0 = gmx_simd_set1_d(1.0);
2681 gmx_simd_double_t z4;
2682 gmx_simd_double_t polyFN0, polyFN1, polyFD0, polyFD1;
2684 z4 = gmx_simd_mul_d(z2, z2);
2686 polyFD1 = gmx_simd_fmadd_d(FD5, z4, FD3);
2687 polyFD1 = gmx_simd_fmadd_d(polyFD1, z4, FD1);
2688 polyFD1 = gmx_simd_mul_d(polyFD1, z2);
2689 polyFD0 = gmx_simd_fmadd_d(FD4, z4, FD2);
2690 polyFD0 = gmx_simd_fmadd_d(polyFD0, z4, FD0);
2691 polyFD0 = gmx_simd_add_d(polyFD0, polyFD1);
2693 polyFD0 = gmx_simd_inv_d(polyFD0);
2695 polyFN0 = gmx_simd_fmadd_d(FN10, z4, FN8);
2696 polyFN0 = gmx_simd_fmadd_d(polyFN0, z4, FN6);
2697 polyFN0 = gmx_simd_fmadd_d(polyFN0, z4, FN4);
2698 polyFN0 = gmx_simd_fmadd_d(polyFN0, z4, FN2);
2699 polyFN0 = gmx_simd_fmadd_d(polyFN0, z4, FN0);
2700 polyFN1 = gmx_simd_fmadd_d(FN9, z4, FN7);
2701 polyFN1 = gmx_simd_fmadd_d(polyFN1, z4, FN5);
2702 polyFN1 = gmx_simd_fmadd_d(polyFN1, z4, FN3);
2703 polyFN1 = gmx_simd_fmadd_d(polyFN1, z4, FN1);
2704 polyFN0 = gmx_simd_fmadd_d(polyFN1, z2, polyFN0);
2707 return gmx_simd_mul_d(polyFN0, polyFD0);
2712 /*! \brief Calculate the potential correction due to PME analytically for SIMD double.
2714 * \copydetails gmx_simd_pmecorrV_f
2716 static gmx_inline gmx_simd_double_t gmx_simdcall
2717 gmx_simd_pmecorrV_d(gmx_simd_double_t z2)
2719 const gmx_simd_double_t VN9 = gmx_simd_set1_d(-9.3723776169321855475e-13);
2720 const gmx_simd_double_t VN8 = gmx_simd_set1_d(1.2280156762674215741e-10);
2721 const gmx_simd_double_t VN7 = gmx_simd_set1_d(-7.3562157912251309487e-9);
2722 const gmx_simd_double_t VN6 = gmx_simd_set1_d(2.6215886208032517509e-7);
2723 const gmx_simd_double_t VN5 = gmx_simd_set1_d(-4.9532491651265819499e-6);
2724 const gmx_simd_double_t VN4 = gmx_simd_set1_d(0.00025907400778966060389);
2725 const gmx_simd_double_t VN3 = gmx_simd_set1_d(0.0010585044856156469792);
2726 const gmx_simd_double_t VN2 = gmx_simd_set1_d(0.045247661136833092885);
2727 const gmx_simd_double_t VN1 = gmx_simd_set1_d(0.11643931522926034421);
2728 const gmx_simd_double_t VN0 = gmx_simd_set1_d(1.1283791671726767970);
2730 const gmx_simd_double_t VD5 = gmx_simd_set1_d(0.000021784709867336150342);
2731 const gmx_simd_double_t VD4 = gmx_simd_set1_d(0.00064293662010911388448);
2732 const gmx_simd_double_t VD3 = gmx_simd_set1_d(0.0096311444822588683504);
2733 const gmx_simd_double_t VD2 = gmx_simd_set1_d(0.085608012351550627051);
2734 const gmx_simd_double_t VD1 = gmx_simd_set1_d(0.43652499166614811084);
2735 const gmx_simd_double_t VD0 = gmx_simd_set1_d(1.0);
2737 gmx_simd_double_t z4;
2738 gmx_simd_double_t polyVN0, polyVN1, polyVD0, polyVD1;
2740 z4 = gmx_simd_mul_d(z2, z2);
2742 polyVD1 = gmx_simd_fmadd_d(VD5, z4, VD3);
2743 polyVD0 = gmx_simd_fmadd_d(VD4, z4, VD2);
2744 polyVD1 = gmx_simd_fmadd_d(polyVD1, z4, VD1);
2745 polyVD0 = gmx_simd_fmadd_d(polyVD0, z4, VD0);
2746 polyVD0 = gmx_simd_fmadd_d(polyVD1, z2, polyVD0);
2748 polyVD0 = gmx_simd_inv_d(polyVD0);
2750 polyVN1 = gmx_simd_fmadd_d(VN9, z4, VN7);
2751 polyVN0 = gmx_simd_fmadd_d(VN8, z4, VN6);
2752 polyVN1 = gmx_simd_fmadd_d(polyVN1, z4, VN5);
2753 polyVN0 = gmx_simd_fmadd_d(polyVN0, z4, VN4);
2754 polyVN1 = gmx_simd_fmadd_d(polyVN1, z4, VN3);
2755 polyVN0 = gmx_simd_fmadd_d(polyVN0, z4, VN2);
2756 polyVN1 = gmx_simd_fmadd_d(polyVN1, z4, VN1);
2757 polyVN0 = gmx_simd_fmadd_d(polyVN0, z4, VN0);
2758 polyVN0 = gmx_simd_fmadd_d(polyVN1, z2, polyVN0);
2760 return gmx_simd_mul_d(polyVN0, polyVD0);
2768 /*! \name SIMD4 math functions
2770 * \note Only a subset of the math functions are implemented for SIMD4.
2775 #ifdef GMX_SIMD4_HAVE_FLOAT
2777 /*************************************************************************
2778 * SINGLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
2779 *************************************************************************/
2781 /*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 floats.
2783 * \copydetails gmx_simd_sum4_f
2785 static gmx_inline gmx_simd4_float_t gmx_simdcall
2786 gmx_simd4_sum4_f(gmx_simd4_float_t a, gmx_simd4_float_t b,
2787 gmx_simd4_float_t c, gmx_simd4_float_t d)
2789 return gmx_simd4_add_f(gmx_simd4_add_f(a, b), gmx_simd4_add_f(c, d));
2792 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 float.
2794 * \copydetails gmx_simd_rsqrt_iter_f
2796 static gmx_inline gmx_simd4_float_t gmx_simdcall
2797 gmx_simd4_rsqrt_iter_f(gmx_simd4_float_t lu, gmx_simd4_float_t x)
2799 # ifdef GMX_SIMD_HAVE_FMA
2800 return gmx_simd4_fmadd_f(gmx_simd4_fnmadd_f(x, gmx_simd4_mul_f(lu, lu), gmx_simd4_set1_f(1.0f)), gmx_simd4_mul_f(lu, gmx_simd4_set1_f(0.5f)), lu);
2802 return gmx_simd4_mul_f(gmx_simd4_set1_f(0.5f), gmx_simd4_mul_f(gmx_simd4_sub_f(gmx_simd4_set1_f(3.0f), gmx_simd4_mul_f(gmx_simd4_mul_f(lu, lu), x)), lu));
2806 /*! \brief Calculate 1/sqrt(x) for SIMD4 float.
2808 * \copydetails gmx_simd_invsqrt_f
2810 static gmx_inline gmx_simd4_float_t gmx_simdcall
2811 gmx_simd4_invsqrt_f(gmx_simd4_float_t x)
2813 gmx_simd4_float_t lu = gmx_simd4_rsqrt_f(x);
2814 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
2815 lu = gmx_simd4_rsqrt_iter_f(lu, x);
2817 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
2818 lu = gmx_simd4_rsqrt_iter_f(lu, x);
2820 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
2821 lu = gmx_simd4_rsqrt_iter_f(lu, x);
2826 #endif /* GMX_SIMD4_HAVE_FLOAT */
2830 #ifdef GMX_SIMD4_HAVE_DOUBLE
2831 /*************************************************************************
2832 * DOUBLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
2833 *************************************************************************/
2836 /*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 doubles.
2838 * \copydetails gmx_simd_sum4_f
2840 static gmx_inline gmx_simd4_double_t gmx_simdcall
2841 gmx_simd4_sum4_d(gmx_simd4_double_t a, gmx_simd4_double_t b,
2842 gmx_simd4_double_t c, gmx_simd4_double_t d)
2844 return gmx_simd4_add_d(gmx_simd4_add_d(a, b), gmx_simd4_add_d(c, d));
2847 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 double.
2849 * \copydetails gmx_simd_rsqrt_iter_f
2851 static gmx_inline gmx_simd4_double_t gmx_simdcall
2852 gmx_simd4_rsqrt_iter_d(gmx_simd4_double_t lu, gmx_simd4_double_t x)
2854 #ifdef GMX_SIMD_HAVE_FMA
2855 return gmx_simd4_fmadd_d(gmx_simd4_fnmadd_d(x, gmx_simd4_mul_d(lu, lu), gmx_simd4_set1_d(1.0)), gmx_simd4_mul_d(lu, gmx_simd4_set1_d(0.5)), lu);
2857 return gmx_simd4_mul_d(gmx_simd4_set1_d(0.5), gmx_simd4_mul_d(gmx_simd4_sub_d(gmx_simd4_set1_d(3.0), gmx_simd4_mul_d(gmx_simd4_mul_d(lu, lu), x)), lu));
2861 /*! \brief Calculate 1/sqrt(x) for SIMD4 double.
2863 * \copydetails gmx_simd_invsqrt_f
2865 static gmx_inline gmx_simd4_double_t gmx_simdcall
2866 gmx_simd4_invsqrt_d(gmx_simd4_double_t x)
2868 gmx_simd4_double_t lu = gmx_simd4_rsqrt_d(x);
2869 #if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
2870 lu = gmx_simd4_rsqrt_iter_d(lu, x);
2872 #if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
2873 lu = gmx_simd4_rsqrt_iter_d(lu, x);
2875 #if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
2876 lu = gmx_simd4_rsqrt_iter_d(lu, x);
2878 #if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
2879 lu = gmx_simd4_rsqrt_iter_d(lu, x);
2883 #endif /* GMX_SIMD4_HAVE_DOUBLE */
2888 /* Set defines based on default Gromacs precision */
2890 /* Documentation in single branch below */
2891 # define gmx_simd_sum4_r gmx_simd_sum4_d
2892 # define gmx_simd_xor_sign_r gmx_simd_xor_sign_d
2893 # define gmx_simd_invsqrt_r gmx_simd_invsqrt_d
2894 # define gmx_simd_invsqrt_pair_r gmx_simd_invsqrt_pair_d
2895 # define gmx_simd_sqrt_r gmx_simd_sqrt_d
2896 # define gmx_simd_inv_r gmx_simd_inv_d
2897 # define gmx_simd_log_r gmx_simd_log_d
2898 # define gmx_simd_exp2_r gmx_simd_exp2_d
2899 # define gmx_simd_exp_r gmx_simd_exp_d
2900 # define gmx_simd_erf_r gmx_simd_erf_d
2901 # define gmx_simd_erfc_r gmx_simd_erfc_d
2902 # define gmx_simd_sincos_r gmx_simd_sincos_d
2903 # define gmx_simd_sin_r gmx_simd_sin_d
2904 # define gmx_simd_cos_r gmx_simd_cos_d
2905 # define gmx_simd_tan_r gmx_simd_tan_d
2906 # define gmx_simd_asin_r gmx_simd_asin_d
2907 # define gmx_simd_acos_r gmx_simd_acos_d
2908 # define gmx_simd_atan_r gmx_simd_atan_d
2909 # define gmx_simd_atan2_r gmx_simd_atan2_d
2910 # define gmx_simd_pmecorrF_r gmx_simd_pmecorrF_d
2911 # define gmx_simd_pmecorrV_r gmx_simd_pmecorrV_d
2912 # define gmx_simd4_sum4_r gmx_simd4_sum4_d
2913 # define gmx_simd4_invsqrt_r gmx_simd4_invsqrt_d
2915 #else /* GMX_DOUBLE */
2917 /*! \name Real-precision SIMD math functions
2919 * These are the ones you should typically call in Gromacs.
2923 /*! \brief SIMD utility function to sum a+b+c+d for SIMD reals.
2925 * \copydetails gmx_simd_sum4_f
2927 # define gmx_simd_sum4_r gmx_simd_sum4_f
2929 /*! \brief Return -a if b is negative, SIMD real.
2931 * \copydetails gmx_simd_xor_sign_f
2933 # define gmx_simd_xor_sign_r gmx_simd_xor_sign_f
2935 /*! \brief Calculate 1/sqrt(x) for SIMD real.
2937 * \copydetails gmx_simd_invsqrt_f
2939 # define gmx_simd_invsqrt_r gmx_simd_invsqrt_f
2941 /*! \brief Calculate 1/sqrt(x) for two SIMD reals.
2943 * \copydetails gmx_simd_invsqrt_pair_f
2945 # define gmx_simd_invsqrt_pair_r gmx_simd_invsqrt_pair_f
2947 /*! \brief Calculate sqrt(x) correctly for SIMD real, including argument 0.0.
2949 * \copydetails gmx_simd_sqrt_f
2951 # define gmx_simd_sqrt_r gmx_simd_sqrt_f
2953 /*! \brief Calculate 1/x for SIMD real.
2955 * \copydetails gmx_simd_inv_f
2957 # define gmx_simd_inv_r gmx_simd_inv_f
2959 /*! \brief SIMD real log(x). This is the natural logarithm.
2961 * \copydetails gmx_simd_log_f
2963 # define gmx_simd_log_r gmx_simd_log_f
2965 /*! \brief SIMD real 2^x.
2967 * \copydetails gmx_simd_exp2_f
2969 # define gmx_simd_exp2_r gmx_simd_exp2_f
2971 /*! \brief SIMD real e^x.
2973 * \copydetails gmx_simd_exp_f
2975 # define gmx_simd_exp_r gmx_simd_exp_f
2977 /*! \brief SIMD real erf(x).
2979 * \copydetails gmx_simd_erf_f
2981 # define gmx_simd_erf_r gmx_simd_erf_f
2983 /*! \brief SIMD real erfc(x).
2985 * \copydetails gmx_simd_erfc_f
2987 # define gmx_simd_erfc_r gmx_simd_erfc_f
2989 /*! \brief SIMD real sin \& cos.
2991 * \copydetails gmx_simd_sincos_f
2993 # define gmx_simd_sincos_r gmx_simd_sincos_f
2995 /*! \brief SIMD real sin(x).
2997 * \copydetails gmx_simd_sin_f
2999 # define gmx_simd_sin_r gmx_simd_sin_f
3001 /*! \brief SIMD real cos(x).
3003 * \copydetails gmx_simd_cos_f
3005 # define gmx_simd_cos_r gmx_simd_cos_f
3007 /*! \brief SIMD real tan(x).
3009 * \copydetails gmx_simd_tan_f
3011 # define gmx_simd_tan_r gmx_simd_tan_f
3013 /*! \brief SIMD real asin(x).
3015 * \copydetails gmx_simd_asin_f
3017 # define gmx_simd_asin_r gmx_simd_asin_f
3019 /*! \brief SIMD real acos(x).
3021 * \copydetails gmx_simd_acos_f
3023 # define gmx_simd_acos_r gmx_simd_acos_f
3025 /*! \brief SIMD real atan(x).
3027 * \copydetails gmx_simd_atan_f
3029 # define gmx_simd_atan_r gmx_simd_atan_f
3031 /*! \brief SIMD real atan2(y,x).
3033 * \copydetails gmx_simd_atan2_f
3035 # define gmx_simd_atan2_r gmx_simd_atan2_f
3037 /*! \brief SIMD Analytic PME force correction.
3039 * \copydetails gmx_simd_pmecorrF_f
3041 # define gmx_simd_pmecorrF_r gmx_simd_pmecorrF_f
3043 /*! \brief SIMD Analytic PME potential correction.
3045 * \copydetails gmx_simd_pmecorrV_f
3047 # define gmx_simd_pmecorrV_r gmx_simd_pmecorrV_f
3050 * \name SIMD4 math functions
3054 /*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 reals.
3056 * \copydetails gmx_simd_sum4_f
3058 # define gmx_simd4_sum4_r gmx_simd4_sum4_f
3060 /*! \brief Calculate 1/sqrt(x) for SIMD4 real.
3062 * \copydetails gmx_simd_invsqrt_f
3064 # define gmx_simd4_invsqrt_r gmx_simd4_invsqrt_f
3068 #endif /* GMX_DOUBLE */
3073 #endif /* GMX_SIMD_SIMD_MATH_H_ */