* \ingroup module_simd
*/
+#include "config.h"
+
#include <math.h>
#include "gromacs/math/utilities.h"
* \param d term 4 (multiple values)
* \return sum of terms 1-4 (multiple values)
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_sum4_f(gmx_simd_float_t a, gmx_simd_float_t b,
gmx_simd_float_t c, gmx_simd_float_t d)
{
* with the exception that negative zero is not considered to be negative
* on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_xor_sign_f(gmx_simd_float_t a, gmx_simd_float_t b)
{
#ifdef GMX_SIMD_HAVE_LOGICAL
/* Mask b with a negative-zero constant and XOR the result into a.
 * Presumably the mask keeps only the sign bit of b, so a gets its sign
 * flipped wherever that bit is set (including b = -0.0); confirm against
 * the definitions of the logical and/xor operations.
 */
- return gmx_simd_xor_f(a, gmx_simd_and_f(gmx_simd_set1_f(-0.0), b));
+ return gmx_simd_xor_f(a, gmx_simd_and_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), b));
#else
/* Fallback when logical operations are unavailable: blend between a and
 * its negation based on the comparison b < 0 (presumably the negated value
 * is picked where the comparison holds). Negative zero does not compare
 * less than zero, so it is not treated as negative on this path.
 */
return gmx_simd_blendv_f(a, gmx_simd_fneg_f(a), gmx_simd_cmplt_f(b, gmx_simd_setzero_f()));
#endif
}
+#ifndef gmx_simd_rsqrt_iter_f
/*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD float.
*
* This is a low-level routine that should only be used by SIMD math routine
* \param x The reference (starting) value x for which we want 1/sqrt(x).
* \return An improved approximation with roughly twice as many bits of accuracy.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_rsqrt_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
{
# ifdef GMX_SIMD_HAVE_FMA
return gmx_simd_mul_f(gmx_simd_set1_f(0.5f), gmx_simd_mul_f(gmx_simd_sub_f(gmx_simd_set1_f(3.0f), gmx_simd_mul_f(gmx_simd_mul_f(lu, lu), x)), lu));
# endif
}
+#endif
/*! \brief Calculate 1/sqrt(x) for SIMD float.
*
* \param x Argument that must be >0. This routine does not check arguments.
* \return 1/sqrt(x). Result is undefined if your argument was invalid.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_invsqrt_f(gmx_simd_float_t x)
{
gmx_simd_float_t lu = gmx_simd_rsqrt_f(x);
* In particular for double precision we can sometimes calculate square root
* pairs slightly faster by using single precision until the very last step.
*/
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_invsqrt_pair_f(gmx_simd_float_t x0, gmx_simd_float_t x1,
gmx_simd_float_t *out0, gmx_simd_float_t *out1)
{
*out1 = gmx_simd_invsqrt_f(x1);
}
+#ifndef gmx_simd_rcp_iter_f
/*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD float.
*
* This is a low-level routine that should only be used by SIMD math routine
* \param x The reference (starting) value x for which we want 1/x.
* \return An improved approximation with roughly twice as many bits of accuracy.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_rcp_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
{
/* One Newton-Raphson refinement of the reciprocal estimate lu.
 * NOTE(review): this reads as lu * (2 - lu*x), which assumes
 * gmx_simd_fnmadd_f(a, b, c) evaluates c - a*b; confirm against the
 * SIMD implementation headers.
 */
return gmx_simd_mul_f(lu, gmx_simd_fnmadd_f(lu, x, gmx_simd_set1_f(2.0f)));
}
+#endif
/*! \brief Calculate 1/x for SIMD float.
*
* \param x Argument that must be nonzero. This routine does not check arguments.
* \return 1/x. Result is undefined if your argument was invalid.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_inv_f(gmx_simd_float_t x)
{
gmx_simd_float_t lu = gmx_simd_rcp_f(x);
* \return sqrt(x). If x=0, the result will correctly be set to 0.
* The result is undefined if the input value is negative.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_sqrt_f(gmx_simd_float_t x)
{
gmx_simd_fbool_t mask;
* \result The natural logarithm of x. Undefined if argument is invalid.
*/
#ifndef gmx_simd_log_f
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_log_f(gmx_simd_float_t x)
{
const gmx_simd_float_t half = gmx_simd_set1_f(0.5f);
* \param x Argument.
* \result 2^x. Undefined if input argument caused overflow.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_exp2_f(gmx_simd_float_t x)
{
/* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
* extended precision arithmetics to improve accuracy.
*
* \param x Argument.
- * \result exp(x). Undefined if input argument caused overflow.
+ * \result exp(x). Undefined if input argument caused overflow,
+ * which can happen if abs(x) \> 7e13.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_exp_f(gmx_simd_float_t x)
{
const gmx_simd_float_t argscale = gmx_simd_set1_f(1.44269504088896341f);
* This routine achieves very close to full precision, but we do not care about
* the last bit or the subnormal result range.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_erf_f(gmx_simd_float_t x)
{
/* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
* (think results that are in the ballpark of 10^-30 for single precision,
* or 10^-200 for double) since that is not relevant for MD.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_erfc_f(gmx_simd_float_t x)
{
/* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
* magnitudes of the argument we inherently begin to lose accuracy due to the
* argument reduction, despite using extended precision arithmetics internally.
*/
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_sincos_f(gmx_simd_float_t x, gmx_simd_float_t *sinval, gmx_simd_float_t *cosval)
{
/* Constants to subtract Pi/4*x from y while minimizing precision loss */
y = gmx_simd_round_f(z);
mask = gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, ione), gmx_simd_setzero_fi()));
- ssign = gmx_simd_blendzero_f(gmx_simd_set1_f(-0.0f), gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, itwo), itwo)));
- csign = gmx_simd_blendzero_f(gmx_simd_set1_f(-0.0f), gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(gmx_simd_add_fi(iy, ione), itwo), itwo)));
+ ssign = gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, itwo), itwo)));
+ csign = gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(gmx_simd_add_fi(iy, ione), itwo), itwo)));
#else
const gmx_simd_float_t quarter = gmx_simd_set1_f(0.25f);
const gmx_simd_float_t minusquarter = gmx_simd_set1_f(-0.25f);
* active or inactive - you will get errors if only one is used.
*/
# ifdef GMX_SIMD_HAVE_LOGICAL
- ssign = gmx_simd_and_f(ssign, gmx_simd_set1_f(-0.0f));
- csign = gmx_simd_andnot_f(q, gmx_simd_set1_f(-0.0f));
+ ssign = gmx_simd_and_f(ssign, gmx_simd_set1_f(GMX_FLOAT_NEGZERO));
+ csign = gmx_simd_andnot_f(q, gmx_simd_set1_f(GMX_FLOAT_NEGZERO));
ssign = gmx_simd_xor_f(ssign, csign);
# else
csign = gmx_simd_xor_sign_f(gmx_simd_set1_f(-1.0f), q);
* \attention Do NOT call both sin & cos if you need both results, since each of them
* will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_sin_f(gmx_simd_float_t x)
{
gmx_simd_float_t s, c;
* \attention Do NOT call both sin & cos if you need both results, since each of them
* will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_cos_f(gmx_simd_float_t x)
{
gmx_simd_float_t s, c;
* \param x The argument to evaluate tan for
* \result Tan(x)
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_tan_f(gmx_simd_float_t x)
{
const gmx_simd_float_t argred0 = gmx_simd_set1_f(1.5703125);
x = gmx_simd_fnmadd_f(y, argred1, x);
x = gmx_simd_fnmadd_f(y, argred2, x);
x = gmx_simd_fnmadd_f(y, argred3, x);
- x = gmx_simd_xor_f(gmx_simd_blendzero_f(gmx_simd_set1_f(-0.0f), mask), x);
+ x = gmx_simd_xor_f(gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), mask), x);
#else
const gmx_simd_float_t quarter = gmx_simd_set1_f(0.25f);
const gmx_simd_float_t half = gmx_simd_set1_f(0.5f);
* \param x The argument to evaluate asin for
* \result Asin(x)
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_asin_f(gmx_simd_float_t x)
{
const gmx_simd_float_t limitlow = gmx_simd_set1_f(1e-4f);
* \param x The argument to evaluate acos for
* \result Acos(x)
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_acos_f(gmx_simd_float_t x)
{
const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
* \param x The argument to evaluate atan for
* \result Atan(x), same argument/value range as standard math library.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_atan_f(gmx_simd_float_t x)
{
const gmx_simd_float_t halfpi = gmx_simd_set1_f(M_PI/2);
* of any concern in Gromacs, and in particular it will not affect calculations
* of angles from vectors.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_atan2_f(gmx_simd_float_t y, gmx_simd_float_t x)
{
const gmx_simd_float_t pi = gmx_simd_set1_f(M_PI);
* that we can leave out of this routine.
*
* For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument \f$beta r\f$ will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is
- * in this range!
+ * the argument \f$\beta r\f$ will be in the range 0.15 to ~4, which is
+ * the range used for the minimax fit. Use your favorite plotting program
+ * to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is in this range!
*
* We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
* However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
* with the vector connecting the two particles and you have your
* vectorial force to add to the particles.
*
- * This approximation achieves an accuracy slightly lower than 1e-6; when
- * added to \f$1/r\f$ the error will be insignificant.
+ * This approximation achieves an error slightly lower than 1e-6
+ * in single precision and 1e-11 in double precision
+ * for arguments smaller than 16 (\f$\beta r \leq 4 \f$);
+ * when added to \f$1/r\f$ the error will be insignificant.
+ * For \f$\beta r \geq 7206\f$ the return value can be inf or NaN.
*
*/
-static gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_pmecorrF_f(gmx_simd_float_t z2)
{
const gmx_simd_float_t FN6 = gmx_simd_set1_f(-1.7357322914161492954e-8f);
* 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
* and you have your potential.
*
- * This approximation achieves an accuracy slightly lower than 1e-6; when
- * added to \f$1/r\f$ the error will be insignificant.
+ * This approximation achieves an error slightly lower than 1e-6
+ * in single precision and 4e-11 in double precision
+ * for arguments smaller than 16 (\f$ 0.15 \leq \beta r \leq 4 \f$);
+ * for \f$ \beta r \leq 0.15\f$ the error can be twice as high;
+ * when added to \f$1/r\f$ the error will be insignificant.
+ * For \f$\beta r \geq 7142\f$ the return value can be inf or NaN.
*/
-static gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_pmecorrV_f(gmx_simd_float_t z2)
{
const gmx_simd_float_t VN6 = gmx_simd_set1_f(1.9296833005951166339e-8f);
*
* \copydetails gmx_simd_sum4_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_sum4_d(gmx_simd_double_t a, gmx_simd_double_t b,
gmx_simd_double_t c, gmx_simd_double_t d)
{
* with the exception that negative zero is not considered to be negative
* on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_xor_sign_d(gmx_simd_double_t a, gmx_simd_double_t b)
{
#ifdef GMX_SIMD_HAVE_LOGICAL
/* Mask b with a negative-zero constant and XOR the result into a.
 * Presumably the mask keeps only the sign bit of b, so a gets its sign
 * flipped wherever that bit is set (including b = -0.0); confirm against
 * the definitions of the logical and/xor operations.
 */
- return gmx_simd_xor_d(a, gmx_simd_and_d(gmx_simd_set1_d(-0.0), b));
+ return gmx_simd_xor_d(a, gmx_simd_and_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), b));
#else
/* Fallback when logical operations are unavailable: blend between a and
 * its negation based on the comparison b < 0 (presumably the negated value
 * is picked where the comparison holds). Negative zero does not compare
 * less than zero, so it is not treated as negative on this path.
 */
return gmx_simd_blendv_d(a, gmx_simd_fneg_d(a), gmx_simd_cmplt_d(b, gmx_simd_setzero_d()));
#endif
}
+#ifndef gmx_simd_rsqrt_iter_d
/*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD double.
*
* \copydetails gmx_simd_rsqrt_iter_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_rsqrt_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
{
#ifdef GMX_SIMD_HAVE_FMA
return gmx_simd_mul_d(gmx_simd_set1_d(0.5), gmx_simd_mul_d(gmx_simd_sub_d(gmx_simd_set1_d(3.0), gmx_simd_mul_d(gmx_simd_mul_d(lu, lu), x)), lu));
#endif
}
-
+#endif
/*! \brief Calculate 1/sqrt(x) for SIMD double
*
* \copydetails gmx_simd_invsqrt_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_invsqrt_d(gmx_simd_double_t x)
{
gmx_simd_double_t lu = gmx_simd_rsqrt_d(x);
*
* \copydetails gmx_simd_invsqrt_pair_f
*/
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_invsqrt_pair_d(gmx_simd_double_t x0, gmx_simd_double_t x1,
gmx_simd_double_t *out0, gmx_simd_double_t *out1)
{
#endif
}
+#ifndef gmx_simd_rcp_iter_d
/*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD double.
*
* \copydetails gmx_simd_rcp_iter_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_rcp_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
{
/* One Newton-Raphson refinement of the reciprocal estimate lu.
 * NOTE(review): this reads as lu * (2 - lu*x), which assumes
 * gmx_simd_fnmadd_d(a, b, c) evaluates c - a*b; confirm against the
 * SIMD implementation headers.
 */
return gmx_simd_mul_d(lu, gmx_simd_fnmadd_d(lu, x, gmx_simd_set1_d(2.0)));
}
+#endif
/*! \brief Calculate 1/x for SIMD double.
*
* \copydetails gmx_simd_inv_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_inv_d(gmx_simd_double_t x)
{
gmx_simd_double_t lu = gmx_simd_rcp_d(x);
*
* \copydetails gmx_simd_sqrt_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_sqrt_d(gmx_simd_double_t x)
{
gmx_simd_dbool_t mask;
*
* \copydetails gmx_simd_log_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_log_d(gmx_simd_double_t x)
{
const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
*
* \copydetails gmx_simd_exp2_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_exp2_d(gmx_simd_double_t x)
{
const gmx_simd_double_t arglimit = gmx_simd_set1_d(1022.0);
*
* \copydetails gmx_simd_exp_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_exp_d(gmx_simd_double_t x)
{
const gmx_simd_double_t argscale = gmx_simd_set1_d(1.44269504088896340735992468100);
*
* \copydetails gmx_simd_erf_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_erf_d(gmx_simd_double_t x)
{
/* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
*
* \copydetails gmx_simd_erfc_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_erfc_d(gmx_simd_double_t x)
{
/* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
*
* \copydetails gmx_simd_sincos_f
*/
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_sincos_d(gmx_simd_double_t x, gmx_simd_double_t *sinval, gmx_simd_double_t *cosval)
{
/* Constants to subtract Pi/4*x from y while minimizing precision loss */
y = gmx_simd_round_d(z);
mask = gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, ione), gmx_simd_setzero_di()));
- ssign = gmx_simd_blendzero_d(gmx_simd_set1_d(-0.0), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, itwo), itwo)));
- csign = gmx_simd_blendzero_d(gmx_simd_set1_d(-0.0), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(gmx_simd_add_di(iy, ione), itwo), itwo)));
+ ssign = gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, itwo), itwo)));
+ csign = gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(gmx_simd_add_di(iy, ione), itwo), itwo)));
#else
const gmx_simd_double_t quarter = gmx_simd_set1_d(0.25);
const gmx_simd_double_t minusquarter = gmx_simd_set1_d(-0.25);
* active or inactive - you will get errors if only one is used.
*/
# ifdef GMX_SIMD_HAVE_LOGICAL
- ssign = gmx_simd_and_d(ssign, gmx_simd_set1_d(-0.0));
- csign = gmx_simd_andnot_d(q, gmx_simd_set1_d(-0.0));
+ ssign = gmx_simd_and_d(ssign, gmx_simd_set1_d(GMX_DOUBLE_NEGZERO));
+ csign = gmx_simd_andnot_d(q, gmx_simd_set1_d(GMX_DOUBLE_NEGZERO));
ssign = gmx_simd_xor_d(ssign, csign);
# else
csign = gmx_simd_xor_sign_d(gmx_simd_set1_d(-1.0), q);
*
* \copydetails gmx_simd_sin_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_sin_d(gmx_simd_double_t x)
{
gmx_simd_double_t s, c;
*
* \copydetails gmx_simd_cos_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_cos_d(gmx_simd_double_t x)
{
gmx_simd_double_t s, c;
*
* \copydetails gmx_simd_tan_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_tan_d(gmx_simd_double_t x)
{
const gmx_simd_double_t argred0 = gmx_simd_set1_d(2*0.78539816290140151978);
x = gmx_simd_fnmadd_d(y, argred1, x);
x = gmx_simd_fnmadd_d(y, argred2, x);
x = gmx_simd_fnmadd_d(y, argred3, x);
- x = gmx_simd_xor_d(gmx_simd_blendzero_d(gmx_simd_set1_d(-0.0), mask), x);
+ x = gmx_simd_xor_d(gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), mask), x);
#else
const gmx_simd_double_t quarter = gmx_simd_set1_d(0.25);
const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
*
* \copydetails gmx_simd_asin_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_asin_d(gmx_simd_double_t x)
{
/* Same algorithm as cephes library */
*
* \copydetails gmx_simd_acos_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_acos_d(gmx_simd_double_t x)
{
const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
*
* \copydetails gmx_simd_atan_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_atan_d(gmx_simd_double_t x)
{
/* Same algorithm as cephes library */
*
* \copydetails gmx_simd_atan2_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_atan2_d(gmx_simd_double_t y, gmx_simd_double_t x)
{
const gmx_simd_double_t pi = gmx_simd_set1_d(M_PI);
*
* \copydetails gmx_simd_pmecorrF_f
*/
-static gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_pmecorrF_d(gmx_simd_double_t z2)
{
const gmx_simd_double_t FN10 = gmx_simd_set1_d(-8.0072854618360083154e-14);
*
* \copydetails gmx_simd_pmecorrV_f
*/
-static gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_pmecorrV_d(gmx_simd_double_t z2)
{
const gmx_simd_double_t VN9 = gmx_simd_set1_d(-9.3723776169321855475e-13);
*
* \copydetails gmx_simd_sum4_f
*/
-static gmx_inline gmx_simd4_float_t
+static gmx_inline gmx_simd4_float_t gmx_simdcall
gmx_simd4_sum4_f(gmx_simd4_float_t a, gmx_simd4_float_t b,
gmx_simd4_float_t c, gmx_simd4_float_t d)
{
*
* \copydetails gmx_simd_rsqrt_iter_f
*/
-static gmx_inline gmx_simd4_float_t
+static gmx_inline gmx_simd4_float_t gmx_simdcall
gmx_simd4_rsqrt_iter_f(gmx_simd4_float_t lu, gmx_simd4_float_t x)
{
# ifdef GMX_SIMD_HAVE_FMA
*
* \copydetails gmx_simd_invsqrt_f
*/
-static gmx_inline gmx_simd4_float_t
+static gmx_inline gmx_simd4_float_t gmx_simdcall
gmx_simd4_invsqrt_f(gmx_simd4_float_t x)
{
gmx_simd4_float_t lu = gmx_simd4_rsqrt_f(x);
*
* \copydetails gmx_simd_sum4_f
*/
-static gmx_inline gmx_simd4_double_t
+static gmx_inline gmx_simd4_double_t gmx_simdcall
gmx_simd4_sum4_d(gmx_simd4_double_t a, gmx_simd4_double_t b,
gmx_simd4_double_t c, gmx_simd4_double_t d)
{
*
* \copydetails gmx_simd_rsqrt_iter_f
*/
-static gmx_inline gmx_simd4_double_t
+static gmx_inline gmx_simd4_double_t gmx_simdcall
gmx_simd4_rsqrt_iter_d(gmx_simd4_double_t lu, gmx_simd4_double_t x)
{
#ifdef GMX_SIMD_HAVE_FMA
*
* \copydetails gmx_simd_invsqrt_f
*/
-static gmx_inline gmx_simd4_double_t
+static gmx_inline gmx_simd4_double_t gmx_simdcall
gmx_simd4_invsqrt_d(gmx_simd4_double_t x)
{
gmx_simd4_double_t lu = gmx_simd4_rsqrt_d(x);