Merge branch 'release-2016'

[alexxy/gromacs.git] / src / gromacs / simd / simd_math.h
diff --git a/src/gromacs/simd/simd_math.h b/src/gromacs/simd/simd_math.h

index b11e0edc9bfa1604061db11e254afecd207ee1e9..e3f1db68258b361d1753d6ceedca8686b32a53b7 100644 (file)
--- a/src/gromacs/simd/simd_math.h
+++ b/src/gromacs/simd/simd_math.h
@@ -131,7 +131,15 @@ rsqrtIter(SimdFloat lu, SimdFloat x)
  
  /*! \brief Calculate 1/sqrt(x) for SIMD float.
   *
- *  \param x Argument that must be >0. This routine does not check arguments.
+ *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *           For the single precision implementation this is obviously always
+ *           true for positive values, but for double precision it adds an
+ *           extra restriction since the first lookup step might have to be
+ *           performed in single precision on some architectures. Note that the
+ *           responsibility for checking falls on you - this routine does not
+ *           check arguments.
+ *
   *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
   */
  static inline SimdFloat gmx_simdcall
@@ -152,13 +160,22 @@ invsqrt(SimdFloat x)
  
  /*! \brief Calculate 1/sqrt(x) for two SIMD floats.
   *
- * \param x0  First set of arguments, x0 must be positive - no argument checking.
- * \param x1  Second set of arguments, x1 must be positive - no argument checking.
+ * \param x0  First set of arguments, x0 must be in single range (see below).
+ * \param x1  Second set of arguments, x1 must be in single range (see below).
   * \param[out] out0  Result 1/sqrt(x0)
   * \param[out] out1  Result 1/sqrt(x1)
   *
   *  In particular for double precision we can sometimes calculate square root
   *  pairs slightly faster by using single precision until the very last step.
+ *
+ * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
+ *       GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *       For the single precision implementation this is obviously always
+ *       true for positive values, but for double precision it adds an
+ *       extra restriction since the first lookup step might have to be
+ *       performed in single precision on some architectures. Note that the
+ *       responsibility for checking falls on you - this routine does not
+ *       check arguments.
   */
  static inline void gmx_simdcall
  invsqrtPair(SimdFloat x0,    SimdFloat x1,
@@ -187,7 +204,15 @@ rcpIter(SimdFloat lu, SimdFloat x)
  
  /*! \brief Calculate 1/x for SIMD float.
   *
- *  \param x Argument that must be nonzero. This routine does not check arguments.
+ *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *           For the single precision implementation this is obviously always
+ *           true for positive values, but for double precision it adds an
+ *           extra restriction since the first lookup step might have to be
+ *           performed in single precision on some architectures. Note that the
+ *           responsibility for checking falls on you - this routine does not
+ *           check arguments.
+ *
   *  \return 1/x. Result is undefined if your argument was invalid.
   */
  static inline SimdFloat gmx_simdcall
@@ -209,7 +234,12 @@ inv(SimdFloat x)
  /*! \brief Division for SIMD floats
   *
   * \param nom    Nominator
- * \param denom  Denominator
+ * \param denom  Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
+ *               For single precision this is equivalent to a nonzero argument,
+ *               but in double precision it adds an extra restriction since
+ *               the first lookup step might have to be performed in single
+ *               precision on some architectures. Note that the responsibility
+ *               for checking falls on you - this routine does not check arguments.
   *
   * \return nom/denom
   *
@@ -228,7 +258,9 @@ operator/(SimdFloat nom, SimdFloat denom)
   *  Illegal values in the masked-out elements will not lead to
   *  floating-point exceptions.
   *
- *  \param x Argument that must be >0 for masked-in entries
+ *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX for masked-in entries.
+ *           See \ref invsqrt for the discussion about argument restrictions.
   *  \param m Mask
   *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
   *          entry was not masked, and 0.0 for masked-out entries.
@@ -251,7 +283,9 @@ maskzInvsqrt(SimdFloat x, SimdFBool m)
  
  /*! \brief Calculate 1/x for SIMD float, masked version.
   *
- *  \param x Argument that must be nonzero for non-masked entries.
+ *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX for masked-in entries.
+ *           See \ref invsqrt for the discussion about argument restrictions.
   *  \param m Mask
   *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
   */
@@ -271,17 +305,37 @@ maskzInv(SimdFloat x, SimdFBool m)
      return lu;
  }
  
-/*! \brief Calculate sqrt(x) correctly for SIMD floats, including argument 0.0.
+/*! \brief Calculate sqrt(x) for SIMD floats
+ *
+ *  \tparam opt By default, this function checks if the input value is 0.0
+ *              and masks this to return the correct result. If you are certain
+ *              your argument will never be zero, and you know you need to save
+ *              every single cycle you can, you can alternatively call the
+ *              function as sqrt<MathOptimization::Unsafe>(x).
+ *
+ *  \param  x   Argument that must be in range 0 <=x <= GMX_FLOAT_MAX, since the
+ *              lookup step often has to be implemented in single precision.
+ *              Arguments smaller than GMX_FLOAT_MIN will always lead to a zero
+ *              result, even in double precision. If you are using the unsafe
+ *              math optimization parameter, the argument must be in the range
+ *              GMX_FLOAT_MIN <= x <= GMX_FLOAT_MAX.
   *
- *  \param x Argument that must be >=0.
- *  \return sqrt(x). If x=0, the result will correctly be set to 0.
- *          The result is undefined if the input value is negative.
+ *  \return sqrt(x). The result is undefined if the input value does not fall
+ *          in the allowed range specified for the argument.
   */
+template <MathOptimization opt = MathOptimization::Safe>
  static inline SimdFloat gmx_simdcall
  sqrt(SimdFloat x)
  {
-    SimdFloat  res = maskzInvsqrt(x, setZero() < x);
-    return res*x;
+    if (opt == MathOptimization::Safe)
+    {
+        SimdFloat  res = maskzInvsqrt(x, setZero() < x);
+        return res*x;
+    }
+    else
+    {
+        return x * invsqrt(x);
+    }
  }
  
  #if !GMX_SIMD_HAVE_NATIVE_LOG_FLOAT
@@ -328,16 +382,44 @@ log(SimdFloat x)
  #endif
  
  #if !GMX_SIMD_HAVE_NATIVE_EXP2_FLOAT
-/*! \brief SIMD float 2^x.
- *
- * \param x Argument.
- * \result 2^x. Undefined if input argument caused overflow.
+/*! \brief SIMD float 2^x
+ *
+ * \tparam opt If this is changed from the default (safe) into the unsafe
+ *             option, input values that would otherwise lead to zero-clamped
+ *             results are not allowed and will lead to undefined results.
+ *
+ * \param x Argument. For the default (safe) function version this can be
+ *          arbitrarily small value, but the routine might clamp the result to
+ *          zero for arguments that would produce subnormal IEEE754-2008 results.
+ *          This corresponds to inputs below -126 in single or -1022 in double,
+ *          and it might overflow for arguments reaching 127 (single) or
+ *          1023 (double). If you enable the unsafe math optimization,
+ *          very small arguments will not necessarily be zero-clamped, but
+ *          can produce undefined results.
+ *
+ * \result 2^x. The result is undefined for very large arguments that cause
+ *          internal floating-point overflow. If unsafe optimizations are enabled,
+ *          this is also true for very small values.
+ *
+ * \note    The definition range of this function is just-so-slightly smaller
+ *          than the allowed IEEE exponents for many architectures. This is due
+ *          to the implementation, which will hopefully improve in the future.
+ *
+ * \warning You cannot rely on this implementation returning inf for arguments
+ *          that cause overflow. If you have some very large
+ *          values and need to rely on getting a valid numerical output,
+ *          take the minimum of your variable and the largest valid argument
+ *          before calling this routine.
   */
+template <MathOptimization opt = MathOptimization::Safe>
  static inline SimdFloat gmx_simdcall
  exp2(SimdFloat x)
  {
-    // Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127.
-    const SimdFloat  arglimit(126.0f);
+    // Lower bound: Clamp args that would lead to an IEEE fp exponent below -127.
+    // Note that we use a slightly wider range inside this *implementation* compared
+    // to that guaranteed by the API documentation in the comment above (which has
+    // to cover all architectures).
+    const SimdFloat  smallArgLimit(-127.0f);
      const SimdFloat  CC6(0.0001534581200287996416911311f);
      const SimdFloat  CC5(0.001339993121934088894618990f);
      const SimdFloat  CC4(0.009618488957115180159497841f);
@@ -349,12 +431,30 @@ exp2(SimdFloat x)
      SimdFloat        intpart;
      SimdFloat        fexppart;
      SimdFloat        p;
-    SimdFBool        m;
+
+    // Large negative values are valid arguments to exp2(), so there are two
+    // things we need to account for:
+    // 1. When the exponents reaches -127, the (biased) exponent field will be
+    //    zero and we can no longer multiply with it. There are special IEEE
+    //    formats to handle this range, but for now we have to accept that
+    //    we cannot handle those arguments. If input value becomes even more
+    //    negative, it will start to loop and we would end up with invalid
+    //    exponents. Thus, we need to limit or mask this.
+    // 2. For VERY large negative values, we will have problems that the
+    //    subtraction to get the fractional part loses accuracy, and then we
+    //    can end up with overflows in the polynomial.
+    //
+    // For now, we handle this by clamping smaller arguments to -127. At this
+    // point we will already return zero, so we don't need to do anything
+    // extra for the exponent.
+
+    if (opt == MathOptimization::Safe)
+    {
+        x         = max(x, smallArgLimit);
+    }
  
      fexppart  = ldexp(one, cvtR2I(x));
      intpart   = round(x);
-    m         = abs(x) <= arglimit;
-    fexppart  = selectByMask(fexppart, m);
      x         = x - intpart;
  
      p         = fma(CC6, x, CC5);
@@ -374,16 +474,44 @@ exp2(SimdFloat x)
   * In addition to scaling the argument for 2^x this routine correctly does
   * extended precision arithmetics to improve accuracy.
   *
- * \param x Argument.
- * \result exp(x). Undefined if input argument caused overflow,
- * which can happen if abs(x) \> 7e13.
+ * \tparam opt If this is changed from the default (safe) into the unsafe
+ *             option, input values that would otherwise lead to zero-clamped
+ *             results are not allowed and will lead to undefined results.
+ *
+ * \param x Argument. For the default (safe) function version this can be
+ *          arbitrarily small value, but the routine might clamp the result to
+ *          zero for arguments that would produce subnormal IEEE754-2008 results.
+ *          This corresponds to input arguments reaching
+ *          -126*ln(2)=-87.3 in single, or -1022*ln(2)=-708.4 (double).
+ *          Similarly, it might overflow for arguments reaching
+ *          127*ln(2)=88.0 (single) or 1023*ln(2)=709.1 (double). If the
+ *          unsafe math optimizations are enabled, small input values that would
+ *          result in zero-clamped output are not allowed.
+ *
+ * \result exp(x). Overflowing arguments are likely to either return 0 or inf,
+ *         depending on the underlying implementation. If unsafe optimizations
+ *         are enabled, this is also true for very small values.
+ *
+ * \note    The definition range of this function is just-so-slightly smaller
+ *          than the allowed IEEE exponents for many architectures. This is due
+ *          to the implementation, which will hopefully improve in the future.
+ *
+ * \warning You cannot rely on this implementation returning inf for arguments
+ *          that cause overflow. If you have some very large
+ *          values and need to rely on getting a valid numerical output,
+ *          take the minimum of your variable and the largest valid argument
+ *          before calling this routine.
   */
+template <MathOptimization opt = MathOptimization::Safe>
  static inline SimdFloat gmx_simdcall
  exp(SimdFloat x)
  {
      const SimdFloat  argscale(1.44269504088896341f);
-    // Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127.
-    const SimdFloat  arglimit(126.0f);
+    // Lower bound: Clamp args that would lead to an IEEE fp exponent below -127.
+    // Note that we use a slightly wider range inside this *implementation* compared
+    // to that guaranteed by the API documentation in the comment above (which has
+    // to cover all architectures).
+    const SimdFloat  smallArgLimit(-88.0296919311f);
      const SimdFloat  invargscale0(-0.693145751953125f);
      const SimdFloat  invargscale1(-1.428606765330187045e-06f);
      const SimdFloat  CC4(0.00136324646882712841033936f);
@@ -395,13 +523,31 @@ exp(SimdFloat x)
      SimdFloat        fexppart;
      SimdFloat        intpart;
      SimdFloat        y, p;
-    SimdFBool        m;
+
+    // Large negative values are valid arguments to exp2(), so there are two
+    // things we need to account for:
+    // 1. When the IEEE exponents reaches -127, the (biased) exponent field will be
+    //    zero and we can no longer multiply with it. There are special IEEE
+    //    formats to handle this range, but for now we have to accept that
+    //    we cannot handle those arguments. If input value becomes even more
+    //    negative, it will start to loop and we would end up with invalid
+    //    exponents. Thus, we need to limit or mask this.
+    // 2. For VERY large negative values, we will have problems that the
+    //    subtraction to get the fractional part loses accuracy, and then we
+    //    can end up with overflows in the polynomial.
+    //
+    // For now, we handle this by clamping smaller arguments to -127*ln(2). At this
+    // point we will already return zero, so we don't need to do anything
+    // extra for the exponent.
+
+    if (opt == MathOptimization::Safe)
+    {
+        x         = max(x, smallArgLimit);
+    }
  
      y         = x * argscale;
      fexppart  = ldexp(one, cvtR2I(y));
      intpart   = round(y);
-    m         = (abs(y) <= arglimit);
-    fexppart  = selectByMask(fexppart, m);
  
      // Extended precision arithmetics
      x         = fma(invargscale0, intpart, x);
@@ -1357,7 +1503,15 @@ rsqrtIter(SimdDouble lu, SimdDouble x)
  
  /*! \brief Calculate 1/sqrt(x) for SIMD double.
   *
- *  \param x Argument that must be >0. This routine does not check arguments.
+ *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *           For the single precision implementation this is obviously always
+ *           true for positive values, but for double precision it adds an
+ *           extra restriction since the first lookup step might have to be
+ *           performed in single precision on some architectures. Note that the
+ *           responsibility for checking falls on you - this routine does not
+ *           check arguments.
+ *
   *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
   */
  static inline SimdDouble gmx_simdcall
@@ -1381,13 +1535,22 @@ invsqrt(SimdDouble x)
  
  /*! \brief Calculate 1/sqrt(x) for two SIMD doubles.
   *
- * \param x0  First set of arguments, x0 must be positive - no argument checking.
- * \param x1  Second set of arguments, x1 must be positive - no argument checking.
+ * \param x0  First set of arguments, x0 must be in single range (see below).
+ * \param x1  Second set of arguments, x1 must be in single range (see below).
   * \param[out] out0  Result 1/sqrt(x0)
   * \param[out] out1  Result 1/sqrt(x1)
   *
   *  In particular for double precision we can sometimes calculate square root
   *  pairs slightly faster by using single precision until the very last step.
+ *
+ * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
+ *       GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *       For the single precision implementation this is obviously always
+ *       true for positive values, but for double precision it adds an
+ *       extra restriction since the first lookup step might have to be
+ *       performed in single precision on some architectures. Note that the
+ *       responsibility for checking falls on you - this routine does not
+ *       check arguments.
   */
  static inline void gmx_simdcall
  invsqrtPair(SimdDouble x0,    SimdDouble x1,
@@ -1444,7 +1607,15 @@ rcpIter(SimdDouble lu, SimdDouble x)
  
  /*! \brief Calculate 1/x for SIMD double.
   *
- *  \param x Argument that must be nonzero. This routine does not check arguments.
+ *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *           For the single precision implementation this is obviously always
+ *           true for positive values, but for double precision it adds an
+ *           extra restriction since the first lookup step might have to be
+ *           performed in single precision on some architectures. Note that the
+ *           responsibility for checking falls on you - this routine does not
+ *           check arguments.
+ *
   *  \return 1/x. Result is undefined if your argument was invalid.
   */
  static inline SimdDouble gmx_simdcall
@@ -1469,7 +1640,12 @@ inv(SimdDouble x)
  /*! \brief Division for SIMD doubles
   *
   * \param nom    Nominator
- * \param denom  Denominator
+ * \param denom  Denominator, with magnitude in range (GMX_FLOAT_MIN,GMX_FLOAT_MAX).
+ *               For single precision this is equivalent to a nonzero argument,
+ *               but in double precision it adds an extra restriction since
+ *               the first lookup step might have to be performed in single
+ *               precision on some architectures. Note that the responsibility
+ *               for checking falls on you - this routine does not check arguments.
   *
   * \return nom/denom
   *
@@ -1489,7 +1665,9 @@ operator/(SimdDouble nom, SimdDouble denom)
   *  Illegal values in the masked-out elements will not lead to
   *  floating-point exceptions.
   *
- *  \param x Argument that must be >0 for masked-in entries
+ *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX for masked-in entries.
+ *           See \ref invsqrt for the discussion about argument restrictions.
   *  \param m Mask
   *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
   *          entry was not masked, and 0.0 for masked-out entries.
@@ -1515,7 +1693,9 @@ maskzInvsqrt(SimdDouble x, SimdDBool m)
  
  /*! \brief Calculate 1/x for SIMD double, masked version.
   *
- *  \param x Argument that must be nonzero for non-masked entries.
+ *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX for masked-in entries.
+ *           See \ref invsqrt for the discussion about argument restrictions.
   *  \param m Mask
   *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
   */
@@ -1539,18 +1719,24 @@ maskzInv(SimdDouble x, SimdDBool m)
  }
  
  
-/*! \brief Calculate sqrt(x) correctly for SIMD doubles, including argument 0.0.
+/*! \brief Calculate sqrt(x) for SIMD doubles.
   *
- *  \param x Argument that must be >=0.
- *  \return sqrt(x). If x=0, the result will correctly be set to 0.
- *          if x>0 && x<float_min, the result will incorrectly be set to 0.
- *          The result is undefined if the input value is negative.
+ *  \copydetails sqrt(SimdFloat)
   */
+template <MathOptimization opt = MathOptimization::Safe>
  static inline SimdDouble gmx_simdcall
  sqrt(SimdDouble x)
  {
-    // As we might use a float version of rsqrt, we mask out small values
-    return x * maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) <= x);
+    if (opt == MathOptimization::Safe)
+    {
+        // As we might use a float version of rsqrt, we mask out small values
+        SimdDouble res = maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) < x);
+        return res*x;
+    }
+    else
+    {
+        return x * invsqrt(x);
+    }
  }
  
  #if !GMX_SIMD_HAVE_NATIVE_LOG_DOUBLE
@@ -1605,13 +1791,14 @@ log(SimdDouble x)
  #if !GMX_SIMD_HAVE_NATIVE_EXP2_DOUBLE
  /*! \brief SIMD double 2^x.
   *
- * \param x Argument.
- * \result 2^x. Undefined if input argument caused overflow.
+ * \copydetails exp2(SimdFloat)
   */
+template <MathOptimization opt = MathOptimization::Safe>
  static inline SimdDouble gmx_simdcall
  exp2(SimdDouble x)
  {
-    const SimdDouble  arglimit(1022.0);
+    // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
+    const SimdDouble  smallArgLimit(-1023.0);
      const SimdDouble  CE11(4.435280790452730022081181e-10);
      const SimdDouble  CE10(7.074105630863314448024247e-09);
      const SimdDouble  CE9(1.017819803432096698472621e-07);
@@ -1628,12 +1815,30 @@ exp2(SimdDouble x)
      SimdDouble        intpart;
      SimdDouble        fexppart;
      SimdDouble        p;
-    SimdDBool         m;
+
+    // Large negative values are valid arguments to exp2(), so there are two
+    // things we need to account for:
+    // 1. When the exponents reaches -1023, the (biased) exponent field will be
+    //    zero and we can no longer multiply with it. There are special IEEE
+    //    formats to handle this range, but for now we have to accept that
+    //    we cannot handle those arguments. If input value becomes even more
+    //    negative, it will start to loop and we would end up with invalid
+    //    exponents. Thus, we need to limit or mask this.
+    // 2. For VERY large negative values, we will have problems that the
+    //    subtraction to get the fractional part loses accuracy, and then we
+    //    can end up with overflows in the polynomial.
+    //
+    // For now, we handle this by clamping smaller arguments to -1023. At this
+    // point we will already return zero, so we don't need to do anything
+    // extra for the exponent.
+
+    if (opt == MathOptimization::Safe)
+    {
+        x         = max(x, smallArgLimit);
+    }
  
      fexppart  = ldexp(one, cvtR2I(x));
      intpart   = round(x);
-    m         = abs(x) <= arglimit;
-    fexppart  = selectByMask(fexppart, m);
      x         = x - intpart;
  
      p         = fma(CE11, x, CE10);
@@ -1655,18 +1860,15 @@ exp2(SimdDouble x)
  #if !GMX_SIMD_HAVE_NATIVE_EXP_DOUBLE
  /*! \brief SIMD double exp(x).
   *
- * In addition to scaling the argument for 2^x this routine correctly does
- * extended precision arithmetics to improve accuracy.
- *
- * \param x Argument.
- * \result exp(x). Undefined if input argument caused overflow,
- * which can happen if abs(x) \> 7e13.
+ * \copydetails exp(SimdFloat)
   */
+template <MathOptimization opt = MathOptimization::Safe>
  static inline SimdDouble gmx_simdcall
  exp(SimdDouble x)
  {
      const SimdDouble  argscale(1.44269504088896340735992468100);
-    const SimdDouble  arglimit(1022.0);
+    // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
+    const SimdDouble  smallArgLimit(-709.0895657128);
      const SimdDouble  invargscale0(-0.69314718055966295651160180568695068359375);
      const SimdDouble  invargscale1(-2.8235290563031577122588448175013436025525412068e-13);
      const SimdDouble  CE12(2.078375306791423699350304e-09);
@@ -1684,13 +1886,31 @@ exp(SimdDouble x)
      SimdDouble        fexppart;
      SimdDouble        intpart;
      SimdDouble        y, p;
-    SimdDBool         m;
+
+    // Large negative values are valid arguments to exp2(), so there are two
+    // things we need to account for:
+    // 1. When the exponents reaches -1023, the (biased) exponent field will be
+    //    zero and we can no longer multiply with it. There are special IEEE
+    //    formats to handle this range, but for now we have to accept that
+    //    we cannot handle those arguments. If input value becomes even more
+    //    negative, it will start to loop and we would end up with invalid
+    //    exponents. Thus, we need to limit or mask this.
+    // 2. For VERY large negative values, we will have problems that the
+    //    subtraction to get the fractional part loses accuracy, and then we
+    //    can end up with overflows in the polynomial.
+    //
+    // For now, we handle this by clamping smaller arguments to -1023*ln(2). At this
+    // point we will already return zero, so we don't need to do anything
+    // extra for the exponent.
+
+    if (opt == MathOptimization::Safe)
+    {
+        x         = max(x, smallArgLimit);
+    }
  
      y         = x * argscale;
      fexppart  = ldexp(one, cvtR2I(y));
      intpart   = round(y);
-    m         = (abs(y) <= arglimit);
-    fexppart  = selectByMask(fexppart, m);
  
      // Extended precision arithmetics
      x         = fma(invargscale0, intpart, x);
@@ -2697,7 +2917,15 @@ pmePotentialCorrection(SimdDouble z2)
  
  /*! \brief Calculate 1/sqrt(x) for SIMD double, but in single accuracy.
   *
- *  \param x Argument that must be >0. This routine does not check arguments.
+ *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *           For the single precision implementation this is obviously always
+ *           true for positive values, but for double precision it adds an
+ *           extra restriction since the first lookup step might have to be
+ *           performed in single precision on some architectures. Note that the
+ *           responsibility for checking falls on you - this routine does not
+ *           check arguments.
+ *
   *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
   */
  static inline SimdDouble gmx_simdcall
@@ -2722,7 +2950,15 @@ invsqrtSingleAccuracy(SimdDouble x)
   *  Illegal values in the masked-out elements will not lead to
   *  floating-point exceptions.
   *
- *  \param x Argument that must be >0 for masked-in entries
+ *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *           For the single precision implementation this is obviously always
+ *           true for positive values, but for double precision it adds an
+ *           extra restriction since the first lookup step might have to be
+ *           performed in single precision on some architectures. Note that the
+ *           responsibility for checking falls on you - this routine does not
+ *           check arguments.
+ *
   *  \param m Mask
   *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
   *          entry was not masked, and 0.0 for masked-out entries.
@@ -2745,13 +2981,22 @@ maskzInvsqrtSingleAccuracy(SimdDouble x, SimdDBool m)
  
  /*! \brief Calculate 1/sqrt(x) for two SIMD doubles, but single accuracy.
   *
- * \param x0  First set of arguments, x0 must be positive - no argument checking.
- * \param x1  Second set of arguments, x1 must be positive - no argument checking.
+ * \param x0  First set of arguments, x0 must be in single range (see below).
+ * \param x1  Second set of arguments, x1 must be in single range (see below).
   * \param[out] out0  Result 1/sqrt(x0)
   * \param[out] out1  Result 1/sqrt(x1)
   *
   *  In particular for double precision we can sometimes calculate square root
   *  pairs slightly faster by using single precision until the very last step.
+ *
+ * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
+ *       GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *       For the single precision implementation this is obviously always
+ *       true for positive values, but for double precision it adds an
+ *       extra restriction since the first lookup step might have to be
+ *       performed in single precision on some architectures. Note that the
+ *       responsibility for checking falls on you - this routine does not
+ *       check arguments.
   */
  static inline void gmx_simdcall
  invsqrtPairSingleAccuracy(SimdDouble x0,    SimdDouble x1,
@@ -2783,7 +3028,15 @@ invsqrtPairSingleAccuracy(SimdDouble x0,    SimdDouble x1,
  
  /*! \brief Calculate 1/x for SIMD double, but in single accuracy.
   *
- *  \param x Argument that must be nonzero. This routine does not check arguments.
+ *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *           For the single precision implementation this is obviously always
+ *           true for positive values, but for double precision it adds an
+ *           extra restriction since the first lookup step might have to be
+ *           performed in single precision on some architectures. Note that the
+ *           responsibility for checking falls on you - this routine does not
+ *           check arguments.
+ *
   *  \return 1/x. Result is undefined if your argument was invalid.
   */
  static inline SimdDouble gmx_simdcall
@@ -2804,7 +3057,15 @@ invSingleAccuracy(SimdDouble x)
  
  /*! \brief 1/x for masked entries of SIMD double, single accuracy.
   *
- *  \param x Argument that must be nonzero for non-masked entries.
+ *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *           For the single precision implementation this is obviously always
+ *           true for positive values, but for double precision it adds an
+ *           extra restriction since the first lookup step might have to be
+ *           performed in single precision on some architectures. Note that the
+ *           responsibility for checking falls on you - this routine does not
+ *           check arguments.
+ *
   *  \param m Mask
   *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
   */
@@ -2825,18 +3086,26 @@ maskzInvSingleAccuracy(SimdDouble x, SimdDBool m)
  }
  
  
-/*! \brief Calculate sqrt(x) (correct for 0.0) for SIMD double, single accuracy.
+/*! \brief Calculate sqrt(x) (correct for 0.0) for SIMD double, with single accuracy.
   *
- *  \param x Argument that must be >=0.
- *  \return sqrt(x). If x<float_min, the result will correctly be set to 0.
- *          The result is undefined if the input value is negative.
+ *  \copydetails sqrt(SimdFloat)
   */
+template <MathOptimization opt = MathOptimization::Safe>
  static inline SimdDouble gmx_simdcall
  sqrtSingleAccuracy(SimdDouble x)
  {
-    return x * maskzInvsqrtSingleAccuracy(x, SimdDouble(GMX_FLOAT_MIN) <= x);
+    if (opt == MathOptimization::Safe)
+    {
+        SimdDouble res = maskzInvsqrt(x, SimdDouble(GMX_FLOAT_MIN) < x);
+        return res*x;
+    }
+    else
+    {
+        return x * invsqrtSingleAccuracy(x);
+    }
  }
  
+
  /*! \brief SIMD log(x). Double precision SIMD data, single accuracy.
   *
   * \param x Argument, should be >0.
@@ -2878,16 +3147,16 @@ logSingleAccuracy(SimdDouble x)
      return p;
  }
  
-/*! \brief SIMD 2^x. Double precision SIMD data, single accuracy.
+/*! \brief SIMD 2^x. Double precision SIMD, single accuracy.
   *
- * \param x Argument.
- * \result 2^x. Undefined if input argument caused overflow.
+ * \copydetails exp2(SimdFloat)
   */
+template <MathOptimization opt = MathOptimization::Safe>
  static inline SimdDouble gmx_simdcall
  exp2SingleAccuracy(SimdDouble x)
  {
-    // Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127.
-    const SimdDouble  arglimit = SimdDouble(126.0);
+    // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
+    const SimdDouble  smallArgLimit(-1023.0);
      const SimdDouble  CC6(0.0001534581200287996416911311);
      const SimdDouble  CC5(0.001339993121934088894618990);
      const SimdDouble  CC4(0.009618488957115180159497841);
@@ -2898,12 +3167,31 @@ exp2SingleAccuracy(SimdDouble x)
  
      SimdDouble        intpart;
      SimdDouble        p;
-    SimdDBool         valuemask;
      SimdDInt32        ix;
  
+    // Large negative values are valid arguments to exp2(), so there are two
+    // things we need to account for:
+    // 1. When the exponents reaches -1023, the (biased) exponent field will be
+    //    zero and we can no longer multiply with it. There are special IEEE
+    //    formats to handle this range, but for now we have to accept that
+    //    we cannot handle those arguments. If input value becomes even more
+    //    negative, it will start to loop and we would end up with invalid
+    //    exponents. Thus, we need to limit or mask this.
+    // 2. For VERY large negative values, we will have problems that the
+    //    subtraction to get the fractional part loses accuracy, and then we
+    //    can end up with overflows in the polynomial.
+    //
+    // For now, we handle this by clamping smaller arguments to -1023. At this
+    // point we will already return zero, so we don't need to do anything
+    // extra for the exponent.
+
+    if (opt == MathOptimization::Safe)
+    {
+        x         = max(x, smallArgLimit);
+    }
+
      ix        = cvtR2I(x);
      intpart   = round(x);
-    valuemask = (abs(x) <= arglimit);
      x         = x - intpart;
  
      p         = fma(CC6, x, CC5);
@@ -2913,22 +3201,23 @@ exp2SingleAccuracy(SimdDouble x)
      p         = fma(p, x, CC1);
      p         = fma(p, x, one);
      x         = ldexp(p, ix);
-    x         = selectByMask(x, valuemask);
  
      return x;
  }
  
-/*! \brief SIMD exp(x). Double precision SIMD data, single accuracy.
+
+
+/*! \brief SIMD exp(x). Double precision SIMD, single accuracy.
   *
- * \param x Argument.
- * \result exp(x). Undefined if input argument caused overflow.
+ * \copydetails exp(SimdFloat)
   */
+template <MathOptimization opt = MathOptimization::Safe>
  static inline SimdDouble gmx_simdcall
  expSingleAccuracy(SimdDouble x)
  {
      const SimdDouble  argscale(1.44269504088896341);
-    // Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127
-    const SimdDouble  arglimit(126.0);
+    // Lower bound: Clamp args that would lead to an IEEE fp exponent below -1023.
+    const SimdDouble  smallArgLimit(-709.0895657128);
      const SimdDouble  invargscale(-0.69314718055994528623);
      const SimdDouble  CC4(0.00136324646882712841033936);
      const SimdDouble  CC3(0.00836596917361021041870117);
@@ -2938,13 +3227,32 @@ expSingleAccuracy(SimdDouble x)
      const SimdDouble  one(1.0);
      SimdDouble        intpart;
      SimdDouble        y, p;
-    SimdDBool         valuemask;
      SimdDInt32        iy;
  
+    // Large negative values are valid arguments to exp2(), so there are two
+    // things we need to account for:
+    // 1. When the exponents reaches -127, the (biased) exponent field will be
+    //    zero and we can no longer multiply with it. There are special IEEE
+    //    formats to handle this range, but for now we have to accept that
+    //    we cannot handle those arguments. If input value becomes even more
+    //    negative, it will start to loop and we would end up with invalid
+    //    exponents. Thus, we need to limit or mask this.
+    // 2. For VERY large negative values, we will have problems that the
+    //    subtraction to get the fractional part loses accuracy, and then we
+    //    can end up with overflows in the polynomial.
+    //
+    // For now, we handle this by clamping smaller arguments to -127. At this
+    // point we will already return zero, so we don't need to do anything
+    // extra for the exponent.
+
+    if (opt == MathOptimization::Safe)
+    {
+        x         = max(x, smallArgLimit);
+    }
+
      y         = x * argscale;
      iy        = cvtR2I(y);
      intpart   = round(y);        // use same rounding algorithm here
-    valuemask = (abs(y) <= arglimit);
  
      // Extended precision arithmetics not needed since
      // we have double precision and only need single accuracy.
@@ -2957,10 +3265,10 @@ expSingleAccuracy(SimdDouble x)
      p         = fma(x*x, p, x);
      p         = p + one;
      x         = ldexp(p, iy);
-    x         = selectByMask(x, valuemask);
      return x;
  }
  
+
  /*! \brief SIMD erf(x). Double precision SIMD data, single accuracy.
   *
   * \param x The value to calculate erf(x) for.
@@ -3660,7 +3968,7 @@ atan2SingleAccuracy(SimdDouble y, SimdDouble x)
   * added to \f$1/r\f$ the error will be insignificant.
   *
   */
-static SimdDouble gmx_simdcall
+static inline SimdDouble gmx_simdcall
  pmeForceCorrectionSingleAccuracy(SimdDouble z2)
  {
      const SimdDouble  FN6(-1.7357322914161492954e-8);
@@ -3732,7 +4040,7 @@ pmeForceCorrectionSingleAccuracy(SimdDouble z2)
   * This approximation achieves an accuracy slightly lower than 1e-6; when
   * added to \f$1/r\f$ the error will be insignificant.
   */
-static SimdDouble gmx_simdcall
+static inline SimdDouble gmx_simdcall
  pmePotentialCorrectionSingleAccuracy(SimdDouble z2)
  {
      const SimdDouble  VN6(1.9296833005951166339e-8);
@@ -3805,8 +4113,15 @@ rsqrtIter(Simd4Float lu, Simd4Float x)
  
  /*! \brief Calculate 1/sqrt(x) for SIMD4 float.
   *
- *  \param x Argument that must be >0. This routine does not check arguments.
- *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
+ *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *           For the single precision implementation this is obviously always
+ *           true for positive values, but for double precision it adds an
+ *           extra restriction since the first lookup step might have to be
+ *           performed in single precision on some architectures. Note that the
+ *           responsibility for checking falls on you - this routine does not
+ *           check arguments.
+ *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
   */
  static inline Simd4Float gmx_simdcall
  invsqrt(Simd4Float x)
@@ -3854,8 +4169,15 @@ rsqrtIter(Simd4Double lu, Simd4Double x)
  
  /*! \brief Calculate 1/sqrt(x) for SIMD4 double.
   *
- *  \param x Argument that must be >0. This routine does not check arguments.
- *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
+ *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *           For the single precision implementation this is obviously always
+ *           true for positive values, but for double precision it adds an
+ *           extra restriction since the first lookup step might have to be
+ *           performed in single precision on some architectures. Note that the
+ *           responsibility for checking falls on you - this routine does not
+ *           check arguments.
+ *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
   */
  static inline Simd4Double gmx_simdcall
  invsqrt(Simd4Double x)
@@ -3883,8 +4205,15 @@ invsqrt(Simd4Double x)
  
  /*! \brief Calculate 1/sqrt(x) for SIMD4 double, but in single accuracy.
   *
- *  \param x Argument that must be >0. This routine does not check arguments.
- *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
+ *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *           For the single precision implementation this is obviously always
+ *           true for positive values, but for double precision it adds an
+ *           extra restriction since the first lookup step might have to be
+ *           performed in single precision on some architectures. Note that the
+ *           responsibility for checking falls on you - this routine does not
+ *           check arguments.
+ *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
   */
  static inline Simd4Double gmx_simdcall
  invsqrtSingleAccuracy(Simd4Double x)
@@ -3911,8 +4240,15 @@ invsqrtSingleAccuracy(Simd4Double x)
  #if GMX_SIMD_HAVE_FLOAT
  /*! \brief Calculate 1/sqrt(x) for SIMD float, only targeting single accuracy.
   *
- *  \param x Argument that must be >0. This routine does not check arguments.
- *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
+ *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *           For the single precision implementation this is obviously always
+ *           true for positive values, but for double precision it adds an
+ *           extra restriction since the first lookup step might have to be
+ *           performed in single precision on some architectures. Note that the
+ *           responsibility for checking falls on you - this routine does not
+ *           check arguments.
+ *  \return  1/sqrt(x). Result is undefined if your argument was invalid.
   */
  static inline SimdFloat gmx_simdcall
  invsqrtSingleAccuracy(SimdFloat x)
@@ -3926,10 +4262,17 @@ invsqrtSingleAccuracy(SimdFloat x)
   *  Illegal values in the masked-out elements will not lead to
   *  floating-point exceptions.
   *
- *  \param x Argument that must be >0 for masked-in entries
+ *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *           For the single precision implementation this is obviously always
+ *           true for positive values, but for double precision it adds an
+ *           extra restriction since the first lookup step might have to be
+ *           performed in single precision on some architectures. Note that the
+ *           responsibility for checking falls on you - this routine does not
+ *           check arguments.
   *  \param m Mask
- *  \return 1/sqrt(x). Result is undefined if your argument was invalid or
- *          entry was not masked, and 0.0 for masked-out entries.
+ *  \return  1/sqrt(x). Result is undefined if your argument was invalid or
+ *           entry was not masked, and 0.0 for masked-out entries.
   */
  static inline SimdFloat
  maskzInvsqrtSingleAccuracy(SimdFloat x, SimdFBool m)
@@ -3939,13 +4282,22 @@ maskzInvsqrtSingleAccuracy(SimdFloat x, SimdFBool m)
  
  /*! \brief Calculate 1/sqrt(x) for two SIMD floats, only targeting single accuracy.
   *
- * \param x0  First set of arguments, x0 must be positive - no argument checking.
- * \param x1  Second set of arguments, x1 must be positive - no argument checking.
+ * \param x0  First set of arguments, x0 must be in single range (see below).
+ * \param x1  Second set of arguments, x1 must be in single range (see below).
   * \param[out] out0  Result 1/sqrt(x0)
   * \param[out] out1  Result 1/sqrt(x1)
   *
   *  In particular for double precision we can sometimes calculate square root
   *  pairs slightly faster by using single precision until the very last step.
+ *
+ * \note Both arguments must be larger than GMX_FLOAT_MIN and smaller than
+ *       GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *       For the single precision implementation this is obviously always
+ *       true for positive values, but for double precision it adds an
+ *       extra restriction since the first lookup step might have to be
+ *       performed in single precision on some architectures. Note that the
+ *       responsibility for checking falls on you - this routine does not
+ *       check arguments.
   */
  static inline void gmx_simdcall
  invsqrtPairSingleAccuracy(SimdFloat x0,    SimdFloat x1,
@@ -3956,8 +4308,15 @@ invsqrtPairSingleAccuracy(SimdFloat x0,    SimdFloat x1,
  
  /*! \brief Calculate 1/x for SIMD float, only targeting single accuracy.
   *
- *  \param x Argument that must be nonzero. This routine does not check arguments.
- *  \return 1/x. Result is undefined if your argument was invalid.
+ *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *           For the single precision implementation this is obviously always
+ *           true for positive values, but for double precision it adds an
+ *           extra restriction since the first lookup step might have to be
+ *           performed in single precision on some architectures. Note that the
+ *           responsibility for checking falls on you - this routine does not
+ *           check arguments.
+ *  \return  1/x. Result is undefined if your argument was invalid.
   */
  static inline SimdFloat gmx_simdcall
  invSingleAccuracy(SimdFloat x)
@@ -3968,9 +4327,16 @@ invSingleAccuracy(SimdFloat x)
  
  /*! \brief Calculate 1/x for masked SIMD floats, only targeting single accuracy.
   *
- *  \param x Argument that must be nonzero for non-masked entries.
+ *  \param x Argument with magnitude larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *           For the single precision implementation this is obviously always
+ *           true for positive values, but for double precision it adds an
+ *           extra restriction since the first lookup step might have to be
+ *           performed in single precision on some architectures. Note that the
+ *           responsibility for checking falls on you - this routine does not
+ *           check arguments.
   *  \param m Mask
- *  \return 1/x for elements where m is true, or 0.0 for masked-out entries.
+ *  \return  1/x for elements where m is true, or 0.0 for masked-out entries.
   */
  static inline SimdFloat
  maskzInvSingleAccuracy(SimdFloat x, SimdFBool m)
@@ -3978,16 +4344,15 @@ maskzInvSingleAccuracy(SimdFloat x, SimdFBool m)
      return maskzInv(x, m);
  }
  
-/*! \brief Calculate sqrt(x) for SIMD float, only targeting single accuracy.
+/*! \brief Calculate sqrt(x) for SIMD float, always targeting single accuracy.
   *
- *  \param x Argument that must be >=0.
- *  \return sqrt(x). If x=0, the result will correctly be set to 0.
- *          The result is undefined if the input value is negative.
+ * \copydetails sqrt(SimdFloat)
   */
+template <MathOptimization opt = MathOptimization::Safe>
  static inline SimdFloat gmx_simdcall
  sqrtSingleAccuracy(SimdFloat x)
  {
-    return sqrt(x);
+    return sqrt<opt>(x);
  }
  
  /*! \brief SIMD float log(x), only targeting single accuracy. This is the natural logarithm.
@@ -4003,26 +4368,27 @@ logSingleAccuracy(SimdFloat x)
  
  /*! \brief SIMD float 2^x, only targeting single accuracy.
   *
- * \param x Argument.
- * \result 2^x. Undefined if input argument caused overflow.
+ * \copydetails exp2(SimdFloat)
   */
+template <MathOptimization opt = MathOptimization::Safe>
  static inline SimdFloat gmx_simdcall
  exp2SingleAccuracy(SimdFloat x)
  {
-    return exp2(x);
+    return exp2<opt>(x);
  }
  
  /*! \brief SIMD float e^x, only targeting single accuracy.
   *
- * \param x Argument.
- * \result exp(x). Undefined if input argument caused overflow.
+ * \copydetails exp(SimdFloat)
   */
+template <MathOptimization opt = MathOptimization::Safe>
  static inline SimdFloat gmx_simdcall
  expSingleAccuracy(SimdFloat x)
  {
-    return exp(x);
+    return exp<opt>(x);
  }
  
+
  /*! \brief SIMD float erf(x), only targeting single accuracy.
   *
   * \param x The value to calculate erf(x) for.
@@ -4183,7 +4549,14 @@ pmePotentialCorrectionSingleAccuracy(SimdFloat z2)
  #if GMX_SIMD4_HAVE_FLOAT
  /*! \brief Calculate 1/sqrt(x) for SIMD4 float, only targeting single accuracy.
   *
- *  \param x Argument that must be >0. This routine does not check arguments.
+ *  \param x Argument that must be larger than GMX_FLOAT_MIN and smaller than
+ *           GMX_FLOAT_MAX, i.e. within the range of single precision.
+ *           For the single precision implementation this is obviously always
+ *           true for positive values, but for double precision it adds an
+ *           extra restriction since the first lookup step might have to be
+ *           performed in single precision on some architectures. Note that the
+ *           responsibility for checking falls on you - this routine does not
+ *           check arguments.
   *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
   */
  static inline Simd4Float gmx_simdcall