}
}
+/*! \brief Cube root for SIMD floats
+ *
+ * \param x Argument to calculate cube root of. Can be negative or zero,
+ * but NaN or Inf values are not supported. Denormal values will
+ * be treated as 0.0.
+ * \return Cube root of x.
+ */
+static inline SimdFloat gmx_simdcall
+cbrt(SimdFloat x)
+{
+ const SimdFloat signBit(GMX_FLOAT_NEGZERO);
+ const SimdFloat minFloat(std::numeric_limits<float>::min());
+ // Bias is 128-1 = 127, which is not divisible by 3. Since the largest-magnitude
+ // negative exponent from frexp() is -126, we can subtract one more unit to get 126
+ // as offset, which is divisible by 3.
+ const std::int32_t offset(std::numeric_limits<float>::max_exponent - 2);
+ const SimdFloat c2(-0.191502161678719066F);
+ const SimdFloat c1(0.697570460207922770F);
+ const SimdFloat c0(0.492659620528969547F);
+ const SimdFloat one(1.0F);
+ const SimdFloat two(2.0F);
+ const SimdFloat three(3.0F);
+ const SimdFloat oneThird(1.0F/3.0F);
+ const SimdFloat cbrt2(1.2599210498948731648F);
+ const SimdFloat sqrCbrt2(1.5874010519681994748F);
+
+ // To calculate cbrt(x) we first take the absolute value of x but save the sign,
+ // since cbrt(-x) = -cbrt(x). Then we only need to consider positive values for
+ // the main step.
+ // A number x is represented in IEEE754 as fraction*2^e. We rewrite this as
+ // x=fraction*2^(3*n)*2^m, where e=3*n+m, and m is a remainder.
+ // The cube root can the be evaluated by calculating the cube root of the fraction
+ // limited to the mantissa range, multiplied by 2^mod (which is either 1, +/-2^(1/3) or +/-2^(2/3),
+ // and then we load this into a new IEEE754 fp number with the exponent 2^n, where
+ // n is the integer part of the original exponent divided by 3.
+
+ SimdFloat xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
+ SimdFloat xAbs = andNot(signBit, x); // select everthing but the sign bit => abs(x)
+ SimdFBool xIsNonZero = (minFloat <= xAbs); // treat denormals as 0
+
+ SimdFInt32 exponent;
+ SimdFloat y = frexp(xAbs, &exponent);
+ // For the mantissa (y) we will use a limited-range approximation of cbrt(y),
+ // by first using a polynomial and then evaluating
+ // Transform y to z = c2*y^2 + c1*y + c0, then w = z^3, and finally
+ // evaluate the quotient q = z * (w + 2 * y) / (2 * w + y).
+ SimdFloat z = fma(fma(y, c2, c1), y, c0);
+ SimdFloat w = z*z*z;
+ SimdFloat nom = z * fma(two, y, w);
+ SimdFloat invDenom = inv(fma(two, w, y));
+
+ // Handle the exponent. In principle there are beautiful ways to do this with custom 16-bit
+ // division converted to multiplication... but we can't do that since our SIMD layer cannot
+ // assume the presence of integer shift operations!
+ // However, when I first worked with the integer algorithm I still came up with a neat
+ // optimization, so I'll describe the full algorithm here in case we ever want to use it
+ // in the future:
+ //
+ // Our dividend is signed, which is a complication, but let's consider the unsigned case
+ // first: Division by 3 corresponds to multiplication by 1010101... Since we also know
+ // our dividend is less than 16 bits (exponent range) we can accomplish this by
+ // multiplying with 21845 (which is almost 2^16/3 - 21845.333 would be exact) and then
+ // right-shifting by 16 bits to divide out the 2^16 part.
+ // If we add 1 to the dividend to handle the extra 0.333, the integer result will be correct.
+ // To handle the signed exponent one alternative would be to take absolute values, saving
+ // signs, etc - but that gets a bit complicated with 2-complement integers.
+ // Instead, we remember that we don't really want the exact division per se - what we're
+ // really after is only rewriting e = 3*n+m. That will actually be *easier* to handle if
+ // we require that m must be positive (fewer cases to handle) instead of having n as the
+ // strict e/3.
+ // To handle this we start by adding 127 to the exponent. This value corresponds to the
+ // exponent bias, minus 1 because frexp() has a different standard for the value it returns,
+ // but then we add 1 back to handle the extra 0.333 in 21845. So, we have offsetExp = e+127
+ // and then multiply by 21845 to get a division result offsetExpDiv3.
+ // A (signed) value for n is then recovered by subtracting 42 (bias-1)/3 from k.
+ // To calculate a strict remainder we should evaluate offsetExp - 3*offsetExpDiv3 - 1, where
+ // the extra 1 corrects for the value we added to the exponent to get correct division.
+ // This remainder would have the value 0,1, or 2, but since we only use it to select
+ // other numbers we can skip the last step and just handle the cases as 1,2 or 3 instead.
+ //
+ // OK; end of long detour. Here's how we actually do it in our implementation by using
+ // floating-point for the exponent instead to avoid needing integer shifts:
+ //
+ // 1) Convert the exponent (obtained from frexp) to a float
+ // 2) Calculate offsetExp = exp + offset. Note that we should not add the extra 1 here since we
+ // do floating-point division instead of our integer hack, so it's the exponent bias-1, or
+ // the largest exponent minus 2.
+ // 3) Divide the float by 3 by multiplying with 1/3
+ // 4) Truncate it to an integer to get the division result. This is potentially dangerous in
+ // combination with floating-point, because many integers cannot be represented exactly in
+ // floating point, and if we are just epsilon below the result might be truncated to a lower
+ // integer. I have not observed this on x86, but to have a safety margin we can add a small
+ // fraction - since we already know the fraction part should be either 0, 0.333..., or 0.666...
+ // We can even save this extra floating-point addition by adding a small fraction (0.1) when
+ // we introduce the exponent offset - that will correspond to a safety margin of 0.1/3, which is plenty.
+ // 5) Get the remainder part by subtracting the truncated floating-point part.
+ // Here too we will have a plain division, so the remainder is a strict modulus
+ // and will have the values 0, 1 or 2.
+ //
+ // Before worrying about the few wasted cycles due to longer fp latency, this has the
+ // additional advantage that we don't use a single integer operation, so the algorithm
+ // will work just A-OK on all SIMD implementations, which avoids diverging code paths.
+
+ // The 0.1 here is the safety margin due to truncation described in item 4 in the comments above.
+ SimdFloat offsetExp = cvtI2R(exponent) + SimdFloat(static_cast<float>(offset) + 0.1);
+
+ SimdFloat offsetExpDiv3 = trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
+
+ // Compile-time sanity check that our math is correct
+ static_assert( offset % 3 == 0, "Internal math algorithm/implementation inconsistency");
+ SimdFInt32 expDiv3 = cvtR2I(offsetExpDiv3 - SimdFloat(static_cast<float>(offset/3)));
+
+ SimdFloat remainder = offsetExp - offsetExpDiv3 * three;
+
+ // If remainder is 0 we should just have the factor 1.0,
+ // so first pick 1.0 if it is below 0.5, and 2^(1/3) if it's above 0.5 (i.e., 1 or 2)
+ SimdFloat factor = blend(one, cbrt2, SimdFloat(0.5) < remainder);
+ // Second, we overwrite with 2^(2/3) if rem>1.5 (i.e., 2)
+ factor = blend(factor, sqrCbrt2, SimdFloat(1.5) < remainder);
+
+ // Assemble the non-signed fraction, and add the sign back by xor
+ SimdFloat fraction = (nom * invDenom * factor) ^ xSignBit;
+ // Load to IEEE754 number, and set result to 0.0 if x was 0.0 or denormal
+ SimdFloat result = selectByMask(ldexp(fraction, expDiv3), xIsNonZero);
+
+ return result;
+}
+
+/*! \brief Inverse cube root for SIMD floats
+ *
+ * \param x Argument to calculate cube root of. Can be positive or
+ * negative, but the magnitude cannot be lower than
+ * the smallest normal number.
+ * \return Cube root of x. Undefined for values that don't
+ * fulfill the restriction of abs(x) > minFloat.
+ */
+static inline SimdFloat gmx_simdcall
+invcbrt(SimdFloat x)
+{
+ const SimdFloat signBit(GMX_FLOAT_NEGZERO);
+ const SimdFloat minFloat(std::numeric_limits<float>::min());
+ // Bias is 128-1 = 127, which is not divisible by 3. Since the largest-magnitude
+ // negative exponent from frexp() is -126, we can subtract one more unit to get 126
+ // as offset, which is divisible by 3.
+ const std::int32_t offset(std::numeric_limits<float>::max_exponent - 2);
+ const SimdFloat c2(-0.191502161678719066F);
+ const SimdFloat c1(0.697570460207922770F);
+ const SimdFloat c0(0.492659620528969547F);
+ const SimdFloat one(1.0F);
+ const SimdFloat two(2.0F);
+ const SimdFloat three(3.0F);
+ const SimdFloat oneThird(1.0F/3.0F);
+ const SimdFloat invCbrt2(1.0F/1.2599210498948731648F);
+ const SimdFloat invSqrCbrt2(1.0F/1.5874010519681994748F);
+
+ // We use pretty much exactly the same implementation as for cbrt(x),
+ // but to compute the inverse we swap the nominator/denominator
+ // in the quotient, and also swap the sign of the exponent parts.
+
+ SimdFloat xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
+ SimdFloat xAbs = andNot(signBit, x); // select everthing but the sign bit => abs(x)
+
+ SimdFInt32 exponent;
+ SimdFloat y = frexp(xAbs, &exponent);
+ // For the mantissa (y) we will use a limited-range approximation of cbrt(y),
+ // by first using a polynomial and then evaluating
+ // Transform y to z = c2*y^2 + c1*y + c0, then w = z^3, and finally
+ // evaluate the quotient q = z * (w + 2 * y) / (2 * w + y).
+ SimdFloat z = fma(fma(y, c2, c1), y, c0);
+ SimdFloat w = z*z*z;
+ SimdFloat nom = fma(two, w, y);
+ SimdFloat invDenom = inv(z * fma(two, y, w));
+
+ // The 0.1 here is the safety margin due to truncation described in item 4 in the comments above.
+ SimdFloat offsetExp = cvtI2R(exponent) + SimdFloat(static_cast<float>(offset) + 0.1);
+ SimdFloat offsetExpDiv3 = trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
+
+ // Compile-time sanity check that our math is correct
+ static_assert( offset % 3 == 0, "Internal math algorithm/implementation inconsistency");
+ // We should swap the sign here, so we change order of the terms in the subtraction
+ SimdFInt32 expDiv3 = cvtR2I(SimdFloat(static_cast<float>(offset/3)) - offsetExpDiv3);
+
+ // Swap sign here too, so remainder is either 0, -1 or -2
+ SimdFloat remainder = offsetExpDiv3 * three - offsetExp;
+
+ // If remainder is 0 we should just have the factor 1.0,
+ // so first pick 1.0 if it is above -0.5, and 2^(-1/3) if it's below -0.5 (i.e., -1 or -2)
+ SimdFloat factor = blend(one, invCbrt2, remainder < SimdFloat(-0.5) );
+ // Second, we overwrite with 2^(-2/3) if rem<-1.5 (i.e., -2)
+ factor = blend(factor, invSqrCbrt2, remainder < SimdFloat(-1.5));
+
+ // Assemble the non-signed fraction, and add the sign back by xor
+ SimdFloat fraction = (nom * invDenom * factor) ^ xSignBit;
+ // Load to IEEE754 number, and set result to 0.0 if x was 0.0 or denormal
+ SimdFloat result = ldexp(fraction, expDiv3);
+
+ return result;
+}
+
/*! \brief SIMD float log2(x). This is the base-2 logarithm.
*
* \param x Argument, should be >0.
}
}
+/*! \brief Cube root for SIMD doubles
+ *
+ * \param x Argument to calculate cube root of. Can be negative or zero,
+ * but NaN or Inf values are not supported. Denormal values will
+ * be treated as 0.0.
+ * \return Cube root of x.
+ */
+static inline SimdDouble gmx_simdcall
+cbrt(SimdDouble x)
+{
+ const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
+ const SimdDouble minDouble(std::numeric_limits<double>::min());
+ // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
+ const std::int32_t offset(std::numeric_limits<double>::max_exponent - 1);
+ const SimdDouble c6(-0.145263899385486377);
+ const SimdDouble c5(0.784932344976639262);
+ const SimdDouble c4(-1.83469277483613086);
+ const SimdDouble c3(2.44693122563534430);
+ const SimdDouble c2(-2.11499494167371287);
+ const SimdDouble c1(1.50819193781584896);
+ const SimdDouble c0(0.354895765043919860);
+ const SimdDouble one(1.0);
+ const SimdDouble two(2.0);
+ const SimdDouble three(3.0);
+ const SimdDouble oneThird(1.0/3.0);
+ const SimdDouble cbrt2(1.2599210498948731648);
+ const SimdDouble sqrCbrt2(1.5874010519681994748);
+
+ // See the single precision routines for documentation of the algorithm
+
+ SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
+ SimdDouble xAbs = andNot(signBit, x); // select everthing but the sign bit => abs(x)
+ SimdDBool xIsNonZero = (minDouble <= xAbs); // treat denormals as 0
+
+ SimdDInt32 exponent;
+ SimdDouble y = frexp(xAbs, &exponent);
+ SimdDouble z = fma(y, c6, c5);
+ z = fma(z, y, c4);
+ z = fma(z, y, c3);
+ z = fma(z, y, c2);
+ z = fma(z, y, c1);
+ z = fma(z, y, c0);
+ SimdDouble w = z*z*z;
+ SimdDouble nom = z * fma(two, y, w);
+ SimdDouble invDenom = inv(fma(two, w, y));
+
+ SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(offset) + 0.1);
+ SimdDouble offsetExpDiv3 = trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
+ static_assert( offset % 3 == 0, "Internal math algorithm/implementation inconsistency");
+ SimdDInt32 expDiv3 = cvtR2I(offsetExpDiv3 - SimdDouble(static_cast<double>(offset/3)));
+ SimdDouble remainder = offsetExp - offsetExpDiv3 * three;
+ SimdDouble factor = blend(one, cbrt2, SimdDouble(0.5) < remainder);
+ factor = blend(factor, sqrCbrt2, SimdDouble(1.5) < remainder);
+ SimdDouble fraction = (nom * invDenom * factor) ^ xSignBit;
+ SimdDouble result = selectByMask(ldexp(fraction, expDiv3), xIsNonZero);
+ return result;
+}
+
+/*! \brief Inverse cube root for SIMD doubles.
+ *
+ * \param x Argument to calculate cube root of. Can be positive or
+ * negative, but the magnitude cannot be lower than
+ * the smallest normal number.
+ * \return Cube root of x. Undefined for values that don't
+ * fulfill the restriction of abs(x) > minDouble.
+ */
+static inline SimdDouble gmx_simdcall
+invcbrt(SimdDouble x)
+{
+ const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
+ // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
+ const std::int32_t offset(std::numeric_limits<double>::max_exponent - 1);
+ const SimdDouble c6(-0.145263899385486377);
+ const SimdDouble c5(0.784932344976639262);
+ const SimdDouble c4(-1.83469277483613086);
+ const SimdDouble c3(2.44693122563534430);
+ const SimdDouble c2(-2.11499494167371287);
+ const SimdDouble c1(1.50819193781584896);
+ const SimdDouble c0(0.354895765043919860);
+ const SimdDouble one(1.0);
+ const SimdDouble two(2.0);
+ const SimdDouble three(3.0);
+ const SimdDouble oneThird(1.0/3.0);
+ const SimdDouble invCbrt2(1.0/1.2599210498948731648);
+ const SimdDouble invSqrCbrt2(1.0F/1.5874010519681994748);
+
+ // See the single precision routines for documentation of the algorithm
+
+ SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
+ SimdDouble xAbs = andNot(signBit, x); // select everthing but the sign bit => abs(x)
+
+ SimdDInt32 exponent;
+ SimdDouble y = frexp(xAbs, &exponent);
+ SimdDouble z = fma(y, c6, c5);
+ z = fma(z, y, c4);
+ z = fma(z, y, c3);
+ z = fma(z, y, c2);
+ z = fma(z, y, c1);
+ z = fma(z, y, c0);
+ SimdDouble w = z*z*z;
+ SimdDouble nom = fma(two, w, y);
+ SimdDouble invDenom = inv(z * fma(two, y, w));
+ SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(offset) + 0.1);
+ SimdDouble offsetExpDiv3 = trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
+ static_assert( offset % 3 == 0, "Internal math algorithm/implementation inconsistency");
+ SimdDInt32 expDiv3 = cvtR2I(SimdDouble(static_cast<double>(offset/3)) - offsetExpDiv3);
+ SimdDouble remainder = offsetExpDiv3 * three - offsetExp;
+ SimdDouble factor = blend(one, invCbrt2, remainder < SimdDouble(-0.5) );
+ factor = blend(factor, invSqrCbrt2, remainder < SimdDouble(-1.5));
+ SimdDouble fraction = (nom * invDenom * factor) ^ xSignBit;
+ SimdDouble result = ldexp(fraction, expDiv3);
+ return result;
+}
+
/*! \brief SIMD double log2(x). This is the base-2 logarithm.
*
* \param x Argument, should be >0.
}
}
+/*! \brief Cube root for SIMD doubles, single accuracy.
+ *
+ * \param x Argument to calculate cube root of. Can be negative or zero,
+ * but NaN or Inf values are not supported. Denormal values will
+ * be treated as 0.0.
+ * \return Cube root of x.
+ */
+static inline SimdDouble gmx_simdcall
+cbrtSingleAccuracy(SimdDouble x)
+{
+ const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
+ const SimdDouble minDouble(std::numeric_limits<double>::min());
+ // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
+ const std::int32_t offset(std::numeric_limits<double>::max_exponent - 1);
+ const SimdDouble c2(-0.191502161678719066);
+ const SimdDouble c1(0.697570460207922770);
+ const SimdDouble c0(0.492659620528969547);
+ const SimdDouble one(1.0);
+ const SimdDouble two(2.0);
+ const SimdDouble three(3.0);
+ const SimdDouble oneThird(1.0/3.0);
+ const SimdDouble cbrt2(1.2599210498948731648);
+ const SimdDouble sqrCbrt2(1.5874010519681994748);
+
+ // See the single precision routines for documentation of the algorithm
+
+ SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
+ SimdDouble xAbs = andNot(signBit, x); // select everthing but the sign bit => abs(x)
+ SimdDBool xIsNonZero = (minDouble <= xAbs); // treat denormals as 0
+
+ SimdDInt32 exponent;
+ SimdDouble y = frexp(xAbs, &exponent);
+ SimdDouble z = fma(fma(y, c2, c1), y, c0);
+ SimdDouble w = z*z*z;
+ SimdDouble nom = z * fma(two, y, w);
+ SimdDouble invDenom = inv(fma(two, w, y));
+
+ SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(offset) + 0.1);
+ SimdDouble offsetExpDiv3 = trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
+ static_assert( offset % 3 == 0, "Internal math algorithm/implementation inconsistency");
+ SimdDInt32 expDiv3 = cvtR2I(offsetExpDiv3 - SimdDouble(static_cast<double>(offset/3)));
+ SimdDouble remainder = offsetExp - offsetExpDiv3 * three;
+ SimdDouble factor = blend(one, cbrt2, SimdDouble(0.5) < remainder);
+ factor = blend(factor, sqrCbrt2, SimdDouble(1.5) < remainder);
+ SimdDouble fraction = (nom * invDenom * factor) ^ xSignBit;
+ SimdDouble result = selectByMask(ldexp(fraction, expDiv3), xIsNonZero);
+ return result;
+}
+
+/*! \brief Inverse cube root for SIMD doubles, single accuracy.
+ *
+ * \param x Argument to calculate cube root of. Can be positive or
+ * negative, but the magnitude cannot be lower than
+ * the smallest normal number.
+ * \return Cube root of x. Undefined for values that don't
+ * fulfill the restriction of abs(x) > minDouble.
+ */
+static inline SimdDouble gmx_simdcall
+invcbrtSingleAccuracy(SimdDouble x)
+{
+ const SimdDouble signBit(GMX_DOUBLE_NEGZERO);
+ // Bias is 1024-1 = 1023, which is divisible by 3, so no need to change it more.
+ const std::int32_t offset(std::numeric_limits<double>::max_exponent - 1);
+ const SimdDouble c2(-0.191502161678719066);
+ const SimdDouble c1(0.697570460207922770);
+ const SimdDouble c0(0.492659620528969547);
+ const SimdDouble one(1.0);
+ const SimdDouble two(2.0);
+ const SimdDouble three(3.0);
+ const SimdDouble oneThird(1.0/3.0);
+ const SimdDouble invCbrt2(1.0/1.2599210498948731648);
+ const SimdDouble invSqrCbrt2(1.0F/1.5874010519681994748);
+
+ // See the single precision routines for documentation of the algorithm
+
+ SimdDouble xSignBit = x & signBit; // create bit mask where the sign bit is 1 for x elements < 0
+ SimdDouble xAbs = andNot(signBit, x); // select everthing but the sign bit => abs(x)
+
+ SimdDInt32 exponent;
+ SimdDouble y = frexp(xAbs, &exponent);
+ SimdDouble z = fma(fma(y, c2, c1), y, c0);
+ SimdDouble w = z*z*z;
+ SimdDouble nom = fma(two, w, y);
+ SimdDouble invDenom = inv(z * fma(two, y, w));
+ SimdDouble offsetExp = cvtI2R(exponent) + SimdDouble(static_cast<double>(offset) + 0.1);
+ SimdDouble offsetExpDiv3 = trunc(offsetExp * oneThird); // important to truncate here to mimic integer division
+ static_assert( offset % 3 == 0, "Internal math algorithm/implementation inconsistency");
+ SimdDInt32 expDiv3 = cvtR2I(SimdDouble(static_cast<double>(offset/3)) - offsetExpDiv3);
+ SimdDouble remainder = offsetExpDiv3 * three - offsetExp;
+ SimdDouble factor = blend(one, invCbrt2, remainder < SimdDouble(-0.5) );
+ factor = blend(factor, invSqrCbrt2, remainder < SimdDouble(-1.5));
+ SimdDouble fraction = (nom * invDenom * factor) ^ xSignBit;
+ SimdDouble result = ldexp(fraction, expDiv3);
+ return result;
+}
+
/*! \brief SIMD log2(x). Double precision SIMD data, single accuracy.
*
* \param x Argument, should be >0.
return sqrt<opt>(x);
}
+/*! \brief Calculate cbrt(x) for SIMD float, always targeting single accuracy.
+ *
+ * \copydetails cbrt(SimdFloat)
+ */
+static inline SimdFloat gmx_simdcall
+cbrtSingleAccuracy(SimdFloat x)
+{
+ return cbrt(x);
+}
+
+/*! \brief Calculate 1/cbrt(x) for SIMD float, always targeting single accuracy.
+ *
+ * \copydetails cbrt(SimdFloat)
+ */
+static inline SimdFloat gmx_simdcall
+invcbrtSingleAccuracy(SimdFloat x)
+{
+ return invcbrt(x);
+}
+
/*! \brief SIMD float log2(x), only targeting single accuracy. This is the base-2 logarithm.
*
* \param x Argument, should be >0.