#ifndef GMX_SIMD_IMPL_INTEL_MIC_H
#define GMX_SIMD_IMPL_INTEL_MIC_H
+#include "config.h"
+
#include <math.h>
+
#include <immintrin.h>
/* Intel Xeon Phi, or
#define gmx_simd_xor_f(a, b) _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
#define gmx_simd_rsqrt_f _mm512_rsqrt23_ps
#define gmx_simd_rcp_f _mm512_rcp23_ps
-#define gmx_simd_fabs_f(x) gmx_simd_andnot_f(_mm512_set1_ps(-0.0), x)
+#define gmx_simd_fabs_f(x) gmx_simd_andnot_f(_mm512_set1_ps(GMX_FLOAT_NEGZERO), x)
#define gmx_simd_fneg_f(x) _mm512_addn_ps(x, _mm512_setzero_ps())
#define gmx_simd_max_f _mm512_gmax_ps
#define gmx_simd_min_f _mm512_gmin_ps
#define gmx_simd_xor_d(a, b) _mm512_castsi512_pd(_mm512_xor_epi32(_mm512_castpd_si512(a), _mm512_castpd_si512(b)))
#define gmx_simd_rsqrt_d(x) _mm512_cvtpslo_pd(_mm512_rsqrt23_ps(_mm512_cvtpd_pslo(x)))
#define gmx_simd_rcp_d(x) _mm512_cvtpslo_pd(_mm512_rcp23_ps(_mm512_cvtpd_pslo(x)))
-#define gmx_simd_fabs_d(x) gmx_simd_andnot_d(_mm512_set1_pd(-0.0), x)
+#define gmx_simd_fabs_d(x) gmx_simd_andnot_d(_mm512_set1_pd(GMX_DOUBLE_NEGZERO), x)
#define gmx_simd_fneg_d(x) _mm512_addn_pd(x, _mm512_setzero_pd())
#define gmx_simd_max_d _mm512_gmax_pd
#define gmx_simd_min_d _mm512_gmin_pd
#define gmx_simd4_or_f(a, b) _mm512_castsi512_ps(_mm512_mask_or_epi32(_mm512_undefined_epi32(), gmx_simd4_mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
#define gmx_simd4_xor_f(a, b) _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_undefined_epi32(), gmx_simd4_mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
#define gmx_simd4_rsqrt_f(a) _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(), gmx_simd4_mask, a)
-#define gmx_simd4_fabs_f(x) gmx_simd4_andnot_f(_mm512_set1_ps(-0.0), x)
+#define gmx_simd4_fabs_f(x) gmx_simd4_andnot_f(_mm512_set1_ps(GMX_FLOAT_NEGZERO), x)
#define gmx_simd4_fneg_f(x) _mm512_mask_addn_ps(_mm512_undefined_ps(), gmx_simd4_mask, x, _mm512_setzero_ps())
#define gmx_simd4_max_f(a, b) _mm512_mask_gmax_ps(_mm512_undefined_ps(), gmx_simd4_mask, a, b)
#define gmx_simd4_min_f(a, b) _mm512_mask_gmin_ps(_mm512_undefined_ps(), gmx_simd4_mask, a, b)
#define gmx_simd4_or_d(a, b) _mm512_castsi512_pd(_mm512_mask_or_epi32(_mm512_undefined_epi32(), mask_loh, _mm512_castpd_si512(a), _mm512_castpd_si512(b)))
#define gmx_simd4_xor_d(a, b) _mm512_castsi512_pd(_mm512_mask_xor_epi32(_mm512_undefined_epi32(), mask_loh, _mm512_castpd_si512(a), _mm512_castpd_si512(b)))
#define gmx_simd4_rsqrt_d(a) _mm512_mask_cvtpslo_pd(_mm512_undefined_pd(), gmx_simd4_mask, _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(), gmx_simd4_mask, _mm512_mask_cvtpd_pslo(_mm512_undefined_ps(), gmx_simd4_mask, x)))
-#define gmx_simd4_fabs_d(x) gmx_simd4_andnot_d(_mm512_set1_pd(-0.0), x)
+#define gmx_simd4_fabs_d(x) gmx_simd4_andnot_d(_mm512_set1_pd(GMX_DOUBLE_NEGZERO), x)
#define gmx_simd4_fneg_d(x) _mm512_mask_addn_pd(_mm512_undefined_pd(), gmx_simd4_mask, x, _mm512_setzero_pd())
#define gmx_simd4_max_d(a, b) _mm512_mask_gmax_pd(_mm512_undefined_pd(), gmx_simd4_mask, a, b)
#define gmx_simd4_min_d(a, b) _mm512_mask_gmin_pd(_mm512_undefined_pd(), gmx_simd4_mask, a, b)
static gmx_inline __m512 gmx_simdcall
gmx_simd_exp_f_mic(__m512 x)
{
- /* only 59ulp accuracy so we need to do extra an iteration
- Using: http://yacas.sourceforge.net/Algochapter5.html 5.4 Method 3 */
- __m512 r = gmx_simd_exp2_f(_mm512_mul_ps(x, _mm512_set1_ps(1.44269504088896341)));
- __mmask16 m = _mm512_cmpneq_ps_mask(r, _mm512_setzero_ps());
- __m512 t = _mm512_mask_fnmadd_ps(_mm512_mask_log2ae23_ps(_mm512_undefined_ps(), m, r), m, _mm512_set1_ps(0.693147180559945286226764), x);
- return _mm512_mask_fmadd_ps(r, m, t, r);
+ const gmx_simd_float_t argscale = gmx_simd_set1_f(1.44269504088896341f);
+ const gmx_simd_float_t invargscale = gmx_simd_set1_f(-0.69314718055994528623f);
+ __m512 xscaled = _mm512_mul_ps(x, argscale);
+ __m512 r = gmx_simd_exp2_f_mic(xscaled);
+
+ /* gmx_simd_exp2_f_mic() provides 23 bits of accuracy, but we ruin some of that
+ * with the argument scaling due to single-precision rounding, where the
+ * rounding error is amplified exponentially. To correct this, we find the
+ * difference between the scaled argument and the true one (extended precision
+ * arithmetics does not appear to be necessary to fulfill our accuracy requirements)
+ * and then multiply by the exponent of this correction since exp(a+b)=exp(a)*exp(b).
+ * Note that this only adds two instructions (and maybe some constant loads).
+ */
+ x = gmx_simd_fmadd_f(invargscale, xscaled, x);
+ /* x will now be a _very_ small number, so approximate exp(x)=1+x.
+ * We should thus apply the correction as r'=r*(1+x)=r+r*x
+ */
+ r = gmx_simd_fmadd_f(r, x, r);
+ return r;
}
static gmx_inline __m512 gmx_simdcall