/* Map the generic SIMD macro names onto SSE intrinsics.
 * First branch: single precision (_ps); the #else branch below maps the
 * same names to double precision (_pd). The enclosing precision #ifdef
 * and the matching #endif lie outside this chunk.
 */
#define gmx_sub_pr _mm_sub_ps
#define gmx_mul_pr _mm_mul_ps
#ifdef GMX_X86_AVX_128_FMA
#define GMX_SIMD_HAVE_FMA
#define gmx_madd_pr(a, b, c) _mm_macc_ps(a, b, c)
#define gmx_nmsub_pr(a, b, c) _mm_nmacc_ps(a, b, c)
#else
#define gmx_sub_pr _mm_sub_pd
#define gmx_mul_pr _mm_mul_pd
#ifdef GMX_X86_AVX_128_FMA
#define GMX_SIMD_HAVE_FMA
#define gmx_madd_pr(a, b, c) _mm_macc_pd(a, b, c)
#define gmx_nmsub_pr(a, b, c) _mm_nmacc_pd(a, b, c)
#else
/* Approximate 1/sqrt(x) elementwise for a SIMD register: take the
 * hardware reciprocal-square-root estimate and refine it with one
 * Newton-Raphson iteration for full single-precision accuracy.
 */
static gmx_inline gmx_mm_pr
gmx_invsqrt_pr(gmx_mm_pr x)
{
    /* This is one of the few cases where FMA adds a FLOP, but ends up with
     * less instructions in total when FMA is available in hardware.
     * Usually we would not optimize this far, but invsqrt is used often.
     */
#ifdef GMX_SIMD_HAVE_FMA
    const gmx_mm_pr half = gmx_set1_pr(0.5);
    const gmx_mm_pr one  = gmx_set1_pr(1.0);
    gmx_mm_pr       lu   = gmx_rsqrt_pr(x);

    /* NR step as fused ops: lu + 0.5*lu*(1 - x*lu*lu) */
    return gmx_madd_pr(gmx_nmsub_pr(x, gmx_mul_pr(lu, lu), one), gmx_mul_pr(lu, half), lu);
#else
    const gmx_mm_pr half  = gmx_set1_pr(0.5);
    const gmx_mm_pr three = gmx_set1_pr(3.0);

    gmx_mm_pr lu = gmx_rsqrt_pr(x);

    /* Classic NR step without FMA: 0.5*lu*(3 - x*lu*lu) */
    return gmx_mul_pr(half, gmx_mul_pr(gmx_sub_pr(three, gmx_mul_pr(gmx_mul_pr(lu, lu), x)), lu));
#endif
}