From 94e69be7bf0d3f6bbf1d8e247f1e566b6149e780 Mon Sep 17 00:00:00 2001
From: Berk Hess <hess@kth.se>
Date: Wed, 18 Sep 2013 17:39:00 +0200
Subject: [PATCH] optimized generic SIMD invsqrt

The function gmx_invsqrt_pr now uses one instruction fewer when
FMA is not supported in hardware.
Fixes #1333

Change-Id: Idace7296b88a8ecc0331e22d5bb3088753c478de
---
 include/gmx_simd_macros.h      |  2 ++
 include/gmx_simd_math_single.h | 13 +++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/include/gmx_simd_macros.h b/include/gmx_simd_macros.h
index 58929e0726..c997b39b51 100644
--- a/include/gmx_simd_macros.h
+++ b/include/gmx_simd_macros.h
@@ -247,6 +247,7 @@
 #define gmx_sub_pr        _mm_sub_ps
 #define gmx_mul_pr        _mm_mul_ps
 #ifdef GMX_X86_AVX_128_FMA
+#define GMX_SIMD_HAVE_FMA
 #define gmx_madd_pr(a, b, c)   _mm_macc_ps(a, b, c)
 #define gmx_nmsub_pr(a, b, c)  _mm_nmacc_ps(a, b, c)
 #else
@@ -318,6 +319,7 @@ static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_
 #define gmx_sub_pr        _mm_sub_pd
 #define gmx_mul_pr        _mm_mul_pd
 #ifdef GMX_X86_AVX_128_FMA
+#define GMX_SIMD_HAVE_FMA
 #define gmx_madd_pr(a, b, c)   _mm_macc_pd(a, b, c)
 #define gmx_nmsub_pr(a, b, c)  _mm_nmacc_pd(a, b, c)
 #else
diff --git a/include/gmx_simd_math_single.h b/include/gmx_simd_math_single.h
index 28feeaa075..9309b8d438 100644
--- a/include/gmx_simd_math_single.h
+++ b/include/gmx_simd_math_single.h
@@ -40,12 +40,25 @@
 static gmx_inline gmx_mm_pr
 gmx_invsqrt_pr(gmx_mm_pr x)
 {
+    /* Both branches compute 0.5*lu*(3 - x*lu*lu) (algebraically). The FMA
+     * form adds a FLOP, but needs fewer instructions in total when FMA is
+     * available in hardware. Not a usual optimization; invsqrt is used often.
+     */
+#ifdef GMX_SIMD_HAVE_FMA
     const gmx_mm_pr half  = gmx_set1_pr(0.5);
     const gmx_mm_pr one   = gmx_set1_pr(1.0);
 
     gmx_mm_pr       lu = gmx_rsqrt_pr(x);
 
     return gmx_madd_pr(gmx_nmsub_pr(x, gmx_mul_pr(lu, lu), one), gmx_mul_pr(lu, half), lu);
+#else /* no GMX_SIMD_HAVE_FMA: plain multiply/subtract form */
+    const gmx_mm_pr half  = gmx_set1_pr(0.5);
+    const gmx_mm_pr three = gmx_set1_pr(3.0);
+
+    gmx_mm_pr       lu = gmx_rsqrt_pr(x);
+    
+    return gmx_mul_pr(half, gmx_mul_pr(gmx_sub_pr(three, gmx_mul_pr(gmx_mul_pr(lu, lu), x)), lu));
+#endif /* GMX_SIMD_HAVE_FMA */
 }
 
 
-- 
2.22.0