From 94e69be7bf0d3f6bbf1d8e247f1e566b6149e780 Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Wed, 18 Sep 2013 17:39:00 +0200 Subject: [PATCH] optimized generic SIMD invsqrt The function gmx_invsqrt_pr now uses one instruction less when FMA is not supported in hardware. Fixes #1333 Change-Id: Idace7296b88a8ecc0331e22d5bb3088753c478de --- include/gmx_simd_macros.h | 2 ++ include/gmx_simd_math_single.h | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/include/gmx_simd_macros.h b/include/gmx_simd_macros.h index 58929e0726..c997b39b51 100644 --- a/include/gmx_simd_macros.h +++ b/include/gmx_simd_macros.h @@ -247,6 +247,7 @@ #define gmx_sub_pr _mm_sub_ps #define gmx_mul_pr _mm_mul_ps #ifdef GMX_X86_AVX_128_FMA +#define GMX_SIMD_HAVE_FMA #define gmx_madd_pr(a, b, c) _mm_macc_ps(a, b, c) #define gmx_nmsub_pr(a, b, c) _mm_nmacc_ps(a, b, c) #else @@ -318,6 +319,7 @@ static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_ #define gmx_sub_pr _mm_sub_pd #define gmx_mul_pr _mm_mul_pd #ifdef GMX_X86_AVX_128_FMA +#define GMX_SIMD_HAVE_FMA #define gmx_madd_pr(a, b, c) _mm_macc_pd(a, b, c) #define gmx_nmsub_pr(a, b, c) _mm_nmacc_pd(a, b, c) #else diff --git a/include/gmx_simd_math_single.h b/include/gmx_simd_math_single.h index 28feeaa075..9309b8d438 100644 --- a/include/gmx_simd_math_single.h +++ b/include/gmx_simd_math_single.h @@ -40,12 +40,25 @@ static gmx_inline gmx_mm_pr gmx_invsqrt_pr(gmx_mm_pr x) { + /* This is one of the few cases where FMA adds a FLOP, but ends up with + * fewer instructions in total when FMA is available in hardware. + * Usually we would not optimize this far, but invsqrt is used often. 
+ */ +#ifdef GMX_SIMD_HAVE_FMA const gmx_mm_pr half = gmx_set1_pr(0.5); const gmx_mm_pr one = gmx_set1_pr(1.0); gmx_mm_pr lu = gmx_rsqrt_pr(x); return gmx_madd_pr(gmx_nmsub_pr(x, gmx_mul_pr(lu, lu), one), gmx_mul_pr(lu, half), lu); +#else + const gmx_mm_pr half = gmx_set1_pr(0.5); + const gmx_mm_pr three = gmx_set1_pr(3.0); + + gmx_mm_pr lu = gmx_rsqrt_pr(x); + + return gmx_mul_pr(half, gmx_mul_pr(gmx_sub_pr(three, gmx_mul_pr(gmx_mul_pr(lu, lu), x)), lu)); +#endif } -- 2.22.0