From: Berk Hess Date: Wed, 18 Sep 2013 15:39:00 +0000 (+0200) Subject: optimized generic SIMD invsqrt X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=94e69be7bf0d3f6bbf1d8e247f1e566b6149e780;p=alexxy%2Fgromacs.git optimized generic SIMD invsqrt The function gmx_invsqrt_pr now uses one instruction less when FMA is not supported in hardware. Fixes #1333 Change-Id: Idace7296b88a8ecc0331e22d5bb3088753c478de --- diff --git a/include/gmx_simd_macros.h b/include/gmx_simd_macros.h index 58929e0726..c997b39b51 100644 --- a/include/gmx_simd_macros.h +++ b/include/gmx_simd_macros.h @@ -247,6 +247,7 @@ #define gmx_sub_pr _mm_sub_ps #define gmx_mul_pr _mm_mul_ps #ifdef GMX_X86_AVX_128_FMA +#define GMX_SIMD_HAVE_FMA #define gmx_madd_pr(a, b, c) _mm_macc_ps(a, b, c) #define gmx_nmsub_pr(a, b, c) _mm_nmacc_ps(a, b, c) #else @@ -318,6 +319,7 @@ static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_ #define gmx_sub_pr _mm_sub_pd #define gmx_mul_pr _mm_mul_pd #ifdef GMX_X86_AVX_128_FMA +#define GMX_SIMD_HAVE_FMA #define gmx_madd_pr(a, b, c) _mm_macc_pd(a, b, c) #define gmx_nmsub_pr(a, b, c) _mm_nmacc_pd(a, b, c) #else diff --git a/include/gmx_simd_math_single.h b/include/gmx_simd_math_single.h index 28feeaa075..9309b8d438 100644 --- a/include/gmx_simd_math_single.h +++ b/include/gmx_simd_math_single.h @@ -40,12 +40,25 @@ static gmx_inline gmx_mm_pr gmx_invsqrt_pr(gmx_mm_pr x) { + /* This is one of the few cases where FMA adds a FLOP, but ends up with + * fewer instructions in total when FMA is available in hardware. + * Usually we would not optimize this far, but invsqrt is used often. 
+ */ +#ifdef GMX_SIMD_HAVE_FMA const gmx_mm_pr half = gmx_set1_pr(0.5); const gmx_mm_pr one = gmx_set1_pr(1.0); gmx_mm_pr lu = gmx_rsqrt_pr(x); return gmx_madd_pr(gmx_nmsub_pr(x, gmx_mul_pr(lu, lu), one), gmx_mul_pr(lu, half), lu); +#else + const gmx_mm_pr half = gmx_set1_pr(0.5); + const gmx_mm_pr three = gmx_set1_pr(3.0); + + gmx_mm_pr lu = gmx_rsqrt_pr(x); + + return gmx_mul_pr(half, gmx_mul_pr(gmx_sub_pr(three, gmx_mul_pr(gmx_mul_pr(lu, lu), x)), lu)); +#endif }