This is mainly code reorganization.
Adds a reference plain-C, slow, arbitrary-width SIMD implementation for testing.
Adds FMA for gmx_calc_rsq_pr.
Adds generic SIMD acceleration (also with AVX or in double precision) for PME solve.
Moved SIMD vector operations to gmx_simd_vec.h.
The math functions invsqrt, inv, pmecorrF and pmecorrV have been
copied from the x86-specific single/double files to generic files
using the SIMD macros from gmx_simd_macros.h.
Moved all architecture-specific nbnxn_kernel_simd_utils code to
separate files for each SIMD architecture and replaced all macros
with inline functions.
The SIMD reference nbnxn 2xnn kernels now support 16-wide SIMD.
Adds FMA in the nbnxn kernels for calc_rsq and the Coulomb forces.
Refs #1173
Change-Id: Ieda78cc3bcb499e8c17ef8ef539c49cbc2d6d74d
* all that is needed.
*/
-/* Undefine all defines used below so we can include this file multiple times
- * with different settings from the same source file.
- */
+#ifdef _gmx_simd_macros_h_
+#error "gmx_simd_macros.h included twice"
+#else
+#define _gmx_simd_macros_h_
/* NOTE: SSE2 acceleration does not include floor or blendv */
-#undef GMX_SIMD_WIDTH_HERE
-/* float/double SIMD register type */
-#undef gmx_mm_pr
+/* Uncomment the next line, when no other SIMD is active, to test the plain-C reference code */
+/* #define GMX_SIMD_REFERENCE_PLAIN_C */
+#ifdef GMX_SIMD_REFERENCE_PLAIN_C
+/* Plain C SIMD reference implementation, also serves as documentation */
+#define GMX_HAVE_SIMD_MACROS
-/* integer SIMD register type, only used in the tabulated PME kernels */
-#undef gmx_epi32
+/* In general the reference SIMD supports any SIMD width, including 1.
+ * For the nbnxn 4xn kernels all widths (2, 4 and 8) are supported.
+ * The nbnxn 2xnn kernels are currently not supported.
+ */
+#define GMX_SIMD_REF_WIDTH 4
-#undef gmx_load_pr
-#undef gmx_load1_pr
-#undef gmx_set1_pr
-#undef gmx_setzero_pr
-#undef gmx_store_pr
+/* Include plain-C reference implementation, also serves as documentation */
+#include "gmx_simd_ref.h"
-#undef gmx_add_pr
-#undef gmx_sub_pr
-#undef gmx_mul_pr
+#define GMX_SIMD_WIDTH_HERE GMX_SIMD_REF_WIDTH
+
+/* float/double SIMD register type */
+#define gmx_mm_pr gmx_simd_ref_pr
+
+/* boolean SIMD register type */
+#define gmx_mm_pb gmx_simd_ref_pb
+
+/* integer SIMD register type, only for table indexing and exclusion masks */
+#define gmx_epi32 gmx_simd_ref_epi32
+#define GMX_SIMD_EPI32_WIDTH GMX_SIMD_REF_EPI32_WIDTH
+
+/* Load GMX_SIMD_WIDTH_HERE reals from memory starting at r */
+#define gmx_load_pr gmx_simd_ref_load_pr
+/* Set all SIMD register elements to *r */
+#define gmx_load1_pr gmx_simd_ref_load1_pr
+#define gmx_set1_pr gmx_simd_ref_set1_pr
+#define gmx_setzero_pr gmx_simd_ref_setzero_pr
+#define gmx_store_pr gmx_simd_ref_store_pr
+
+#define gmx_add_pr gmx_simd_ref_add_pr
+#define gmx_sub_pr gmx_simd_ref_sub_pr
+#define gmx_mul_pr gmx_simd_ref_mul_pr
/* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
-/* d = gmx_madd_pr(a,b,c): d = a*b + c, could use FMA3 or FMA4 */
-#undef gmx_madd_pr
-/* d = gmx_nmsub_pr(a,b,c): d = -a*b + c, could use FMA3 or FMA4 */
-#undef gmx_nmsub_pr
-#undef gmx_max_pr
-#undef gmx_cmplt_pr
-/* gmx_blendzero_pr(real a, boolean b) does: (b ? a : 0) */
-#undef gmx_blendzero_pr
-/* Logical operations on SIMD booleans */
-#undef gmx_and_pr
-#undef gmx_or_pr
-#undef gmx_andnot_pr
+#define gmx_madd_pr gmx_simd_ref_madd_pr
+#define gmx_nmsub_pr gmx_simd_ref_nmsub_pr
+
+#define gmx_max_pr gmx_simd_ref_max_pr
+#define gmx_blendzero_pr gmx_simd_ref_blendzero_pr
+
+#define gmx_round_pr gmx_simd_ref_round_pr
-/* Only used for PBC in bonded interactions, can be avoided */
-#undef gmx_round_pr
/* Not required, only used to speed up the nbnxn tabulated PME kernels */
-#undef GMX_HAVE_SIMD_FLOOR
-#undef gmx_floor_pr
+#define GMX_SIMD_HAVE_FLOOR
+#ifdef GMX_SIMD_HAVE_FLOOR
+#define gmx_floor_pr gmx_simd_ref_floor_pr
+#endif
/* Not required, only used when blendv is faster than comparison */
-#undef GMX_HAVE_SIMD_BLENDV
-#undef gmx_blendv_pr
-/* Not required, gmx_anytrue(x) returns if any of the boolean is x is True.
+#define GMX_SIMD_HAVE_BLENDV
+#ifdef GMX_SIMD_HAVE_BLENDV
+#define gmx_blendv_pr gmx_simd_ref_blendv_pr
+#endif
+
+/* Copy the sign of a to b, assumes b >= 0 for efficiency */
+#define gmx_cpsgn_nonneg_pr gmx_simd_ref_cpsgn_nonneg_pr
+
+/* Very specific operation required in the non-bonded kernels */
+#define gmx_masknot_add_pr gmx_simd_ref_masknot_add_pr
+
+/* Comparison */
+#define gmx_cmplt_pr gmx_simd_ref_cmplt_pr
+
+/* Logical operations on SIMD booleans */
+#define gmx_and_pb gmx_simd_ref_and_pb
+#define gmx_or_pb gmx_simd_ref_or_pb
+
+/* Not required, gmx_anytrue_pb(x) returns whether any of the booleans in x is True.
* If this is not present, define GMX_SIMD_IS_TRUE(real x),
* which should return x==True, where True is True as defined in SIMD.
*/
-#undef GMX_HAVE_SIMD_ANYTRUE
-#undef gmx_anytrue_pr
+#define GMX_SIMD_HAVE_ANYTRUE
+#ifdef GMX_SIMD_HAVE_ANYTRUE
+#define gmx_anytrue_pb gmx_simd_ref_anytrue_pb
+#else
+/* If we don't have gmx_anytrue_pb, we need to store gmx_mm_pb */
+#define gmx_store_pb gmx_simd_ref_store_pb
+#endif
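+
+/* Illustrative usage sketch, not part of this change: gmx_anytrue_pb lets a
+ * kernel skip a whole SIMD block when no element passes the cut-off check.
+ * The names rsq_S and rcut2_S are hypothetical:
+ *
+ *     gmx_mm_pb wco_S = gmx_cmplt_pr(rsq_S, rcut2_S);
+ *     if (!gmx_anytrue_pb(wco_S))
+ *     {
+ *         continue;
+ *     }
+ */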
-/* Integer set and cast are only used for nbnxn exclusion masks */
-#undef gmx_set1_epi32
-#undef gmx_castsi_pr
/* For topology exclusion pair checking we need: ((a & b) ? True : False)
* when we do a bit-wise and between a and b.
* When integer SIMD operations are present, we use gmx_checkbitmask_epi32(a, b)
* Otherwise we do all operations, except for the set1, in reals.
*/
-#undef gmx_load_si
-/* If the same bit is set in both input masks, return all bits 1, otherwise 0 */
-#undef gmx_checkbitmask_epi32
+
+#define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
+#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
+#define gmx_set1_epi32 gmx_simd_ref_set1_epi32
+#define gmx_load_si gmx_simd_ref_load_si
+#define gmx_checkbitmask_epi32 gmx_simd_ref_checkbitmask_epi32
+#endif
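+
+/* Illustrative usage sketch, not part of this change: topology exclusion
+ * checking with the integer operations above. The names excl_filter, i and
+ * excl_bit are hypothetical; the resulting boolean is typically passed to
+ * gmx_blendzero_pr to zero excluded interactions:
+ *
+ *     gmx_epi32 filter_S = gmx_load_si(excl_filter + i);
+ *     gmx_epi32 bit_S    = gmx_set1_epi32(excl_bit);
+ *     gmx_mm_pb excl_S   = gmx_checkbitmask_epi32(filter_S, bit_S);
+ */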
+
+/* #define GMX_SIMD_HAVE_CHECKBITMASK_PR */
+#ifdef GMX_SIMD_HAVE_CHECKBITMASK_PR
+#define gmx_castsi_pr gmx_simd_ref_castsi_pr
/* As gmx_checkbitmask_epi32, but operates on reals. In double precision two
* identical 32-bit masks are set in one double and one or both can be used.
*/
-#undef gmx_checkbitmask_pr
+#define gmx_checkbitmask_pr gmx_simd_ref_checkbitmask_pr
+#endif
/* Conversions only used for PME table lookup */
-#undef gmx_cvttpr_epi32
-#undef gmx_cvtepi32_pr
-
-#undef gmx_invsqrt_pr
-/* sqrt+inv+sin+cos+acos+atan2 are only used for bonded potentials */
-#undef gmx_sqrt_pr
-#undef gmx_inv_pr
-#undef gmx_sincos_pr
-#undef gmx_acos_pr
-#undef gmx_atan_pr
-
-#undef gmx_calc_rsq_pr
-#undef gmx_sum4_pr
-
-/* Only required for nbnxn analytical PME kernels */
-#undef gmx_pmecorrF_pr
-#undef gmx_pmecorrV_pr
-
+#define gmx_cvttpr_epi32 gmx_simd_ref_cvttpr_epi32
+#define gmx_cvtepi32_pr gmx_simd_ref_cvtepi32_pr
-/* Half SIMD-width types and operations only for nbnxn 2xnn search+kernels */
-#undef gmx_mm_hpr
-
-#undef gmx_load_hpr
-#undef gmx_load1_hpr
-#undef gmx_store_hpr
-#undef gmx_add_hpr
-#undef gmx_sub_hpr
+/* These two functions only need to be approximate; Newton-Raphson iteration
+ * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr.
+ */
+#define gmx_rsqrt_pr gmx_simd_ref_rsqrt_pr
+#define gmx_rcp_pr gmx_simd_ref_rcp_pr
-#undef gmx_sum4_hpr
+/* sqrt+inv+sin+cos+acos+atan2 are used for bonded potentials, exp for PME */
+#define GMX_SIMD_HAVE_EXP
+#ifdef GMX_SIMD_HAVE_EXP
+#define gmx_exp_pr gmx_simd_ref_exp_pr
+#endif
+#define GMX_SIMD_HAVE_TRIGONOMETRIC
+#ifdef GMX_SIMD_HAVE_TRIGONOMETRIC
+#define gmx_sqrt_pr gmx_simd_ref_sqrt_pr
+#define gmx_sincos_pr gmx_simd_ref_sincos_pr
+#define gmx_acos_pr gmx_simd_ref_acos_pr
+#define gmx_atan2_pr gmx_simd_ref_atan2_pr
+#endif
-#undef gmx_2hpr_to_pr
+#endif /* GMX_SIMD_REFERENCE_PLAIN_C */
/* The same SIMD macros can be translated to SIMD intrinsics (and compiled
*/
-/* Generic macros for obtaining a SIMD aligned pointer from pointer x */
-#undef gmx_simd_align_real
-#undef gmx_simd_align_int
-
-
#ifdef GMX_USE_HALF_WIDTH_SIMD_HERE
#if defined GMX_X86_AVX_256
/* We have half SIMD width support, continue */
#ifdef GMX_X86_SSE2
+/* This is for general x86 SIMD instruction sets that also support SSE2 */
+#define GMX_HAVE_SIMD_MACROS
+
+/* Include the highest supported x86 SIMD intrinsics + math functions */
+#ifdef GMX_X86_AVX_256
+#include "gmx_x86_avx_256.h"
+#ifdef GMX_DOUBLE
+#include "gmx_math_x86_avx_256_double.h"
+#else
+#include "gmx_math_x86_avx_256_single.h"
+#endif
+#else
+#ifdef GMX_X86_AVX_128_FMA
+#include "gmx_x86_avx_128_fma.h"
+#ifdef GMX_DOUBLE
+#include "gmx_math_x86_avx_128_fma_double.h"
+#else
+#include "gmx_math_x86_avx_128_fma_single.h"
+#endif
+#else
+#ifdef GMX_X86_SSE4_1
+#include "gmx_x86_sse4_1.h"
+#ifdef GMX_DOUBLE
+#include "gmx_math_x86_sse4_1_double.h"
+#else
+#include "gmx_math_x86_sse4_1_single.h"
+#endif
+#else
+#ifdef GMX_X86_SSE2
+#include "gmx_x86_sse2.h"
+#ifdef GMX_DOUBLE
+#include "gmx_math_x86_sse2_double.h"
+#else
+#include "gmx_math_x86_sse2_single.h"
+#endif
+#else
+#error No x86 acceleration defined
+#endif
+#endif
+#endif
+#endif
+/* exp and trigonometric functions are included above */
+#define GMX_SIMD_HAVE_EXP
+#define GMX_SIMD_HAVE_TRIGONOMETRIC
#if !defined GMX_X86_AVX_256 || defined GMX_USE_HALF_WIDTH_SIMD_HERE
#ifndef GMX_DOUBLE
-#include "gmx_x86_simd_single.h"
-
#define GMX_SIMD_WIDTH_HERE 4
#define gmx_mm_pr __m128
+#define gmx_mm_pb __m128
+
#define gmx_epi32 __m128i
+#define GMX_SIMD_EPI32_WIDTH 4
#define gmx_load_pr _mm_load_ps
#define gmx_load1_pr _mm_load1_ps
#define gmx_nmsub_pr(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b))
#endif
#define gmx_max_pr _mm_max_ps
-#define gmx_cmplt_pr _mm_cmplt_ps
#define gmx_blendzero_pr _mm_and_ps
-#define gmx_and_pr _mm_and_ps
-#define gmx_or_pr _mm_or_ps
-#define gmx_andnot_pr _mm_andnot_ps
+
+#define gmx_cmplt_pr _mm_cmplt_ps
+#define gmx_and_pb _mm_and_ps
+#define gmx_or_pb _mm_or_ps
#ifdef GMX_X86_SSE4_1
#define gmx_round_pr(x) _mm_round_ps(x, 0x0)
-#define GMX_HAVE_SIMD_FLOOR
+#define GMX_SIMD_HAVE_FLOOR
#define gmx_floor_pr _mm_floor_ps
#else
#define gmx_round_pr(x) _mm_cvtepi32_ps(_mm_cvtps_epi32(x))
#endif
#ifdef GMX_X86_SSE4_1
-#define GMX_HAVE_SIMD_BLENDV
+#define GMX_SIMD_HAVE_BLENDV
#define gmx_blendv_pr _mm_blendv_ps
#endif
-#define GMX_HAVE_SIMD_ANYTRUE
-#define gmx_anytrue_pr _mm_movemask_ps
+static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
+{
+    /* The value -0.0 has only the sign-bit set */
+    gmx_mm_pr sign_mask = _mm_set1_ps(-0.0);
+    return _mm_or_ps(_mm_and_ps(a, sign_mask), b);
+}
+
+static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
+{
+    return _mm_add_ps(b, _mm_andnot_ps(a, c));
+}
+
+#define GMX_SIMD_HAVE_ANYTRUE
+#define gmx_anytrue_pb _mm_movemask_ps
+
+#define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
#define gmx_set1_epi32 _mm_set1_epi32
-#define gmx_castsi_pr gmx_mm_castsi128_ps
#define gmx_load_si(i) _mm_load_si128((__m128i *) (i))
-#define gmx_checkbitmask_epi32(m0, m1) _mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128())
+#define gmx_checkbitmask_epi32(m0, m1) gmx_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()))
#define gmx_cvttpr_epi32 _mm_cvttps_epi32
#define gmx_cvtepi32_pr _mm_cvtepi32_ps
-#define gmx_invsqrt_pr gmx_mm_invsqrt_ps
+#define gmx_rsqrt_pr _mm_rsqrt_ps
+#define gmx_rcp_pr _mm_rcp_ps
+
+#define gmx_exp_pr gmx_mm_exp_ps
#define gmx_sqrt_pr gmx_mm_sqrt_ps
-#define gmx_inv_pr gmx_mm_inv_ps
#define gmx_sincos_pr gmx_mm_sincos_ps
#define gmx_acos_pr gmx_mm_acos_ps
#define gmx_atan2_pr gmx_mm_atan2_ps
-#define gmx_calc_rsq_pr gmx_mm_calc_rsq_ps
-#define gmx_sum4_pr gmx_mm_sum4_ps
-
-#define gmx_pmecorrF_pr gmx_mm_pmecorrF_ps
-#define gmx_pmecorrV_pr gmx_mm_pmecorrV_ps
-
#else /* ifndef GMX_DOUBLE */
-#include "gmx_x86_simd_double.h"
-
#define GMX_SIMD_WIDTH_HERE 2
#define gmx_mm_pr __m128d
+#define gmx_mm_pb __m128d
+
#define gmx_epi32 __m128i
+#define GMX_SIMD_EPI32_WIDTH 4
#define gmx_load_pr _mm_load_pd
#define gmx_load1_pr _mm_load1_pd
#define gmx_nmsub_pr(a, b, c) _mm_sub_pd(c, _mm_mul_pd(a, b))
#endif
#define gmx_max_pr _mm_max_pd
-#define gmx_cmplt_pr _mm_cmplt_pd
#define gmx_blendzero_pr _mm_and_pd
-#define gmx_and_pr _mm_and_pd
-#define gmx_or_pr _mm_or_pd
-#define gmx_andnot_pr _mm_andnot_pd
#ifdef GMX_X86_SSE4_1
#define gmx_round_pr(x) _mm_round_pd(x, 0x0)
-#define GMX_HAVE_SIMD_FLOOR
+#define GMX_SIMD_HAVE_FLOOR
#define gmx_floor_pr _mm_floor_pd
#else
#define gmx_round_pr(x) _mm_cvtepi32_pd(_mm_cvtpd_epi32(x))
#endif
#ifdef GMX_X86_SSE4_1
-#define GMX_HAVE_SIMD_BLENDV
+#define GMX_SIMD_HAVE_BLENDV
#define gmx_blendv_pr _mm_blendv_pd
#endif
-#define GMX_HAVE_SIMD_ANYTRUE
-#define gmx_anytrue_pr _mm_movemask_pd
+static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
+{
+    gmx_mm_pr sign_mask = _mm_set1_pd(-0.0);
+    return _mm_or_pd(_mm_and_pd(a, sign_mask), b);
+}
+
+static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
+{
+    return _mm_add_pd(b, _mm_andnot_pd(a, c));
+}
+
+#define gmx_cmplt_pr _mm_cmplt_pd
+
+#define gmx_and_pb _mm_and_pd
+#define gmx_or_pb _mm_or_pd
+
+#define GMX_SIMD_HAVE_ANYTRUE
+#define gmx_anytrue_pb _mm_movemask_pd
+
+#define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
#define gmx_set1_epi32 _mm_set1_epi32
-#define gmx_castsi_pr gmx_mm_castsi128_pd
#define gmx_load_si(i) _mm_load_si128((__m128i *) (i))
-#define gmx_checkbitmask_epi32(m0, m1) _mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128())
+#define gmx_checkbitmask_epi32(m0, m1) gmx_mm_castsi128_pd(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()))
#define gmx_cvttpr_epi32 _mm_cvttpd_epi32
#define gmx_cvtepi32_pr _mm_cvtepi32_pd
-#define gmx_invsqrt_pr gmx_mm_invsqrt_pd
+#define gmx_rsqrt_pr(r) _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(r)))
+#define gmx_rcp_pr(r) _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(r)))
+
+#define gmx_exp_pr gmx_mm_exp_pd
#define gmx_sqrt_pr gmx_mm_sqrt_pd
-#define gmx_inv_pr gmx_mm_inv_pd
#define gmx_sincos_pr gmx_mm_sincos_pd
#define gmx_acos_pr gmx_mm_acos_pd
#define gmx_atan2_pr gmx_mm_atan2_pd
-#define gmx_calc_rsq_pr gmx_mm_calc_rsq_pd
-#define gmx_sum4_pr gmx_mm_sum4_pd
-
-#define gmx_pmecorrF_pr gmx_mm_pmecorrF_pd
-#define gmx_pmecorrV_pr gmx_mm_pmecorrV_pd
-
#endif /* ifndef GMX_DOUBLE */
#else
#ifndef GMX_DOUBLE
-#include "gmx_x86_simd_single.h"
-
#define GMX_SIMD_WIDTH_HERE 8
#define gmx_mm_pr __m256
+#define gmx_mm_pb __m256
+
#define gmx_epi32 __m256i
+#define GMX_SIMD_EPI32_WIDTH 8
#define gmx_load_pr _mm256_load_ps
#define gmx_load1_pr(x) _mm256_set1_ps((x)[0])
#define gmx_madd_pr(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
#define gmx_nmsub_pr(a, b, c) _mm256_sub_ps(c, _mm256_mul_ps(a, b))
#define gmx_max_pr _mm256_max_ps
-/* Less-than (we use ordered, non-signaling, but that's not required) */
-#define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11)
#define gmx_blendzero_pr _mm256_and_ps
-#define gmx_and_pr _mm256_and_ps
-#define gmx_or_pr _mm256_or_ps
-#define gmx_andnot_pr _mm256_andnot_ps
#define gmx_round_pr(x) _mm256_round_ps(x, 0x0)
-#define GMX_HAVE_SIMD_FLOOR
+#define GMX_SIMD_HAVE_FLOOR
#define gmx_floor_pr _mm256_floor_ps
-#define GMX_HAVE_SIMD_BLENDV
+#define GMX_SIMD_HAVE_BLENDV
#define gmx_blendv_pr _mm256_blendv_ps
-#define GMX_HAVE_SIMD_ANYTRUE
-#define gmx_anytrue_pr _mm256_movemask_ps
+static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
+{
+    gmx_mm_pr sign_mask = _mm256_set1_ps(-0.0);
+    return _mm256_or_ps(_mm256_and_ps(a, sign_mask), b);
+}
+
+static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
+{
+    return _mm256_add_ps(b, _mm256_andnot_ps(a, c));
+}
+
+/* Less-than (we use ordered, non-signaling, but that's not required) */
+#define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11)
+#define gmx_and_pb _mm256_and_ps
+#define gmx_or_pb _mm256_or_ps
+#define GMX_SIMD_HAVE_ANYTRUE
+#define gmx_anytrue_pb _mm256_movemask_ps
+
+#define GMX_SIMD_HAVE_CHECKBITMASK_PR
#define gmx_set1_epi32 _mm256_set1_epi32
#define gmx_castsi_pr _mm256_castsi256_ps
/* With <= 16 bits used the cast and conversion should not be required,
#define gmx_cvttpr_epi32 _mm256_cvttps_epi32
-#define gmx_invsqrt_pr gmx_mm256_invsqrt_ps
+#define gmx_rsqrt_pr _mm256_rsqrt_ps
+#define gmx_rcp_pr _mm256_rcp_ps
+
+#define gmx_exp_pr gmx_mm256_exp_ps
#define gmx_sqrt_pr gmx_mm256_sqrt_ps
-#define gmx_inv_pr gmx_mm256_inv_ps
#define gmx_sincos_pr gmx_mm256_sincos_ps
#define gmx_acos_pr gmx_mm256_acos_ps
#define gmx_atan2_pr gmx_mm256_atan2_ps
-#define gmx_calc_rsq_pr gmx_mm256_calc_rsq_ps
-#define gmx_sum4_pr gmx_mm256_sum4_ps
-
-#define gmx_pmecorrF_pr gmx_mm256_pmecorrF_ps
-#define gmx_pmecorrV_pr gmx_mm256_pmecorrV_ps
-
#else
-#include "gmx_x86_simd_double.h"
-
#define GMX_SIMD_WIDTH_HERE 4
#define gmx_mm_pr __m256d
+#define gmx_mm_pb __m256d
+
/* We use 128-bit integer registers because of missing 256-bit operations */
#define gmx_epi32 __m128i
+#define GMX_SIMD_EPI32_WIDTH 4
#define gmx_load_pr _mm256_load_pd
#define gmx_load1_pr(x) _mm256_set1_pd((x)[0])
#define gmx_madd_pr(a, b, c) _mm256_add_pd(c, _mm256_mul_pd(a, b))
#define gmx_nmsub_pr(a, b, c) _mm256_sub_pd(c, _mm256_mul_pd(a, b))
#define gmx_max_pr _mm256_max_pd
-/* Less-than (we use ordered, non-signaling, but that's not required) */
-#define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
#define gmx_blendzero_pr _mm256_and_pd
-#define gmx_and_pr _mm256_and_pd
-#define gmx_or_pr _mm256_or_pd
-#define gmx_andnot_pr _mm256_andnot_pd
#define gmx_round_pr(x) _mm256_round_pd(x, 0x0)
-#define GMX_HAVE_SIMD_FLOOR
+#define GMX_SIMD_HAVE_FLOOR
#define gmx_floor_pr _mm256_floor_pd
-#define GMX_HAVE_SIMD_BLENDV
+#define GMX_SIMD_HAVE_BLENDV
#define gmx_blendv_pr _mm256_blendv_pd
-#define GMX_HAVE_SIMD_ANYTRUE
-#define gmx_anytrue_pr _mm256_movemask_pd
+static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
+{
+    gmx_mm_pr sign_mask = _mm256_set1_pd(-0.0);
+    return _mm256_or_pd(_mm256_and_pd(a, sign_mask), b);
+}
+
+static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
+{
+    return _mm256_add_pd(b, _mm256_andnot_pd(a, c));
+}
+
+/* Less-than (we use ordered, non-signaling, but that's not required) */
+#define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
+
+#define gmx_and_pb _mm256_and_pd
+#define gmx_or_pb _mm256_or_pd
+#define GMX_SIMD_HAVE_ANYTRUE
+#define gmx_anytrue_pb _mm256_movemask_pd
+
+#define GMX_SIMD_HAVE_CHECKBITMASK_PR
#define gmx_set1_epi32 _mm256_set1_epi32
#define gmx_castsi_pr _mm256_castsi256_pd
/* With <= 16 bits used the cast and conversion should not be required,
#define gmx_cvttpr_epi32 _mm256_cvttpd_epi32
-#define gmx_invsqrt_pr gmx_mm256_invsqrt_pd
+#define gmx_rsqrt_pr(r) _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r)))
+#define gmx_rcp_pr(r) _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(r)))
+
+#define gmx_exp_pr gmx_mm256_exp_pd
#define gmx_sqrt_pr gmx_mm256_sqrt_pd
-#define gmx_inv_pr gmx_mm256_inv_pd
#define gmx_sincos_pr gmx_mm256_sincos_pd
#define gmx_acos_pr gmx_mm256_acos_pd
#define gmx_atan2_pr gmx_mm256_atan2_pd
-#define gmx_calc_rsq_pr gmx_mm256_calc_rsq_pd
-#define gmx_sum4_pr gmx_mm256_sum4_pd
-
-#define gmx_pmecorrF_pr gmx_mm256_pmecorrF_pd
-#define gmx_pmecorrV_pr gmx_mm256_pmecorrV_pd
-
#endif /* GMX_DOUBLE */
#endif /* 128- or 256-bit x86 SIMD */
#endif /* GMX_X86_SSE2 */
-/* Generic macros to extract a SIMD aligned pointer from a pointer x.
+#ifdef GMX_HAVE_SIMD_MACROS
+/* Generic functions to extract a SIMD aligned pointer from a pointer x.
* x should have at least GMX_SIMD_WIDTH_HERE elements extra compared
* to how many you want to use, to avoid indexing outside the aligned region.
*/
-#define gmx_simd_align_real(x) (real *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(real)-1))))
+static gmx_inline real *
+gmx_simd_align_real(const real *x)
+{
+ return (real *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(real)-1))));
+}
+
+static gmx_inline int *
+gmx_simd_align_int(const int *x)
+{
+ return (int *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int )-1))));
+}
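+
+/* A minimal usage sketch, not part of this change: allocate
+ * GMX_SIMD_WIDTH_HERE extra elements so the aligned pointer stays inside
+ * the buffer. N, buf and buf_aligned are hypothetical names:
+ *
+ *     real  buf[N + GMX_SIMD_WIDTH_HERE];
+ *     real *buf_aligned = gmx_simd_align_real(buf);
+ *
+ *     gmx_store_pr(buf_aligned, gmx_setzero_pr());
+ */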
+
+
+/* Include the math functions which only need the above macros,
+ * generally these are the ones that don't need masking operations.
+ */
+#ifdef GMX_DOUBLE
+#include "gmx_simd_math_double.h"
+#else
+#include "gmx_simd_math_single.h"
+#endif
+
+#endif /* GMX_HAVE_SIMD_MACROS */
-#define gmx_simd_align_int(x) (int *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int )-1))))
+#endif /* _gmx_simd_macros_h_ */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef _gmx_simd_math_double_h_
+#define _gmx_simd_math_double_h_
+
+
+/* 1.0/sqrt(x) */
+static gmx_inline gmx_mm_pr
+gmx_invsqrt_pr(gmx_mm_pr x)
+{
+ const gmx_mm_pr half = gmx_set1_pr(0.5);
+ const gmx_mm_pr three = gmx_set1_pr(3.0);
+
+ /* Lookup instruction only exists in single precision, convert back and forth... */
+ gmx_mm_pr lu = gmx_rsqrt_pr(x);
+
+ lu = gmx_mul_pr(gmx_mul_pr(half, lu), gmx_nmsub_pr(gmx_mul_pr(lu, lu), x, three));
+ return gmx_mul_pr(gmx_mul_pr(half, lu), gmx_nmsub_pr(gmx_mul_pr(lu, lu), x, three));
+}
+
+
+/* 1.0/x */
+static gmx_inline gmx_mm_pr
+gmx_inv_pr(gmx_mm_pr x)
+{
+ const gmx_mm_pr two = gmx_set1_pr(2.0);
+
+ /* Lookup instruction only exists in single precision, convert back and forth... */
+ gmx_mm_pr lu = gmx_rcp_pr(x);
+
+ /* Perform two N-R steps for double precision */
+ lu = gmx_mul_pr(lu, gmx_nmsub_pr(lu, x, two));
+ return gmx_mul_pr(lu, gmx_nmsub_pr(lu, x, two));
+}
+
+
+/* Calculate the force correction due to PME analytically.
+ *
+ * This routine is meant to enable analytical evaluation of the
+ * direct-space PME electrostatic force to avoid tables.
+ *
+ * The direct-space potential should be Erfc(beta*r)/r, but there
+ * are some problems evaluating that:
+ *
+ * First, the error function is difficult (read: expensive) to
+ * approximate accurately for intermediate to large arguments, and
+ * this happens already in ranges of beta*r that occur in simulations.
+ * Second, we now try to avoid calculating potentials in Gromacs but
+ * use forces directly.
+ *
+ * We can simplify things slightly by noting that the PME part is really
+ * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
+ *
+ * V= 1/r - Erf(beta*r)/r
+ *
+ * The first term we already have from the inverse square root, so
+ * that we can leave out of this routine.
+ *
+ * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
+ * the argument beta*r will be in the range 0.15 to ~4. Use your
+ * favorite plotting program to realize how well-behaved Erf(z)/z is
+ * in this range!
+ *
+ * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
+ * However, it turns out it is more efficient to approximate f(z)/z and
+ * then only use even powers. This is another minor optimization, since
+ * we actually WANT f(z)/z, because it is going to be multiplied by
+ * the vector between the two atoms to get the vectorial force. The
+ * fastest flops are the ones we can avoid calculating!
+ *
+ * So, here's how it should be used:
+ *
+ * 1. Calculate r^2.
+ * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
+ * 3. Evaluate this routine with z^2 as the argument.
+ * 4. The return value is the expression:
+ *
+ *
+ * 2*exp(-z^2) erf(z)
+ * ------------ - --------
+ * sqrt(Pi)*z^2 z^3
+ *
+ * 5. Multiply the entire expression by beta^3. This will get you
+ *
+ * beta^3*2*exp(-z^2) beta^3*erf(z)
+ * ------------------ - ---------------
+ * sqrt(Pi)*z^2 z^3
+ *
+ * or, switching back to r (z=r*beta):
+ *
+ * 2*beta*exp(-r^2*beta^2) erf(r*beta)
+ * ----------------------- - -----------
+ * sqrt(Pi)*r^2 r^3
+ *
+ *
+ * With a bit of math exercise you should be able to confirm that
+ * this is exactly D[Erf[beta*r]/r,r] divided by r another time.
+ *
+ * 6. Add the result to 1/r^3, multiply by the product of the charges,
+ * and you have your force (divided by r). A final multiplication
+ * with the vector connecting the two particles and you have your
+ * vectorial force to add to the particles.
+ *
+ */
+static gmx_mm_pr
+gmx_pmecorrF_pr(gmx_mm_pr z2)
+{
+ const gmx_mm_pr FN10 = gmx_set1_pr(-8.0072854618360083154e-14);
+ const gmx_mm_pr FN9 = gmx_set1_pr(1.1859116242260148027e-11);
+ const gmx_mm_pr FN8 = gmx_set1_pr(-8.1490406329798423616e-10);
+ const gmx_mm_pr FN7 = gmx_set1_pr(3.4404793543907847655e-8);
+ const gmx_mm_pr FN6 = gmx_set1_pr(-9.9471420832602741006e-7);
+ const gmx_mm_pr FN5 = gmx_set1_pr(0.000020740315999115847456);
+ const gmx_mm_pr FN4 = gmx_set1_pr(-0.00031991745139313364005);
+ const gmx_mm_pr FN3 = gmx_set1_pr(0.0035074449373659008203);
+ const gmx_mm_pr FN2 = gmx_set1_pr(-0.031750380176100813405);
+ const gmx_mm_pr FN1 = gmx_set1_pr(0.13884101728898463426);
+ const gmx_mm_pr FN0 = gmx_set1_pr(-0.75225277815249618847);
+
+ const gmx_mm_pr FD5 = gmx_set1_pr(0.000016009278224355026701);
+ const gmx_mm_pr FD4 = gmx_set1_pr(0.00051055686934806966046);
+ const gmx_mm_pr FD3 = gmx_set1_pr(0.0081803507497974289008);
+ const gmx_mm_pr FD2 = gmx_set1_pr(0.077181146026670287235);
+ const gmx_mm_pr FD1 = gmx_set1_pr(0.41543303143712535988);
+ const gmx_mm_pr FD0 = gmx_set1_pr(1.0);
+
+ gmx_mm_pr z4;
+ gmx_mm_pr polyFN0, polyFN1, polyFD0, polyFD1;
+
+ z4 = gmx_mul_pr(z2, z2);
+
+ polyFD1 = gmx_madd_pr(FD5, z4, FD3);
+ polyFD1 = gmx_madd_pr(polyFD1, z4, FD1);
+ polyFD1 = gmx_mul_pr(polyFD1, z2);
+ polyFD0 = gmx_madd_pr(FD4, z4, FD2);
+ polyFD0 = gmx_madd_pr(polyFD0, z4, FD0);
+ polyFD0 = gmx_add_pr(polyFD0, polyFD1);
+
+ polyFD0 = gmx_inv_pr(polyFD0);
+
+ polyFN0 = gmx_madd_pr(FN10, z4, FN8);
+ polyFN0 = gmx_madd_pr(polyFN0, z4, FN6);
+ polyFN0 = gmx_madd_pr(polyFN0, z4, FN4);
+ polyFN0 = gmx_madd_pr(polyFN0, z4, FN2);
+ polyFN0 = gmx_madd_pr(polyFN0, z4, FN0);
+ polyFN1 = gmx_madd_pr(FN9, z4, FN7);
+ polyFN1 = gmx_madd_pr(polyFN1, z4, FN5);
+ polyFN1 = gmx_madd_pr(polyFN1, z4, FN3);
+ polyFN1 = gmx_madd_pr(polyFN1, z4, FN1);
+ polyFN0 = gmx_madd_pr(polyFN1, z2, polyFN0);
+
+ return gmx_mul_pr(polyFN0, polyFD0);
+}
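+
+/* A hedged sketch, not part of this change, of steps 1-6 above for the
+ * force; rsq_S, beta2_S (beta^2), beta3_S (beta^3), qq_S and fscal_S are
+ * hypothetical names:
+ *
+ *     gmx_mm_pr rinv_S  = gmx_invsqrt_pr(rsq_S);
+ *     gmx_mm_pr rinv3_S = gmx_mul_pr(rinv_S, gmx_mul_pr(rinv_S, rinv_S));
+ *     gmx_mm_pr z2_S    = gmx_mul_pr(rsq_S, beta2_S);
+ *     gmx_mm_pr corr_S  = gmx_mul_pr(beta3_S, gmx_pmecorrF_pr(z2_S));
+ *     gmx_mm_pr fscal_S = gmx_mul_pr(qq_S, gmx_add_pr(rinv3_S, corr_S));
+ */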
+
+
+/* Calculate the potential correction due to PME analytically.
+ *
+ * This routine calculates Erf(z)/z, although you should provide z^2
+ * as the input argument.
+ *
+ * Here's how it should be used:
+ *
+ * 1. Calculate r^2.
+ * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
+ * 3. Evaluate this routine with z^2 as the argument.
+ * 4. The return value is the expression:
+ *
+ *
+ * erf(z)
+ * --------
+ * z
+ *
+ * 5. Multiply the entire expression by beta and switch back to r (z=r*beta):
+ *
+ * erf(r*beta)
+ * -----------
+ * r
+ *
+ * 6. Subtract the result from 1/r, multiply by the product of the charges,
+ * and you have your potential.
+ *
+ */
+static gmx_mm_pr
+gmx_pmecorrV_pr(gmx_mm_pr z2)
+{
+ const gmx_mm_pr VN9 = gmx_set1_pr(-9.3723776169321855475e-13);
+ const gmx_mm_pr VN8 = gmx_set1_pr(1.2280156762674215741e-10);
+ const gmx_mm_pr VN7 = gmx_set1_pr(-7.3562157912251309487e-9);
+ const gmx_mm_pr VN6 = gmx_set1_pr(2.6215886208032517509e-7);
+ const gmx_mm_pr VN5 = gmx_set1_pr(-4.9532491651265819499e-6);
+ const gmx_mm_pr VN4 = gmx_set1_pr(0.00025907400778966060389);
+ const gmx_mm_pr VN3 = gmx_set1_pr(0.0010585044856156469792);
+ const gmx_mm_pr VN2 = gmx_set1_pr(0.045247661136833092885);
+ const gmx_mm_pr VN1 = gmx_set1_pr(0.11643931522926034421);
+ const gmx_mm_pr VN0 = gmx_set1_pr(1.1283791671726767970);
+
+ const gmx_mm_pr VD5 = gmx_set1_pr(0.000021784709867336150342);
+ const gmx_mm_pr VD4 = gmx_set1_pr(0.00064293662010911388448);
+ const gmx_mm_pr VD3 = gmx_set1_pr(0.0096311444822588683504);
+ const gmx_mm_pr VD2 = gmx_set1_pr(0.085608012351550627051);
+ const gmx_mm_pr VD1 = gmx_set1_pr(0.43652499166614811084);
+ const gmx_mm_pr VD0 = gmx_set1_pr(1.0);
+
+ gmx_mm_pr z4;
+ gmx_mm_pr polyVN0, polyVN1, polyVD0, polyVD1;
+
+ z4 = gmx_mul_pr(z2, z2);
+
+ polyVD1 = gmx_madd_pr(VD5, z4, VD3);
+ polyVD0 = gmx_madd_pr(VD4, z4, VD2);
+ polyVD1 = gmx_madd_pr(polyVD1, z4, VD1);
+ polyVD0 = gmx_madd_pr(polyVD0, z4, VD0);
+ polyVD0 = gmx_madd_pr(polyVD1, z2, polyVD0);
+
+ polyVD0 = gmx_inv_pr(polyVD0);
+
+ polyVN1 = gmx_madd_pr(VN9, z4, VN7);
+ polyVN0 = gmx_madd_pr(VN8, z4, VN6);
+ polyVN1 = gmx_madd_pr(polyVN1, z4, VN5);
+ polyVN0 = gmx_madd_pr(polyVN0, z4, VN4);
+ polyVN1 = gmx_madd_pr(polyVN1, z4, VN3);
+ polyVN0 = gmx_madd_pr(polyVN0, z4, VN2);
+ polyVN1 = gmx_madd_pr(polyVN1, z4, VN1);
+ polyVN0 = gmx_madd_pr(polyVN0, z4, VN0);
+ polyVN0 = gmx_madd_pr(polyVN1, z2, polyVN0);
+
+ return gmx_mul_pr(polyVN0, polyVD0);
+}
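+
+/* A corresponding sketch, not part of this change, for the potential;
+ * rsq_S, rinv_S, beta_S, qq_S and vcoul_S are hypothetical names:
+ *
+ *     gmx_mm_pr z2_S    = gmx_mul_pr(rsq_S, gmx_mul_pr(beta_S, beta_S));
+ *     gmx_mm_pr corr_S  = gmx_mul_pr(beta_S, gmx_pmecorrV_pr(z2_S));
+ *     gmx_mm_pr vcoul_S = gmx_mul_pr(qq_S, gmx_sub_pr(rinv_S, corr_S));
+ */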
+
+
+#endif /*_gmx_simd_math_double_h_ */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef _gmx_simd_math_single_h_
+#define _gmx_simd_math_single_h_
+
+
+/* 1.0/sqrt(x) */
+static gmx_inline gmx_mm_pr
+gmx_invsqrt_pr(gmx_mm_pr x)
+{
+ const gmx_mm_pr half = gmx_set1_pr(0.5);
+ const gmx_mm_pr one = gmx_set1_pr(1.0);
+
+ gmx_mm_pr lu = gmx_rsqrt_pr(x);
+
+ return gmx_madd_pr(gmx_nmsub_pr(x, gmx_mul_pr(lu, lu), one), gmx_mul_pr(lu, half), lu);
+}
+
+
+/* 1.0/x */
+static gmx_inline gmx_mm_pr
+gmx_inv_pr(gmx_mm_pr x)
+{
+ const gmx_mm_pr two = gmx_set1_pr(2.0);
+
+ gmx_mm_pr lu = gmx_rcp_pr(x);
+
+ return gmx_mul_pr(lu, gmx_nmsub_pr(lu, x, two));
+}
+
+
+/* Calculate the force correction due to PME analytically.
+ *
+ * This routine is meant to enable analytical evaluation of the
+ * direct-space PME electrostatic force to avoid tables.
+ *
+ * The direct-space potential should be Erfc(beta*r)/r, but there
+ * are some problems evaluating that:
+ *
+ * First, the error function is difficult (read: expensive) to
+ * approximate accurately for intermediate to large arguments, and
+ * this happens already in ranges of beta*r that occur in simulations.
+ * Second, we now try to avoid calculating potentials in Gromacs but
+ * use forces directly.
+ *
+ * We can simplify things slightly by noting that the PME part is really
+ * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
+ *
+ * V= 1/r - Erf(beta*r)/r
+ *
+ * The first term we already have from the inverse square root, so
+ * that we can leave out of this routine.
+ *
+ * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
+ * the argument beta*r will be in the range 0.15 to ~4. Use your
+ * favorite plotting program to realize how well-behaved Erf(z)/z is
+ * in this range!
+ *
+ * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
+ * However, it turns out it is more efficient to approximate f(z)/z and
+ * then only use even powers. This is another minor optimization, since
+ * we actually WANT f(z)/z, because it is going to be multiplied by
+ * the vector between the two atoms to get the vectorial force. The
+ * fastest flops are the ones we can avoid calculating!
+ *
+ * So, here's how it should be used:
+ *
+ * 1. Calculate r^2.
+ * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
+ * 3. Evaluate this routine with z^2 as the argument.
+ * 4. The return value is the expression:
+ *
+ *
+ * 2*exp(-z^2) erf(z)
+ * ------------ - --------
+ * sqrt(Pi)*z^2 z^3
+ *
+ * 5. Multiply the entire expression by beta^3. This will get you
+ *
+ * beta^3*2*exp(-z^2) beta^3*erf(z)
+ * ------------------ - ---------------
+ * sqrt(Pi)*z^2 z^3
+ *
+ * or, switching back to r (z=r*beta):
+ *
+ * 2*beta*exp(-r^2*beta^2) erf(r*beta)
+ * ----------------------- - -----------
+ * sqrt(Pi)*r^2 r^3
+ *
+ *
+ * With a bit of math exercise you should be able to confirm that
+ * this is exactly D[Erf[beta*r]/r,r] divided by r another time.
+ *
+ * 6. Add the result to 1/r^3, multiply by the product of the charges,
+ * and you have your force (divided by r). A final multiplication
+ * with the vector connecting the two particles and you have your
+ * vectorial force to add to the particles.
+ *
+ */
+static gmx_mm_pr
+gmx_pmecorrF_pr(gmx_mm_pr z2)
+{
+ const gmx_mm_pr FN6 = gmx_set1_pr(-1.7357322914161492954e-8f);
+ const gmx_mm_pr FN5 = gmx_set1_pr(1.4703624142580877519e-6f);
+ const gmx_mm_pr FN4 = gmx_set1_pr(-0.000053401640219807709149f);
+ const gmx_mm_pr FN3 = gmx_set1_pr(0.0010054721316683106153f);
+ const gmx_mm_pr FN2 = gmx_set1_pr(-0.019278317264888380590f);
+ const gmx_mm_pr FN1 = gmx_set1_pr(0.069670166153766424023f);
+ const gmx_mm_pr FN0 = gmx_set1_pr(-0.75225204789749321333f);
+
+ const gmx_mm_pr FD4 = gmx_set1_pr(0.0011193462567257629232f);
+ const gmx_mm_pr FD3 = gmx_set1_pr(0.014866955030185295499f);
+ const gmx_mm_pr FD2 = gmx_set1_pr(0.11583842382862377919f);
+ const gmx_mm_pr FD1 = gmx_set1_pr(0.50736591960530292870f);
+ const gmx_mm_pr FD0 = gmx_set1_pr(1.0f);
+
+ gmx_mm_pr z4;
+ gmx_mm_pr polyFN0, polyFN1, polyFD0, polyFD1;
+
+ z4 = gmx_mul_pr(z2, z2);
+
+ polyFD0 = gmx_madd_pr(FD4, z4, FD2);
+ polyFD1 = gmx_madd_pr(FD3, z4, FD1);
+ polyFD0 = gmx_madd_pr(polyFD0, z4, FD0);
+ polyFD0 = gmx_madd_pr(polyFD1, z2, polyFD0);
+
+ polyFD0 = gmx_inv_pr(polyFD0);
+
+ polyFN0 = gmx_madd_pr(FN6, z4, FN4);
+ polyFN1 = gmx_madd_pr(FN5, z4, FN3);
+ polyFN0 = gmx_madd_pr(polyFN0, z4, FN2);
+ polyFN1 = gmx_madd_pr(polyFN1, z4, FN1);
+ polyFN0 = gmx_madd_pr(polyFN0, z4, FN0);
+ polyFN0 = gmx_madd_pr(polyFN1, z2, polyFN0);
+
+ return gmx_mul_pr(polyFN0, polyFD0);
+}
+
+
+/* Calculate the potential correction due to PME analytically.
+ *
+ * See gmx_pmecorrF_pr() for details about the approximation.
+ *
+ * This routine calculates Erf(z)/z, although you should provide z^2
+ * as the input argument.
+ *
+ * Here's how it should be used:
+ *
+ * 1. Calculate r^2.
+ * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
+ * 3. Evaluate this routine with z^2 as the argument.
+ * 4. The return value is the expression:
+ *
+ *
+ * erf(z)
+ * --------
+ * z
+ *
+ * 5. Multiply the entire expression by beta and switch back to r (z=r*beta):
+ *
+ * erf(r*beta)
+ * -----------
+ * r
+ *
+ * 6. Subtract the result from 1/r, multiply by the product of the charges,
+ * and you have your potential.
+ */
+static gmx_mm_pr
+gmx_pmecorrV_pr(gmx_mm_pr z2)
+{
+ const gmx_mm_pr VN6 = gmx_set1_pr(1.9296833005951166339e-8f);
+ const gmx_mm_pr VN5 = gmx_set1_pr(-1.4213390571557850962e-6f);
+ const gmx_mm_pr VN4 = gmx_set1_pr(0.000041603292906656984871f);
+ const gmx_mm_pr VN3 = gmx_set1_pr(-0.00013134036773265025626f);
+ const gmx_mm_pr VN2 = gmx_set1_pr(0.038657983986041781264f);
+ const gmx_mm_pr VN1 = gmx_set1_pr(0.11285044772717598220f);
+ const gmx_mm_pr VN0 = gmx_set1_pr(1.1283802385263030286f);
+
+ const gmx_mm_pr VD3 = gmx_set1_pr(0.0066752224023576045451f);
+ const gmx_mm_pr VD2 = gmx_set1_pr(0.078647795836373922256f);
+ const gmx_mm_pr VD1 = gmx_set1_pr(0.43336185284710920150f);
+ const gmx_mm_pr VD0 = gmx_set1_pr(1.0f);
+
+ gmx_mm_pr z4;
+ gmx_mm_pr polyVN0, polyVN1, polyVD0, polyVD1;
+
+ z4 = gmx_mul_pr(z2, z2);
+
+ polyVD1 = gmx_madd_pr(VD3, z4, VD1);
+ polyVD0 = gmx_madd_pr(VD2, z4, VD0);
+ polyVD0 = gmx_madd_pr(polyVD1, z2, polyVD0);
+
+ polyVD0 = gmx_inv_pr(polyVD0);
+
+ polyVN0 = gmx_madd_pr(VN6, z4, VN4);
+ polyVN1 = gmx_madd_pr(VN5, z4, VN3);
+ polyVN0 = gmx_madd_pr(polyVN0, z4, VN2);
+ polyVN1 = gmx_madd_pr(polyVN1, z4, VN1);
+ polyVN0 = gmx_madd_pr(polyVN0, z4, VN0);
+ polyVN0 = gmx_madd_pr(polyVN1, z2, polyVN0);
+
+ return gmx_mul_pr(polyVN0, polyVD0);
+}
+
+
+#endif /* _gmx_simd_math_single_h_ */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef _gmx_simd_ref_h_
+#define _gmx_simd_ref_h_
+
+/* This file contains a reference plain-C implementation of arbitrary width.
+ * This code is only useful for testing and documentation.
+ * The SIMD width is set by defining GMX_SIMD_REF_WIDTH before including this file.
+ */
+
+
+#ifndef GMX_SIMD_REF_WIDTH
+#error "GMX_SIMD_REF_WIDTH should be defined before including gmx_simd_ref.h"
+#endif
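+
+/* A minimal include sketch, not part of this change, as done in
+ * gmx_simd_macros.h:
+ *
+ *     #define GMX_SIMD_REF_WIDTH 4
+ *     #include "gmx_simd_ref.h"
+ */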
+
+#include <math.h>
+
+/* float/double SIMD register type */
+typedef struct {
+ real r[GMX_SIMD_REF_WIDTH];
+} gmx_simd_ref_pr;
+
+/* boolean SIMD register type */
+typedef struct {
+ char r[GMX_SIMD_REF_WIDTH];
+} gmx_simd_ref_pb;
+
+/* integer SIMD register type, only for table indexing and exclusion masks */
+typedef struct {
+ int r[GMX_SIMD_REF_WIDTH];
+} gmx_simd_ref_epi32;
+#define GMX_SIMD_REF_EPI32_WIDTH GMX_SIMD_REF_WIDTH
+
+/* Load GMX_SIMD_REF_WIDTH reals from memory starting at r */
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_load_pr(const real *r)
+{
+ gmx_simd_ref_pr a;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ a.r[i] = r[i];
+ }
+
+ return a;
+}
+
+/* Set all SIMD register elements to *r */
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_load1_pr(const real *r)
+{
+ gmx_simd_ref_pr a;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ a.r[i] = *r;
+ }
+
+ return a;
+}
+
+/* Set all SIMD register elements to r */
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_set1_pr(real r)
+{
+ gmx_simd_ref_pr a;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ a.r[i] = r;
+ }
+
+ return a;
+}
+
+/* Set all SIMD register elements to 0 */
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_setzero_pr()
+{
+ gmx_simd_ref_pr a;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ a.r[i] = 0.0;
+ }
+
+ return a;
+}
+
+static gmx_inline void
+gmx_simd_ref_store_pr(real *dest, gmx_simd_ref_pr src)
+{
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ dest[i] = src.r[i];
+ }
+}
+
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_add_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
+{
+ gmx_simd_ref_pr c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ c.r[i] = a.r[i] + b.r[i];
+ }
+
+ return c;
+}
+
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_sub_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
+{
+ gmx_simd_ref_pr c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ c.r[i] = a.r[i] - b.r[i];
+ }
+
+ return c;
+}
+
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_mul_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
+{
+ gmx_simd_ref_pr c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ c.r[i] = a.r[i]*b.r[i];
+ }
+
+ return c;
+}
+
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_madd_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b, gmx_simd_ref_pr c)
+{
+ gmx_simd_ref_pr d;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ d.r[i] = a.r[i]*b.r[i] + c.r[i];
+ }
+
+ return d;
+}
+
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_nmsub_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b, gmx_simd_ref_pr c)
+{
+ gmx_simd_ref_pr d;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ d.r[i] = -a.r[i]*b.r[i] + c.r[i];
+ }
+
+ return d;
+}
+
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_max_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
+{
+ gmx_simd_ref_pr c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ c.r[i] = (a.r[i] >= b.r[i] ? a.r[i] : b.r[i]);
+ }
+
+ return c;
+}
+
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_blendzero_pr(gmx_simd_ref_pr a, gmx_simd_ref_pb b)
+{
+ gmx_simd_ref_pr c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ c.r[i] = (b.r[i] ? a.r[i] : 0.0);
+ }
+
+ return c;
+}
+
+/* Note that this reference implementation rounds away from zero,
+ * whereas most SIMD intrinsics will round to nearest even.
+ * Since this function is only used for periodic image calculations,
+ * the rounding of mantissas close to 0.5 is irrelevant.
+ */
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_round_pr(gmx_simd_ref_pr a)
+{
+ gmx_simd_ref_pr b;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+#ifdef GMX_DOUBLE
+ b.r[i] = round(a.r[i]);
+#else
+ b.r[i] = roundf(a.r[i]);
+#endif
+ }
+
+ return b;
+}
+
+/* Not required, only used to speed up the nbnxn tabulated PME kernels */
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_floor_pr(gmx_simd_ref_pr a)
+{
+ gmx_simd_ref_pr b;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+#ifdef GMX_DOUBLE
+ b.r[i] = floor(a.r[i]);
+#else
+ b.r[i] = floorf(a.r[i]);
+#endif
+ }
+
+ return b;
+}
+
+/* Not required, only used when blendv is faster than comparison */
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_blendv_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b, gmx_simd_ref_pr c)
+{
+ gmx_simd_ref_pr d;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ d.r[i] = (c.r[i] >= 0) ? a.r[i] : b.r[i];
+ }
+
+ return d;
+}
+
+/* Copy the sign of a to b, assumes b >= 0 for efficiency */
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_cpsgn_nonneg_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
+{
+ gmx_simd_ref_pr c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ c.r[i] = (a.r[i] >= 0) ? b.r[i] : -b.r[i];
+ }
+
+ return c;
+}
+
+/* Very specific operation required in the non-bonded kernels */
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_masknot_add_pr(gmx_simd_ref_pb a, gmx_simd_ref_pr b, gmx_simd_ref_pr c)
+{
+ gmx_simd_ref_pr d;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ d.r[i] = a.r[i] ? b.r[i] : b.r[i] + c.r[i];
+ }
+
+ return d;
+}
+
+/* Comparison */
+static gmx_inline gmx_simd_ref_pb
+gmx_simd_ref_cmplt_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
+{
+ gmx_simd_ref_pb c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ c.r[i] = (a.r[i] < b.r[i]);
+ }
+
+ return c;
+}
+
+/* Logical AND on SIMD booleans */
+static gmx_inline gmx_simd_ref_pb
+gmx_simd_ref_and_pb(gmx_simd_ref_pb a, gmx_simd_ref_pb b)
+{
+ gmx_simd_ref_pb c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ c.r[i] = (a.r[i] && b.r[i]);
+ }
+
+ return c;
+}
+
+/* Logical OR on SIMD booleans */
+static gmx_inline gmx_simd_ref_pb
+gmx_simd_ref_or_pb(gmx_simd_ref_pb a, gmx_simd_ref_pb b)
+{
+ gmx_simd_ref_pb c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ c.r[i] = (a.r[i] || b.r[i]);
+ }
+
+ return c;
+}
+
+/* Not required, gmx_anytrue_pb(x) returns whether any of the booleans in x is True.
+ * If this is not present, define GMX_SIMD_IS_TRUE(real x),
+ * which should return x==True, where True is True as defined in SIMD.
+ */
+static gmx_inline int
+gmx_simd_ref_anytrue_pb(gmx_simd_ref_pb a)
+{
+ int anytrue;
+ int i;
+
+ anytrue = 0;
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ if (a.r[i])
+ {
+ anytrue = 1;
+ }
+ }
+
+ return anytrue;
+}
+
+/* If we don't have gmx_anytrue_pb, we need to store gmx_mm_pb */
+static gmx_inline void
+gmx_simd_ref_store_pb(real *dest, gmx_simd_ref_pb src)
+{
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ dest[i] = src.r[i];
+ }
+}
+
+
+/* For topology exclusion pair checking we need: ((a & b) ? True : False)
+ * when we do a bit-wise and between a and b.
+ * When integer SIMD operations are present, we use gmx_checkbitmask_epi32(a, b)
+ * Otherwise we do all operations, except for the set1, in reals.
+ */
+
+/* Integer set and cast are only used for nbnxn exclusion masks */
+static gmx_inline gmx_simd_ref_epi32
+gmx_simd_ref_set1_epi32(int src)
+{
+ gmx_simd_ref_epi32 a;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ a.r[i] = src;
+ }
+
+ return a;
+}
+
+static gmx_inline gmx_simd_ref_epi32
+gmx_simd_ref_load_si(const int *src)
+{
+ gmx_simd_ref_epi32 a;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ a.r[i] = src[i];
+ }
+
+ return a;
+}
+
+/* If the same bit is set in both input masks, return TRUE, else FALSE.
+ * This function is only called with a single bit set in b.
+ */
+static gmx_inline gmx_simd_ref_pb
+gmx_simd_ref_checkbitmask_epi32(gmx_simd_ref_epi32 a, gmx_simd_ref_epi32 b)
+{
+ gmx_simd_ref_pb c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ c.r[i] = ((a.r[i] & b.r[i]) != 0);
+ }
+
+ return c;
+}
+
+
+/* Conversions only used for PME table lookup */
+static gmx_inline gmx_simd_ref_epi32
+gmx_simd_ref_cvttpr_epi32(gmx_simd_ref_pr a)
+{
+ gmx_simd_ref_epi32 b;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ b.r[i] = (int)a.r[i];
+ }
+
+ return b;
+}
+
+/* These two functions only need to be approximate; Newton-Raphson iteration
+ * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr.
+ */
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_rsqrt_pr(gmx_simd_ref_pr a)
+{
+ gmx_simd_ref_pr b;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+#ifdef GMX_DOUBLE
+ b.r[i] = 1.0/sqrt(a.r[i]);
+#else
+ b.r[i] = 1.0/sqrtf(a.r[i]);
+#endif
+ }
+
+ return b;
+}
+
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_rcp_pr(gmx_simd_ref_pr a)
+{
+ gmx_simd_ref_pr b;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ b.r[i] = 1.0/a.r[i];
+ }
+
+ return b;
+}
+
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_exp_pr(gmx_simd_ref_pr a)
+{
+ gmx_simd_ref_pr b;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+#ifdef GMX_DOUBLE
+ b.r[i] = exp(a.r[i]);
+#else
+ b.r[i] = expf(a.r[i]);
+#endif
+ }
+
+ return b;
+}
+
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_sqrt_pr(gmx_simd_ref_pr a)
+{
+ gmx_simd_ref_pr b;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+#ifdef GMX_DOUBLE
+ b.r[i] = sqrt(a.r[i]);
+#else
+ b.r[i] = sqrtf(a.r[i]);
+#endif
+ }
+
+ return b;
+}
+
+static gmx_inline int
+gmx_simd_ref_sincos_pr(gmx_simd_ref_pr a,
+ gmx_simd_ref_pr *s, gmx_simd_ref_pr *c)
+{
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ s->r[i] = sin(a.r[i]);
+ c->r[i] = cos(a.r[i]);
+ }
+
+ return 0;
+}
+
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_acos_pr(gmx_simd_ref_pr a)
+{
+ gmx_simd_ref_pr b;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ b.r[i] = acos(a.r[i]);
+ }
+
+ return b;
+}
+
+static gmx_inline gmx_simd_ref_pr
+gmx_simd_ref_atan2_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
+{
+ gmx_simd_ref_pr c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ c.r[i] = atan2(a.r[i], b.r[i]);
+ }
+
+ return c;
+}
+
+#endif /* _gmx_simd_ref_h_ */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/* The functions in this file are built only from the architecture-independent
+ * SIMD macros, so to support a new architecture, adding macros to
+ * gmx_simd_macros.h should be (nearly) all that is needed.
+ */
+
+/* This file contains vector operation functions using SIMD intrinsics.
+ * gmx_simd_macros.h should be included before including this file.
+ */
+
+#ifndef _gmx_simd_vec_h_
+#define _gmx_simd_vec_h_
+
+#ifndef _gmx_simd_macros_h_
+#error "gmx_simd_macros.h was not included before including gmx_simd_vec.h"
+#endif
+
+
+/* x^2 + y^2 + z^2 */
+static gmx_inline gmx_mm_pr
+gmx_calc_rsq_pr(gmx_mm_pr x, gmx_mm_pr y, gmx_mm_pr z)
+{
+ return gmx_madd_pr(z, z, gmx_madd_pr(y, y, gmx_mul_pr(x, x)));
+}
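+
+/* Illustrative usage sketch, not part of this change: squared and reciprocal
+ * distance from coordinate differences; dx_S, dy_S, dz_S and rinv_S are
+ * hypothetical names and gmx_invsqrt_pr comes from gmx_simd_math_single.h
+ * or gmx_simd_math_double.h:
+ *
+ *     gmx_mm_pr rsq_S  = gmx_calc_rsq_pr(dx_S, dy_S, dz_S);
+ *     gmx_mm_pr rinv_S = gmx_invsqrt_pr(rsq_S);
+ */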
+
+/* inner-product of multiple vectors */
+static gmx_inline gmx_mm_pr
+gmx_iprod_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az,
+ gmx_mm_pr bx, gmx_mm_pr by, gmx_mm_pr bz)
+{
+ gmx_mm_pr ret;
+
+ ret = gmx_mul_pr(ax, bx);
+ ret = gmx_madd_pr(ay, by, ret);
+ ret = gmx_madd_pr(az, bz, ret);
+
+ return ret;
+}
+
+/* norm squared of multiple vectors */
+static gmx_inline gmx_mm_pr
+gmx_norm2_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az)
+{
+ gmx_mm_pr ret;
+
+ ret = gmx_mul_pr(ax, ax);
+ ret = gmx_madd_pr(ay, ay, ret);
+ ret = gmx_madd_pr(az, az, ret);
+
+ return ret;
+}
+
+/* cross-product of multiple vectors */
+static gmx_inline void
+gmx_cprod_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az,
+ gmx_mm_pr bx, gmx_mm_pr by, gmx_mm_pr bz,
+ gmx_mm_pr *cx, gmx_mm_pr *cy, gmx_mm_pr *cz)
+{
+ *cx = gmx_mul_pr(ay, bz);
+ *cx = gmx_nmsub_pr(az, by, *cx);
+
+ *cy = gmx_mul_pr(az, bx);
+ *cy = gmx_nmsub_pr(ax, bz, *cy);
+
+ *cz = gmx_mul_pr(ax, by);
+ *cz = gmx_nmsub_pr(ay, bx, *cz);
+}
+
+/* a + b + c + d (not really a vector operation, but where else to put this?) */
+static gmx_inline gmx_mm_pr
+gmx_sum4_pr(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d)
+{
+ return gmx_add_pr(gmx_add_pr(a, b), gmx_add_pr(c, d));
+}
+
+
+#endif /* _gmx_simd_vec_h_ */
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
- * David van der Spoel, Berk Hess, Erik Lindahl, and including many
- * others, as listed in the AUTHORS file in the top-level source
- * directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef _gmx_x86_simd_double_h_
-#define _gmx_x86_simd_double_h_
-
-/* This file includes the highest possible level of x86 (math) acceleration */
-
-#ifdef GMX_X86_AVX_256
-#include "gmx_x86_avx_256.h"
-#include "gmx_math_x86_avx_256_double.h"
-#else
-#ifdef GMX_X86_AVX_128_FMA
-#include "gmx_x86_avx_128_fma.h"
-#include "gmx_math_x86_avx_128_fma_double.h"
-#else
-#ifdef GMX_X86_SSE4_1
-#include "gmx_x86_sse4_1.h"
-#include "gmx_math_x86_sse4_1_double.h"
-#else
-#ifdef GMX_X86_SSE2
-#include "gmx_x86_sse2.h"
-#include "gmx_math_x86_sse2_double.h"
-#else
-#error No x86 acceleration defined
-#endif
-#endif
-#endif
-#endif
-
-static inline __m128d
-gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
-{
- return _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx, dx), _mm_mul_pd(dy, dy) ), _mm_mul_pd(dz, dz) );
-}
-
-/* Normal sum of four __m128d registers */
-#define gmx_mm_sum4_pd(t0, t1, t2, t3) _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))
-
-#ifdef GMX_X86_AVX_256
-
-static inline __m256d
-gmx_mm256_calc_rsq_pd(__m256d dx, __m256d dy, __m256d dz)
-{
- return _mm256_add_pd( _mm256_add_pd( _mm256_mul_pd(dx, dx), _mm256_mul_pd(dy, dy) ), _mm256_mul_pd(dz, dz) );
-}
-
-/* Normal sum of four xmm registers */
-#define gmx_mm256_sum4_pd(t0, t1, t2, t3) _mm256_add_pd(_mm256_add_pd(t0, t1), _mm256_add_pd(t2, t3))
-
-#endif
-
-#endif /* _gmx_x86_simd_double_h_ */
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
- * David van der Spoel, Berk Hess, Erik Lindahl, and including many
- * others, as listed in the AUTHORS file in the top-level source
- * directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef _gmx_x86_simd256_single_h_
-#define _gmx_x86_simd256_single_h_
-
-/* This file includes the highest possible level of x86 (math) acceleration */
-
-#ifdef GMX_X86_AVX_256
-#include "gmx_x86_avx_256.h"
-#include "gmx_math_x86_avx_256_single.h"
-#else
-#ifdef GMX_X86_AVX_128_FMA
-#include "gmx_x86_avx_128_fma.h"
-#include "gmx_math_x86_avx_128_fma_single.h"
-#else
-#ifdef GMX_X86_SSE4_1
-#include "gmx_x86_sse4_1.h"
-#include "gmx_math_x86_sse4_1_single.h"
-#else
-#ifdef GMX_X86_SSE2
-#include "gmx_x86_sse2.h"
-#include "gmx_math_x86_sse2_single.h"
-#else
-#error No x86 acceleration defined
-#endif
-#endif
-#endif
-#endif
-
-
-static inline __m128
-gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
-{
- return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx, dx), _mm_mul_ps(dy, dy) ), _mm_mul_ps(dz, dz) );
-}
-
-/* Normal sum of four __m128 registers */
-#define gmx_mm_sum4_ps(t0, t1, t2, t3) _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))
-
-#ifdef GMX_X86_AVX_256
-
-static inline __m256
-gmx_mm256_calc_rsq_ps(__m256 dx, __m256 dy, __m256 dz)
-{
- return _mm256_add_ps( _mm256_add_ps( _mm256_mul_ps(dx, dx), _mm256_mul_ps(dy, dy) ), _mm256_mul_ps(dz, dz) );
-}
-
-/* Normal sum of four __m256 registers */
-#define gmx_mm256_sum4_ps(t0, t1, t2, t3) _mm256_add_ps(_mm256_add_ps(t0, t1), _mm256_add_ps(t2, t3))
-
-#endif
-
-#endif /* _gmx_x86_simd256_single_h_ */
extern "C" {
#endif
+
+/* For testing the reference plain-C SIMD kernels, uncomment the next lines,
+ * as well as the GMX_SIMD_REFERENCE_PLAIN_C define in gmx_simd_macros.h
+ * The actual SIMD width is set in gmx_simd_macros.h
+ * The 4xN reference kernels support 2-, 4- and 8-way SIMD.
+ * The 2x(N+N) reference kernels support 8- and 16-way SIMD.
+ */
+/* #define GMX_NBNXN_SIMD */
+/* #define GMX_NBNXN_SIMD_4XN */
+/* #define GMX_NBNXN_SIMD_2XNN */
+
+
#ifdef GMX_X86_SSE2
/* Use SIMD accelerated nbnxn search and kernels */
#define GMX_NBNXN_SIMD
/* Uncomment the next line to use, slower, 128-bit SIMD with AVX-256 */
/* #define GMX_NBNXN_HALF_WIDTH_SIMD */
-#if defined GMX_X86_AVX_256 && !defined GMX_NBNXN_HALF_WIDTH_SIMD
-#define GMX_NBNXN_SIMD_BITWIDTH 256
-#else
-#define GMX_NBNXN_SIMD_BITWIDTH 128
-#endif
-
/* The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
* Currently the 2xNN SIMD kernels only make sense with:
* 8-way SIMD: 4x4 setup, works with AVX-256 in single precision
* 16-way SIMD: 4x8 setup, not used, but most of the kernel code is there
*/
#define GMX_NBNXN_SIMD_4XN
-#if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
+#if defined GMX_X86_AVX_256 && !(defined GMX_DOUBLE || defined GMX_NBNXN_HALF_WIDTH_SIMD)
#define GMX_NBNXN_SIMD_2XNN
#endif
*/
typedef void nbnxn_free_t (void *ptr);
+/* This is the actual cluster-pair list j-entry.
+ * cj is the j-cluster.
+ * The interaction bits in excl are indexed i-major, j-minor.
+ * The cj entries are sorted such that ones with exclusions come first.
+ * This means that once a full mask (=NBNXN_INTERACTION_MASK_ALL)
+ * is found, all subsequent j-entries in the i-entry also have full masks.
+ */
typedef struct {
- int cj; /* The j-cluster */
- unsigned excl; /* The exclusion (interaction) bits */
+ int cj; /* The j-cluster */
+ unsigned excl; /* The topology exclusion (interaction) bits */
} nbnxn_cj_t;
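For illustration, here is a minimal plain-C sketch of how the kernels exploit this ordering: a masked loop over the leading entries with exclusions, followed by a cheaper unmasked loop. The two-loop pattern and NBNXN_INTERACTION_MASK_ALL match the kernel files in this change; the struct copy and the process_* helpers are hypothetical stand-ins.

    /* Sketch only: mirrors the cj-list definitions above for self-containment */
    #define NBNXN_INTERACTION_MASK_ALL 0xffffffffU

    typedef struct {
        int      cj;   /* the j-cluster index                       */
        unsigned excl; /* interaction (non-exclusion) bits, i-major */
    } nbnxn_cj_sketch_t;

    /* Hypothetical stand-ins for the real kernel inner loops */
    static void process_masked(const nbnxn_cj_sketch_t *e)   { (void)e; }
    static void process_unmasked(const nbnxn_cj_sketch_t *e) { (void)e; }

    static void run_ci_entry(const nbnxn_cj_sketch_t *cj, int cjind0, int cjind1)
    {
        int cjind = cjind0;

        /* Entries with exclusions come first: loop with exclusion masking ... */
        while (cjind < cjind1 && cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
        {
            process_masked(&cj[cjind++]);
        }
        /* ... all remaining entries are then guaranteed to have full masks */
        for (; cjind < cjind1; cjind++)
        {
            process_unmasked(&cj[cjind]);
        }
    }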
/* In nbnxn_ci_t the integer shift contains the shift in the lower 7 bits.
} nbnxn_cj4_t;
typedef struct {
- unsigned pair[32]; /* Exclusion bits for one warp, *
- * each unsigned has bit for 4*8 i clusters */
+ unsigned pair[32]; /* Topology exclusion interaction bits for one warp,
+ * each unsigned has bits for 4*8 i-clusters
+ */
} nbnxn_excl_t;
typedef struct {
int xstride; /* stride for a coordinate in x (usually 3 or 4) */
int fstride; /* stride for a coordinate in f (usually 3 or 4) */
real *x; /* x and possibly q, size natoms*xstride */
- real *simd_4xn_diag; /* indices to set the SIMD 4xN diagonal masks */
- real *simd_2xnn_diag; /* indices to set the SIMD 2x(N+N)diagonal masks */
- unsigned *simd_excl_mask; /* exclusion masks for SIMD topology exclusions */
+
+ /* j-atom minus i-atom index for generating the self- and Newton-exclusion
+ * masks for the diagonal cluster-cluster pairs, used by the 4xn and 2xnn kernels.
+ */
+ real *simd_4xn_diagonal_j_minus_i;
+ real *simd_2xnn_diagonal_j_minus_i;
+ /* Filters for topology exclusion masks for the SIMD kernels.
+ * filter2 is the same as filter1, but with each element duplicated.
+ */
+ unsigned *simd_exclusion_filter1;
+ unsigned *simd_exclusion_filter2;
+
int nout; /* The number of force arrays */
nbnxn_atomdata_output_t *out; /* Output data structures */
int nalloc; /* Allocation size of all arrays (for x/f *x/fstride) */
#include "force.h"
#include "nonbonded.h"
-#ifdef GMX_X86_SSE2
-#define SIMD_BONDEDS
-
+/* Include the SIMD macro file and then check for support */
#include "gmx_simd_macros.h"
+#if defined GMX_HAVE_SIMD_MACROS && defined GMX_SIMD_HAVE_TRIGONOMETRIC
+#define SIMD_BONDEDS
+#include "gmx_simd_vec.h"
#endif
/* Find a better place for this? */
#ifdef SIMD_BONDEDS
-/* Below are 3 SIMD vector operations.
- * Currently these are only used here, but they should be moved to
- * a general SIMD include file when used elsewhere.
- */
-
-/* SIMD inner-product of multiple vectors */
-static gmx_inline gmx_mm_pr
-gmx_iprod_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az,
- gmx_mm_pr bx, gmx_mm_pr by, gmx_mm_pr bz)
-{
- gmx_mm_pr ret;
-
- ret = gmx_mul_pr(ax, bx);
- ret = gmx_madd_pr(ay, by, ret);
- ret = gmx_madd_pr(az, bz, ret);
-
- return ret;
-}
-
-/* SIMD norm squared of multiple vectors */
-static gmx_inline gmx_mm_pr
-gmx_norm2_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az)
-{
- gmx_mm_pr ret;
-
- ret = gmx_mul_pr(ax, ax);
- ret = gmx_madd_pr(ay, ay, ret);
- ret = gmx_madd_pr(az, az, ret);
-
- return ret;
-}
-
-/* SIMD cross-product of multiple vectors */
-static gmx_inline void
-gmx_cprod_pr(gmx_mm_pr ax, gmx_mm_pr ay, gmx_mm_pr az,
- gmx_mm_pr bx, gmx_mm_pr by, gmx_mm_pr bz,
- gmx_mm_pr *cx, gmx_mm_pr *cy, gmx_mm_pr *cz)
-{
- *cx = gmx_mul_pr(ay, bz);
- *cx = gmx_nmsub_pr(az, by, *cx);
-
- *cy = gmx_mul_pr(az, bx);
- *cy = gmx_nmsub_pr(ax, bz, *cy);
-
- *cz = gmx_mul_pr(ax, by);
- *cz = gmx_nmsub_pr(ay, bx, *cz);
-}
-
/* SIMD PBC data structure, containing 1/boxdiag and the box vectors */
typedef struct {
gmx_mm_pr inv_bzz;
gmx_mm_pr nrkj2_S, nrkj_1_S, nrkj_2_S, nrkj_S;
gmx_mm_pr p_S, q_S;
gmx_mm_pr fmin_S = gmx_set1_pr(GMX_FLOAT_MIN);
- /* Using -0.0 should lead to only the sign bit being set */
- gmx_mm_pr sign_mask_S = gmx_set1_pr(-0.0);
for (s = 0; s < UNROLL; s++)
{
*nrkj_m2_S = gmx_mul_pr(nrkj_S, gmx_inv_pr(iprm_S));
*nrkj_n2_S = gmx_mul_pr(nrkj_S, gmx_inv_pr(iprn_S));
- /* Set sign of the angle with the sign of ipr_S.
- * Since phi is currently positive, we can use OR instead of XOR.
- */
- *phi_S = gmx_or_pr(*phi_S, gmx_and_pr(ipr_S, sign_mask_S));
+ /* Set sign of phi_S with the sign of ipr_S; phi_S is currently positive */
+ *phi_S = gmx_cpsgn_nonneg_pr(ipr_S, *phi_S);
p_S = gmx_iprod_pr(rijx_S, rijy_S, rijz_S,
rkjx_S, rkjy_S, rkjz_S);
#include "calc_verletbuf.h"
#include "../mdlib/nbnxn_consts.h"
+#ifdef GMX_NBNXN_SIMD
+/* The include below sets the SIMD instruction type (precision+width)
+ * for all nbnxn SIMD search and non-bonded kernel code.
+ */
+#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
+#define GMX_USE_HALF_WIDTH_SIMD_HERE
+#endif
+#include "gmx_simd_macros.h"
+#endif
+
/* Struct for unique atom type for calculating the energy drift.
* The atom displacement depends on mass and constraints.
* The energy jump for given distance depend on LJ type and q.
#ifndef GMX_NBNXN_SIMD
list_setup->cluster_size_j = NBNXN_CPU_CLUSTER_I_SIZE;
#else
- list_setup->cluster_size_j = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
+ list_setup->cluster_size_j = GMX_SIMD_WIDTH_HERE;
#ifdef GMX_NBNXN_SIMD_2XNN
/* We assume the smallest cluster size to be on the safe side */
list_setup->cluster_size_j /= 2;
const char *returnvalue = NULL;
switch (kernel_type)
{
- case nbnxnkNotSet: returnvalue = "not set"; break;
- case nbnxnk4x4_PlainC: returnvalue = "plain C"; break;
-#ifndef GMX_NBNXN_SIMD
- case nbnxnk4xN_SIMD_4xN: returnvalue = "not available"; break;
- case nbnxnk4xN_SIMD_2xNN: returnvalue = "not available"; break;
-#else
+ case nbnxnkNotSet:
+ returnvalue = "not set";
+ break;
+ case nbnxnk4x4_PlainC:
+ returnvalue = "plain C";
+ break;
+ case nbnxnk4xN_SIMD_4xN:
+ case nbnxnk4xN_SIMD_2xNN:
+#ifdef GMX_NBNXN_SIMD
#ifdef GMX_X86_SSE2
-#if GMX_NBNXN_SIMD_BITWIDTH == 128
- /* x86 SIMD intrinsics can be converted to either SSE or AVX depending
- * on compiler flags. As we use nearly identical intrinsics, using an AVX
- * compiler flag without an AVX macro effectively results in AVX kernels.
+ /* We have x86 SSE2 compatible SIMD */
+#ifdef GMX_X86_AVX_128_FMA
+ returnvalue = "AVX-128-FMA";
+#else
+#if defined GMX_X86_AVX_256 || defined __AVX__
+ /* x86 SIMD intrinsics can be converted to SSE or AVX depending
+ * on compiler flags. As we use nearly identical intrinsics,
+ * compiling for AVX without an AVX macro effectively results
+ * in AVX kernels.
* For gcc we check for __AVX__
* At least a check for icc should be added (if there is a macro)
*/
-#if !(defined GMX_X86_AVX_128_FMA || defined __AVX__)
-#ifndef GMX_X86_SSE4_1
- case nbnxnk4xN_SIMD_4xN: returnvalue = "SSE2"; break;
- case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE2"; break;
+#if defined GMX_X86_AVX_256 && !defined GMX_NBNXN_HALF_WIDTH_SIMD
+ returnvalue = "AVX-256";
#else
- case nbnxnk4xN_SIMD_4xN: returnvalue = "SSE4.1"; break;
- case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE4.1"; break;
+ returnvalue = "AVX-128";
#endif
#else
- case nbnxnk4xN_SIMD_4xN: returnvalue = "AVX-128"; break;
- case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-128"; break;
-#endif
-#endif
-#if GMX_NBNXN_SIMD_BITWIDTH == 256
- case nbnxnk4xN_SIMD_4xN: returnvalue = "AVX-256"; break;
- case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-256"; break;
+#ifdef GMX_X86_SSE4_1
+ returnvalue = "SSE4.1";
+#else
+ returnvalue = "SSE2";
#endif
-#else /* not GMX_X86_SSE2 */
- case nbnxnk4xN_SIMD_4xN: returnvalue = "SIMD"; break;
- case nbnxnk4xN_SIMD_2xNN: returnvalue = "SIMD"; break;
#endif
#endif
+#else /* GMX_X86_SSE2 */
+ /* not GMX_X86_SSE2, but other SIMD */
+ returnvalue = "SIMD";
+#endif /* GMX_X86_SSE2 */
+#else /* GMX_NBNXN_SIMD */
+ returnvalue = "not available";
+#endif /* GMX_NBNXN_SIMD */
+ break;
case nbnxnk8x8x8_CUDA: returnvalue = "CUDA"; break;
case nbnxnk8x8x8_PlainC: returnvalue = "plain C"; break;
#ifdef GMX_NBNXN_SIMD
if (simple)
{
- /* Set the diagonal cluster pair exclusion mask setup data.
+ /* Set the diagonal cluster pair interaction mask setup data.
* In the kernel we check 0 < j - i to generate the masks.
- * Here we store j - i for generating the mask for the first i,
+ * Here we store j - i for generating the mask for the first i (i=0);
+ * we subtract 0.5 to avoid rounding issues.
- * In the kernel we can subtract 1 to generate the subsequent mask.
+ * In the kernel we can subtract 1 to generate the mask for the next i.
*/
- const int simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
- int simd_4xn_diag_size, real_excl, simd_excl_size, j, s;
+ const int simd_width = GMX_SIMD_WIDTH_HERE;
+ int simd_4xn_diag_ind_size, simd_interaction_size, j;
- simd_4xn_diag_size = max(NBNXN_CPU_CLUSTER_I_SIZE, simd_width);
- snew_aligned(nbat->simd_4xn_diag, simd_4xn_diag_size, NBNXN_MEM_ALIGN);
- for (j = 0; j < simd_4xn_diag_size; j++)
+ simd_4xn_diag_ind_size = max(NBNXN_CPU_CLUSTER_I_SIZE, simd_width);
+ snew_aligned(nbat->simd_4xn_diagonal_j_minus_i,
+ simd_4xn_diag_ind_size, NBNXN_MEM_ALIGN);
+ for (j = 0; j < simd_4xn_diag_ind_size; j++)
{
- nbat->simd_4xn_diag[j] = j - 0.5;
+ nbat->simd_4xn_diagonal_j_minus_i[j] = j - 0.5;
}
- snew_aligned(nbat->simd_2xnn_diag, simd_width, NBNXN_MEM_ALIGN);
+ snew_aligned(nbat->simd_2xnn_diagonal_j_minus_i,
+ simd_width, NBNXN_MEM_ALIGN);
for (j = 0; j < simd_width/2; j++)
{
/* The j-cluster size is half the SIMD width */
- nbat->simd_2xnn_diag[j] = j - 0.5;
+ nbat->simd_2xnn_diagonal_j_minus_i[j] = j - 0.5;
/* The next half of the SIMD width is for i + 1 */
- nbat->simd_2xnn_diag[simd_width/2+j] = j - 1 - 0.5;
+ nbat->simd_2xnn_diagonal_j_minus_i[simd_width/2+j] = j - 1 - 0.5;
}
- /* We always use 32-bit integer exclusion masks. When we use
- * double precision, we fit two integers in a double SIMD register.
+ /* We use up to 32 bits for exclusion masking.
+ * The same masks are used for the 4xN and 2x(N+N) kernels.
+ * The masks are read either into epi32 SIMD registers or into
+ * real SIMD registers (together with a cast).
+ * In single precision the real and epi32 SIMD registers have equal size.
+ * In double precision the epi32 registers can be smaller than the real
+ * registers, so depending on the architecture, we might need to use two
+ * identical 32-bit masks per real.
*/
- real_excl = sizeof(real)/sizeof(*nbat->simd_excl_mask);
- /* Set bits for use with both 4xN and 2x(N+N) kernels */
- simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*simd_width*real_excl;
- snew_aligned(nbat->simd_excl_mask, simd_excl_size*real_excl, NBNXN_MEM_ALIGN);
- for (j = 0; j < simd_excl_size; j++)
+ simd_interaction_size = NBNXN_CPU_CLUSTER_I_SIZE*simd_width;
+ snew_aligned(nbat->simd_exclusion_filter1, simd_interaction_size, NBNXN_MEM_ALIGN);
+ snew_aligned(nbat->simd_exclusion_filter2, simd_interaction_size*2, NBNXN_MEM_ALIGN);
+
+ for (j = 0; j < simd_interaction_size; j++)
{
- /* Set the consecutive bits for masking pair exclusions.
- * For double a single-bit mask would be enough.
- * But using two bits avoids endianness issues.
- */
- for (s = 0; s < real_excl; s++)
- {
- /* Set the consecutive bits for masking pair exclusions */
- nbat->simd_excl_mask[j*real_excl + s] = (1U << j);
- }
+ /* Set the consecutive bits for the pair-exclusion filter masks */
+ nbat->simd_exclusion_filter1[j] = (1U << j);
+ nbat->simd_exclusion_filter2[j*2 + 0] = (1U << j);
+ nbat->simd_exclusion_filter2[j*2 + 1] = (1U << j);
}
}
#endif
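As an aside, a small self-contained sketch of how the exclusion-filter data set up above is meant to be consumed: the excl word from the pair list is broadcast and ANDed with the per-lane filter bit, giving a non-zero value exactly for interacting (non-excluded) pairs, which is what gmx_checkbitmask_epi32/_pr compute in the kernels. The SIMD width, the example excl value and the printing below are illustrative only.

    #include <stdio.h>

    int main(void)
    {
        const int width = 4;          /* e.g. the 4xN kernel with 4-wide SIMD      */
        unsigned  filter[4*4];        /* same layout as simd_exclusion_filter1     */
        unsigned  excl = 0xfffffff0U; /* example: bits 0-3 clear, so the pairs of
                                         i-row 0 with j = 0..3 are excluded        */
        int       i, j;

        for (j = 0; j < 4*width; j++)
        {
            filter[j] = (1U << j);    /* one bit per (i-row, lane) pair            */
        }

        for (i = 0; i < 4; i++)       /* the four i-rows of the kernel             */
        {
            for (j = 0; j < width; j++)
            {
                int interact = ((excl & filter[i*width + j]) != 0);

                printf("i=%d j=%d interact=%d\n", i, j, interact);
            }
        }

        return 0;
    }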
/* The SIMD width here is actually independent of that in the kernels,
* but we use the same width for simplicity (usually optimal anyhow).
*/
-#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
-#define GMX_USE_HALF_WIDTH_SIMD_HERE
-#endif
-#include "gmx_simd_macros.h"
-
int i, s;
gmx_mm_pr dest_SSE, src_SSE;
#define X8_IND_A(a) (STRIDE_P8*((a) >> 3) + ((a) & (PACK_X8 - 1)))
+/* Cluster-pair interaction masks for the 4xN and 2xNN kernels.
+ * Bit i*CJ_SIZE + j tells whether atoms i and j interact.
+ */
+/* The all-interaction mask is the same for all kernels */
+#define NBNXN_INTERACTION_MASK_ALL 0xffffffff
+/* 4x4 kernel diagonal mask */
+#define NBNXN_INTERACTION_MASK_DIAG 0x08ce
+/* 4x2 kernel diagonal masks */
+#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002
+#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002F
+/* 4x8 kernel diagonal masks */
+#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfe
+#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0
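For reference, the diagonal constants follow directly from the bit layout: keeping only j > i for a self cluster pair (so each pair is counted once and self interactions are dropped) reproduces 0x08ce for the 4x4 case. A small sketch, for verification only:

    #include <assert.h>

    static unsigned diag_mask_4x4(void)
    {
        unsigned mask = 0;
        int      i, j;

        for (i = 0; i < 4; i++)
        {
            for (j = i + 1; j < 4; j++)   /* keep only j > i */
            {
                mask |= (1U << (i*4 + j));
            }
        }

        return mask;
    }

    /* usage: assert(diag_mask_4x4() == NBNXN_INTERACTION_MASK_DIAG); */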
+
+
#ifdef __cplusplus
}
#endif
#include "domdec.h"
#include "gmx_cyclecounter.h"
+#ifdef GMX_NBNXN_SIMD
+/* The include below sets the SIMD instruction type (precision+width)
+ * for all nbnxn SIMD search and non-bonded kernel code.
+ */
+#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
+#define GMX_USE_HALF_WIDTH_SIMD_HERE
+#endif
+#include "gmx_simd_macros.h"
+#endif
+
#ifdef __cplusplus
extern "C" {
#endif
#ifdef GMX_NBNXN_SIMD
/* Memory alignment in bytes as required by SIMD aligned loads/stores */
-#define NBNXN_MEM_ALIGN (GMX_NBNXN_SIMD_BITWIDTH/8)
+#define NBNXN_MEM_ALIGN (GMX_SIMD_WIDTH_HERE*sizeof(real))
#else
/* No alignment required, but set it so we can call the same routines */
#define NBNXN_MEM_ALIGN 32
} nbnxn_grid_t;
#ifdef GMX_NBNXN_SIMD
-#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
-#define GMX_USE_HALF_WIDTH_SIMD_HERE
-#endif
-#include "gmx_simd_macros.h"
typedef struct nbnxn_x_ci_simd_4xn {
/* The i-cluster coordinates for simple search */
#ifdef GMX_NBNXN_SIMD_2XNN
-#include "nbnxn_kernel_simd_2xnn.h"
+/* Include the full width SIMD macros */
+#include "gmx_simd_macros.h"
+#include "gmx_simd_vec.h"
-/* Include all flavors of the SSE or AVX 2x(N+N) kernel loops */
+#include "nbnxn_kernel_simd_2xnn.h"
-#if GMX_NBNXN_SIMD_BITWIDTH == 128
-#define GMX_MM128_HERE
-#else
-#if GMX_NBNXN_SIMD_BITWIDTH == 256
-#define GMX_MM256_HERE
-#else
-#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
-#endif
+#if !(GMX_SIMD_WIDTH_HERE == 8 || GMX_SIMD_WIDTH_HERE == 16)
+#error "unsupported SIMD width"
#endif
+
+/* Include all flavors of the SSE or AVX 2x(N+N) kernel loops */
+
/* Analytical reaction-field kernels */
#define CALC_COUL_RF
const real *VSvdw, const real *VSc,
real *Vvdw, real *Vc)
{
- const int simd_width = GMX_SIMD_WIDTH_HERE;
- const int unrollj_half = GMX_SIMD_WIDTH_HERE/4;
+ const int unrollj = GMX_SIMD_WIDTH_HERE/2;
+ const int unrollj_half = unrollj/2;
int ng_p2, i, j, j0, j1, c, s;
ng_p2 = (1<<ng_2log);
{
for (j0 = 0; j0 < ng; j0++)
{
- c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*simd_width/2;
+ c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*unrollj;
for (s = 0; s < unrollj_half; s++)
{
Vvdw[i*ng+j0] += VSvdw[c+0];
Vvdw[i*ng+j1] += VSvdw[c+1];
Vc [i*ng+j0] += VSc [c+0];
Vc [i*ng+j1] += VSc [c+1];
- c += simd_width/2 + 2;
+ c += unrollj + 2;
}
}
}
/* Without exclusions and energies we only need to mask the cut-off,
* this can be faster with blendv.
*/
-#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_HAVE_SIMD_BLENDV && !defined COUNT_PAIRS
+#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_SIMD_HAVE_BLENDV && !defined COUNT_PAIRS
/* With RF and tabulated Coulomb we replace cmp+and with sub+blendv.
* With gcc this is slower, except for RF on Sandy Bridge.
* Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
#ifdef CHECK_EXCLS
/* Interaction (non-exclusion) mask of all 1's or 0's */
- gmx_mm_pr int_S0;
- gmx_mm_pr int_S2;
+ gmx_mm_pb interact_S0;
+ gmx_mm_pb interact_S2;
#endif
gmx_mm_pr jx_S, jy_S, jz_S;
gmx_mm_pr rsq_S2, rinv_S2, rinvsq_S2;
#ifndef CUTOFF_BLENDV
/* wco: within cut-off, mask of all 1's or 0's */
- gmx_mm_pr wco_S0;
- gmx_mm_pr wco_S2;
+ gmx_mm_pb wco_S0;
+ gmx_mm_pb wco_S2;
#endif
#ifdef VDW_CUTOFF_CHECK
- gmx_mm_pr wco_vdw_S0;
+ gmx_mm_pb wco_vdw_S0;
#ifndef HALF_LJ
- gmx_mm_pr wco_vdw_S2;
+ gmx_mm_pb wco_vdw_S2;
#endif
#endif
#ifdef CALC_COULOMB
ajz = ajy + STRIDE;
#ifdef CHECK_EXCLS
+#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
{
- /* Load integer interaction mask */
+ /* Load integer topology exclusion interaction mask */
+ gmx_epi32 mask_pr_S = gmx_set1_epi32(l_cj[cjind].excl);
+
+ interact_S0 = gmx_checkbitmask_epi32(mask_pr_S, filter_S0);
+ interact_S2 = gmx_checkbitmask_epi32(mask_pr_S, filter_S2);
+ }
+#else
+#ifdef GMX_SIMD_HAVE_CHECKBITMASK_PR
+ {
+ /* Set the integer mask, cast it to real and use real mask operations */
gmx_mm_pr mask_pr_S = gmx_castsi_pr(gmx_set1_epi32(l_cj[cjind].excl));
- int_S0 = gmx_checkbitmask_pr(mask_pr_S, mask_S0);
- int_S2 = gmx_checkbitmask_pr(mask_pr_S, mask_S2);
+ interact_S0 = gmx_checkbitmask_pr(mask_pr_S, filter_S0);
+ interact_S2 = gmx_checkbitmask_pr(mask_pr_S, filter_S2);
}
+#else
+#error "No SIMD bitmask operation available"
+#endif
#endif
+#endif /* CHECK_EXCLS */
/* load j atom coordinates */
- gmx_loaddh_pr(jx_S, x+ajx);
- gmx_loaddh_pr(jy_S, x+ajy);
- gmx_loaddh_pr(jz_S, x+ajz);
+ gmx_loaddh_pr(&jx_S, x+ajx);
+ gmx_loaddh_pr(&jy_S, x+ajy);
+ gmx_loaddh_pr(&jz_S, x+ajz);
/* Calculate distance */
dx_S0 = gmx_sub_pr(ix_S0, jx_S);
#if UNROLLJ == UNROLLI
if (cj == ci_sh)
{
- wco_S0 = gmx_and_pr(wco_S0, diag_S0);
- wco_S2 = gmx_and_pr(wco_S2, diag_S2);
+ wco_S0 = gmx_and_pb(wco_S0, diagonal_mask_S0);
+ wco_S2 = gmx_and_pb(wco_S2, diagonal_mask_S2);
}
#else
#if UNROLLJ == 2*UNROLLI
if (cj*2 == ci_sh)
{
- wco_S0 = gmx_and_pr(wco_S0, diag0_S0);
- wco_S2 = gmx_and_pr(wco_S2, diag0_S2);
+ wco_S0 = gmx_and_pb(wco_S0, diagonal_mask0_S0);
+ wco_S2 = gmx_and_pb(wco_S2, diagonal_mask0_S2);
}
else if (cj*2 + 1 == ci_sh)
{
- wco_S0 = gmx_and_pr(wco_S0, diag1_S0);
- wco_S2 = gmx_and_pr(wco_S2, diag1_S2);
+ wco_S0 = gmx_and_pb(wco_S0, diagonal_mask1_S0);
+ wco_S2 = gmx_and_pb(wco_S2, diagonal_mask1_S2);
}
#else
#error "only UNROLLJ == UNROLLI*(1 or 2) currently supported in 2xnn kernels"
#endif
#else /* EXCL_FORCES */
/* No exclusion forces: remove all excluded atom pairs from the list */
- wco_S0 = gmx_and_pr(wco_S0, int_S0);
- wco_S2 = gmx_and_pr(wco_S2, int_S2);
+ wco_S0 = gmx_and_pb(wco_S0, interact_S0);
+ wco_S2 = gmx_and_pb(wco_S2, interact_S2);
#endif
#endif
#ifdef CHECK_EXCLS
/* For excluded pairs add a small number to avoid r^-6 = NaN */
- rsq_S0 = gmx_add_pr(rsq_S0, gmx_andnot_pr(int_S0, avoid_sing_S));
- rsq_S2 = gmx_add_pr(rsq_S2, gmx_andnot_pr(int_S2, avoid_sing_S));
+ rsq_S0 = gmx_masknot_add_pr(interact_S0, rsq_S0, avoid_sing_S);
+ rsq_S2 = gmx_masknot_add_pr(interact_S2, rsq_S2, avoid_sing_S);
#endif
/* Calculate 1/r */
#ifdef CALC_COULOMB
/* Load parameters for j atom */
- gmx_loaddh_pr(jq_S, q+aj);
+ gmx_loaddh_pr(&jq_S, q+aj);
qq_S0 = gmx_mul_pr(iq_S0, jq_S);
qq_S2 = gmx_mul_pr(iq_S2, jq_S);
#endif
#ifdef CALC_LJ
#if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
- load_lj_pair_params2(nbfp0, nbfp1, type, aj, c6_S0, c12_S0);
+ load_lj_pair_params2(nbfp0, nbfp1, type, aj, &c6_S0, &c12_S0);
#ifndef HALF_LJ
- load_lj_pair_params2(nbfp2, nbfp3, type, aj, c6_S2, c12_S2);
+ load_lj_pair_params2(nbfp2, nbfp3, type, aj, &c6_S2, &c12_S2);
#endif
#endif /* not defined any LJ rule */
#ifdef LJ_COMB_GEOM
- gmx_loaddh_pr(c6s_j_S, ljc+aj2+0);
- gmx_loaddh_pr(c12s_j_S, ljc+aj2+STRIDE);
+ gmx_loaddh_pr(&c6s_j_S, ljc+aj2+0);
+ gmx_loaddh_pr(&c12s_j_S, ljc+aj2+STRIDE);
c6_S0 = gmx_mul_pr(c6s_S0, c6s_j_S );
#ifndef HALF_LJ
c6_S2 = gmx_mul_pr(c6s_S2, c6s_j_S );
#endif /* LJ_COMB_GEOM */
#ifdef LJ_COMB_LB
- gmx_loaddh_pr(hsig_j_S, ljc+aj2+0);
- gmx_loaddh_pr(seps_j_S, ljc+aj2+STRIDE);
+ gmx_loaddh_pr(&hsig_j_S, ljc+aj2+0);
+ gmx_loaddh_pr(&seps_j_S, ljc+aj2+STRIDE);
sig_S0 = gmx_add_pr(hsig_i_S0, hsig_j_S);
eps_S0 = gmx_mul_pr(seps_i_S0, seps_j_S);
#ifdef EXCL_FORCES
/* Only add 1/r for non-excluded atom pairs */
- rinv_ex_S0 = gmx_blendzero_pr(rinv_S0, int_S0);
- rinv_ex_S2 = gmx_blendzero_pr(rinv_S2, int_S2);
+ rinv_ex_S0 = gmx_blendzero_pr(rinv_S0, interact_S0);
+ rinv_ex_S2 = gmx_blendzero_pr(rinv_S2, interact_S2);
#else
/* No exclusion forces, we always need 1/r */
#define rinv_ex_S0 rinv_S0
#ifdef CALC_COUL_RF
/* Electrostatic interactions */
- frcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_mul_pr(rsq_S0, mrc_3_S)));
- frcoul_S2 = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_mul_pr(rsq_S2, mrc_3_S)));
+ frcoul_S0 = gmx_mul_pr(qq_S0, gmx_madd_pr(rsq_S0, mrc_3_S, rinv_ex_S0));
+ frcoul_S2 = gmx_mul_pr(qq_S2, gmx_madd_pr(rsq_S2, mrc_3_S, rinv_ex_S2));
#ifdef CALC_ENERGIES
vcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_add_pr(gmx_mul_pr(rsq_S0, hrc_3_S), moh_rc_S)));
#endif
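The change above relies only on gmx_madd_pr(a, b, c) computing a*b + c; a scalar illustration of the rewrite (a hypothetical helper using the standard C99 fma):

    #include <math.h>

    /* Scalar analogue of the reaction-field force rewrite above:
     *   old:  qq * (rinv_ex + rsq*mrc_3)        separate mul + add
     *   new:  qq * fma(rsq, mrc_3, rinv_ex)     one fused multiply-add
     * Algebraically identical; the fused form rounds once and maps to a
     * single FMA instruction on hardware that provides one.
     */
    static double frcoul_rf_sketch(double qq, double rinv_ex, double rsq, double mrc_3)
    {
        return qq*fma(rsq, mrc_3, rinv_ex);
    }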
ewcorr_S0 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S0), beta_S);
ewcorr_S2 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S2), beta_S);
- frcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_mul_pr(ewcorr_S0, brsq_S0)));
- frcoul_S2 = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_mul_pr(ewcorr_S2, brsq_S2)));
+ frcoul_S0 = gmx_mul_pr(qq_S0, gmx_madd_pr(ewcorr_S0, brsq_S0, rinv_ex_S0));
+ frcoul_S2 = gmx_mul_pr(qq_S2, gmx_madd_pr(ewcorr_S2, brsq_S2, rinv_ex_S2));
#ifdef CALC_ENERGIES
vc_sub_S0 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S0), beta_S);
/* Truncate scaled r to an int */
ti_S0 = gmx_cvttpr_epi32(rs_S0);
ti_S2 = gmx_cvttpr_epi32(rs_S2);
-#ifdef GMX_HAVE_SIMD_FLOOR
+#ifdef GMX_SIMD_HAVE_FLOOR
rf_S0 = gmx_floor_pr(rs_S0);
rf_S2 = gmx_floor_pr(rs_S2);
#else
* Currently single precision uses FDV0, double F and V.
*/
#ifndef CALC_ENERGIES
- load_table_f(tab_coul_F, ti_S0, ti0, ctab0_S0, ctab1_S0);
- load_table_f(tab_coul_F, ti_S2, ti2, ctab0_S2, ctab1_S2);
+ load_table_f(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0);
+ load_table_f(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2);
#else
#ifdef TAB_FDV0
- load_table_f_v(tab_coul_F, ti_S0, ti0, ctab0_S0, ctab1_S0, ctabv_S0);
- load_table_f_v(tab_coul_F, ti_S2, ti2, ctab0_S2, ctab1_S2, ctabv_S2);
+ load_table_f_v(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0);
+ load_table_f_v(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
#else
- load_table_f_v(tab_coul_F, tab_coul_V, ti_S0, ti0, ctab0_S0, ctab1_S0, ctabv_S0);
- load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, ctab0_S2, ctab1_S2, ctabv_S2);
+ load_table_f_v(tab_coul_F, tab_coul_V, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0);
+ load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
#endif
#endif
fsub_S0 = gmx_add_pr(ctab0_S0, gmx_mul_pr(frac_S0, ctab1_S0));
#ifndef NO_SHIFT_EWALD
/* Add Ewald potential shift to vc_sub for convenience */
#ifdef CHECK_EXCLS
- vc_sub_S0 = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, int_S0));
- vc_sub_S2 = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, int_S2));
+ vc_sub_S0 = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, interact_S0));
+ vc_sub_S2 = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, interact_S2));
#else
vc_sub_S0 = gmx_add_pr(vc_sub_S0, sh_ewald_S);
vc_sub_S2 = gmx_add_pr(vc_sub_S2, sh_ewald_S);
#ifndef LJ_COMB_LB
rinvsix_S0 = gmx_mul_pr(rinvsq_S0, gmx_mul_pr(rinvsq_S0, rinvsq_S0));
#ifdef EXCL_FORCES
- rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, int_S0);
+ rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, interact_S0);
#endif
#ifndef HALF_LJ
rinvsix_S2 = gmx_mul_pr(rinvsq_S2, gmx_mul_pr(rinvsq_S2, rinvsq_S2));
#ifdef EXCL_FORCES
- rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, int_S2);
+ rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, interact_S2);
#endif
#endif
#ifdef VDW_CUTOFF_CHECK
#endif
sir6_S0 = gmx_mul_pr(sir2_S0, gmx_mul_pr(sir2_S0, sir2_S0));
#ifdef EXCL_FORCES
- sir6_S0 = gmx_blendzero_pr(sir6_S0, int_S0);
+ sir6_S0 = gmx_blendzero_pr(sir6_S0, interact_S0);
#endif
#ifndef HALF_LJ
sir6_S2 = gmx_mul_pr(sir2_S2, gmx_mul_pr(sir2_S2, sir2_S2));
#ifdef EXCL_FORCES
- sir6_S2 = gmx_blendzero_pr(sir6_S2, int_S2);
+ sir6_S2 = gmx_blendzero_pr(sir6_S2, interact_S2);
#endif
#endif
#ifdef VDW_CUTOFF_CHECK
#endif
#ifdef CHECK_EXCLS
/* The potential shift should be removed for excluded pairs */
- VLJ_S0 = gmx_blendzero_pr(VLJ_S0, int_S0);
+ VLJ_S0 = gmx_blendzero_pr(VLJ_S0, interact_S0);
#ifndef HALF_LJ
- VLJ_S2 = gmx_blendzero_pr(VLJ_S2, int_S2);
+ VLJ_S2 = gmx_blendzero_pr(VLJ_S2, interact_S2);
#endif
#endif
#ifndef ENERGY_GROUPS
fiz_S2 = gmx_add_pr(fiz_S2, tz_S2);
/* Decrement j atom force */
- gmx_load_hpr(fjx_S, f+ajx);
- gmx_load_hpr(fjy_S, f+ajy);
- gmx_load_hpr(fjz_S, f+ajz);
+ gmx_load_hpr(&fjx_S, f+ajx);
+ gmx_load_hpr(&fjy_S, f+ajy);
+ gmx_load_hpr(&fjz_S, f+ajz);
gmx_store_hpr(f+ajx, gmx_sub_hpr(fjx_S, gmx_sum4_hpr(tx_S0, tx_S2)));
gmx_store_hpr(f+ajy, gmx_sub_hpr(fjy_S, gmx_sum4_hpr(ty_S0, ty_S2)));
gmx_store_hpr(f+ajz, gmx_sub_hpr(fjz_S, gmx_sum4_hpr(tz_S0, tz_S2)));
*/
-/* Include the full width SIMD macros */
-#include "gmx_simd_macros.h"
-
-
-/* Define a few macros for half-width SIMD */
-#if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
-
-/* Half-width SIMD real type */
-#define gmx_mm_hpr __m128
-
-/* Half-width SIMD operations */
-/* Load reals at half-width aligned pointer b into half-width SIMD register a */
-#define gmx_load_hpr(a, b) a = _mm_load_ps(b)
-/* Load one real at pointer b into half-width SIMD register a */
-#define gmx_load1_hpr(a, b) a = _mm_load1_ps(b)
-/* Load one real at b and one real at b+1 into halves of a, respectively */
-#define gmx_load1p1_pr(a, b) a = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load1_ps(b)), _mm_load1_ps(b+1), 0x1)
-/* Load reals at half-width aligned pointer b into two halves of a */
-#define gmx_loaddh_pr(a, b) a = gmx_mm256_load4_ps(b)
-/* To half-width SIMD register b into half width aligned memory a */
-#define gmx_store_hpr(a, b) _mm_store_ps(a, b)
-#define gmx_add_hpr _mm_add_ps
-#define gmx_sub_hpr _mm_sub_ps
-/* Horizontal sum over a half SIMD register */
-#define gmx_sum4_hpr gmx_mm256_sum4h_m128
-
-#else
-#error "Half-width SIMD macros are not yet defined"
-#endif
+/* Half-width SIMD operations are required here.
+ * As the 4xn kernels are the "standard" kernels and some special operations
+ * are required only here, we define those in nbnxn_kernel_simd_utils_...
+ *
+ * Half-width SIMD real type:
+ * gmx_mm_hpr
+ *
+ * Half-width SIMD operations
+ * Load reals at half-width aligned pointer b into half-width SIMD register a:
+ * gmx_load_hpr(a, b)
+ * Set all entries in half-width SIMD register *a to b:
+ * gmx_set1_hpr(a, b)
+ * Load one real at b and one real at b+1 into halves of a, respectively:
+ * gmx_load1p1_pr(a, b)
+ * Load reals at half-width aligned pointer b into two halves of a:
+ * gmx_loaddh_pr(a, b)
+ * Store half-width SIMD register b into half width aligned memory a:
+ * gmx_store_hpr(a, b)
+ * gmx_add_hpr(a, b)
+ * gmx_sub_hpr(a, b)
+ * Sum over 4 half SIMD registers:
+ * gmx_sum4_hpr(a, b)
+ * Sum the elements of the two halves of each input register, returning the four sums in a width-4 register:
+ * gmx_mm_transpose_sum4h_pr(a, b)
+ * Extract two half-width registers *b, *c from a full-width register a:
+ * gmx_pr_to_2hpr(a, b, c)
+ */
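As a rough illustration, a sketch of two of these helpers for AVX-256 in single precision, written as inline functions with pointer outputs. It mirrors the macros removed from this file above (gmx_mm256_load4_ps duplicated a 128-bit load into both halves, for which _mm256_broadcast_ps is assumed equivalent here); the real definitions live in the per-architecture nbnxn_kernel_simd_utils files.

    #include <immintrin.h>

    /* Load one real at b and one real at b+1 into the two halves of *a */
    static inline void
    gmx_load1p1_pr_sketch(__m256 *a, const float *b)
    {
        *a = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load1_ps(b)),
                                  _mm_load1_ps(b + 1), 0x1);
    }

    /* Load 4 reals at a half-width aligned pointer b into both halves of *a */
    static inline void
    gmx_loaddh_pr_sketch(__m256 *a, const float *b)
    {
        *a = _mm256_broadcast_ps((const __m128 *)b);
    }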
#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
#define TAB_FDV0
#endif
+/* Currently a stride of 4 for the 2 LJ parameters is hard-coded */
+#define NBFP_STRIDE 4
-#define SIMD_MASK_ALL 0xffffffff
#include "nbnxn_kernel_simd_utils.h"
gmx_mm_pr ix_S2, iy_S2, iz_S2;
gmx_mm_pr fix_S0, fiy_S0, fiz_S0;
gmx_mm_pr fix_S2, fiy_S2, fiz_S2;
-#if UNROLLJ >= 4
-#ifndef GMX_DOUBLE
- __m128 fix_S, fiy_S, fiz_S;
-#else
- __m256d fix_S, fiy_S, fiz_S;
-#endif
-#else
- __m128d fix0_S, fiy0_S, fiz0_S;
- __m128d fix2_S, fiy2_S, fiz2_S;
-#endif
+ /* We use an i-force SIMD register width of 4 */
+ /* The pr4 stuff is defined in nbnxn_kernel_simd_utils.h */
+ gmx_mm_pr4 fix_S, fiy_S, fiz_S;
- gmx_mm_pr diag_jmi_S;
+ gmx_mm_pr diagonal_jmi_S;
#if UNROLLI == UNROLLJ
- gmx_mm_pr diag_S0, diag_S2;
+ gmx_mm_pb diagonal_mask_S0, diagonal_mask_S2;
#else
- gmx_mm_pr diag0_S0, diag0_S2;
- gmx_mm_pr diag1_S0, diag1_S2;
+ gmx_mm_pb diagonal_mask0_S0, diagonal_mask0_S2;
+ gmx_mm_pb diagonal_mask1_S0, diagonal_mask1_S2;
#endif
- gmx_mm_pr mask_S0, mask_S2;
+ unsigned *excl_filter;
+#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
+ gmx_epi32 filter_S0, filter_S2;
+#else
+ gmx_mm_pr filter_S0, filter_S2;
+#endif
gmx_mm_pr zero_S = gmx_set1_pr(0);
ljc = nbat->lj_comb;
#else
/* No combination rule used */
-#ifndef GMX_DOUBLE
+#if NBFP_STRIDE == 2
+ nbfp_ptr = nbat->nbfp;
+#else
+#if NBFP_STRIDE == 4
nbfp_ptr = nbat->nbfp_s4;
-#define NBFP_STRIDE 4
#else
- nbfp_ptr = nbat->nbfp;
-#define NBFP_STRIDE 2
+#error "Only NBFP_STRIDE 2 and 4 are currently supported"
+#endif
#endif
nbfp_stride = NBFP_STRIDE;
#endif
/* Load j-i for the first i */
- diag_jmi_S = gmx_load_pr(nbat->simd_2xnn_diag);
+ diagonal_jmi_S = gmx_load_pr(nbat->simd_2xnn_diagonal_j_minus_i);
/* Generate all the diagonal masks as comparison results */
#if UNROLLI == UNROLLJ
- diag_S0 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag_S2 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diagonal_mask_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
+ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
+ diagonal_mask_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
#else
#if 2*UNROLLI == UNROLLJ
- diag0_S0 = gmx_cmplt_pr(diag_i_S, diag_j_S);
- diag_i_S = gmx_add_pr(diag_i_S, one_S);
- diag_i_S = gmx_add_pr(diag_i_S, one_S);
- diag0_S2 = gmx_cmplt_pr(diag_i_S, diag_j_S);
- diag_i_S = gmx_add_pr(diag_i_S, one_S);
- diag_i_S = gmx_add_pr(diag_i_S, one_S);
- diag1_S0 = gmx_cmplt_pr(diag_i_S, diag_j_S);
- diag_i_S = gmx_add_pr(diag_i_S, one_S);
- diag_i_S = gmx_add_pr(diag_i_S, one_S);
- diag1_S2 = gmx_cmplt_pr(diag_i_S, diag_j_S);
+ diagonal_mask0_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
+ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
+ diagonal_mask0_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
+ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
+ diagonal_mask1_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
+ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
+ diagonal_mask1_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
#endif
#endif
/* Load masks for topology exclusion masking */
- mask_S0 = gmx_load_pr((real *)nbat->simd_excl_mask + 0*2*UNROLLJ);
- mask_S2 = gmx_load_pr((real *)nbat->simd_excl_mask + 1*2*UNROLLJ);
+#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
+#define FILTER_STRIDE (GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE)
+#else
+#ifdef GMX_DOUBLE
+#define FILTER_STRIDE 2
+#else
+#define FILTER_STRIDE 1
+#endif
+#endif
+#if FILTER_STRIDE == 1
+ excl_filter = nbat->simd_exclusion_filter1;
+#else
+ excl_filter = nbat->simd_exclusion_filter2;
+#endif
+ /* Here we cast the exclusion filters from unsigned * to int * or real *.
+ * Since we only check bits, the actual value they represent does not
+ * matter, as long as both filter and mask data are treated the same way.
+ */
+#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
+ filter_S0 = gmx_load_si((int *)excl_filter + 0*2*UNROLLJ*FILTER_STRIDE);
+ filter_S2 = gmx_load_si((int *)excl_filter + 1*2*UNROLLJ*FILTER_STRIDE);
+#else
+ filter_S0 = gmx_load_pr((real *)excl_filter + 0*2*UNROLLJ);
+ filter_S2 = gmx_load_pr((real *)excl_filter + 1*2*UNROLLJ);
+#endif
+#undef FILTER_STRIDE
#ifdef CALC_COUL_TAB
/* Generate aligned table index pointers */
#if UNROLLJ == 4
if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
#endif
-#if UNROLLJ == 2
- if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
-#endif
#if UNROLLJ == 8
if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
#endif
/* Load i atom data */
sciy = scix + STRIDE;
sciz = sciy + STRIDE;
- gmx_load1p1_pr(ix_S0, x+scix);
- gmx_load1p1_pr(ix_S2, x+scix+2);
- gmx_load1p1_pr(iy_S0, x+sciy);
- gmx_load1p1_pr(iy_S2, x+sciy+2);
- gmx_load1p1_pr(iz_S0, x+sciz);
- gmx_load1p1_pr(iz_S2, x+sciz+2);
+ gmx_load1p1_pr(&ix_S0, x+scix);
+ gmx_load1p1_pr(&ix_S2, x+scix+2);
+ gmx_load1p1_pr(&iy_S0, x+sciy);
+ gmx_load1p1_pr(&iy_S2, x+sciy+2);
+ gmx_load1p1_pr(&iz_S0, x+sciz);
+ gmx_load1p1_pr(&iz_S2, x+sciz+2);
ix_S0 = gmx_add_pr(ix_S0, shX_S);
ix_S2 = gmx_add_pr(ix_S2, shX_S);
iy_S0 = gmx_add_pr(iy_S0, shY_S);
facel_S = gmx_set1_pr(facel);
- gmx_load1p1_pr(iq_S0, q+sci);
- gmx_load1p1_pr(iq_S2, q+sci+2);
+ gmx_load1p1_pr(&iq_S0, q+sci);
+ gmx_load1p1_pr(&iq_S2, q+sci+2);
iq_S0 = gmx_mul_pr(facel_S, iq_S0);
iq_S2 = gmx_mul_pr(facel_S, iq_S2);
}
#ifdef LJ_COMB_LB
- gmx_load1p1_pr(hsig_i_S0, ljc+sci2+0);
- gmx_load1p1_pr(hsig_i_S2, ljc+sci2+2);
- gmx_load1p1_pr(seps_i_S0, ljc+sci2+STRIDE+0);
- gmx_load1p1_pr(seps_i_S2, ljc+sci2+STRIDE+2);
+ gmx_load1p1_pr(&hsig_i_S0, ljc+sci2+0);
+ gmx_load1p1_pr(&hsig_i_S2, ljc+sci2+2);
+ gmx_load1p1_pr(&seps_i_S0, ljc+sci2+STRIDE+0);
+ gmx_load1p1_pr(&seps_i_S2, ljc+sci2+STRIDE+2);
#else
#ifdef LJ_COMB_GEOM
- gmx_load1p1_pr(c6s_S0, ljc+sci2+0);
+ gmx_load1p1_pr(&c6s_S0, ljc+sci2+0);
if (!half_LJ)
{
- gmx_load1p1_pr(c6s_S2, ljc+sci2+2);
+ gmx_load1p1_pr(&c6s_S2, ljc+sci2+2);
}
- gmx_load1p1_pr(c12s_S0, ljc+sci2+STRIDE+0);
+ gmx_load1p1_pr(&c12s_S0, ljc+sci2+STRIDE+0);
if (!half_LJ)
{
- gmx_load1p1_pr(c12s_S2, ljc+sci2+STRIDE+2);
+ gmx_load1p1_pr(&c12s_S2, ljc+sci2+STRIDE+2);
}
#else
nbfp0 = nbfp_ptr + type[sci ]*nbat->ntype*nbfp_stride;
#define CALC_COULOMB
#define HALF_LJ
#define CHECK_EXCLS
- while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+ while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
{
#include "nbnxn_kernel_simd_2xnn_inner.h"
cjind++;
{
#define CALC_COULOMB
#define CHECK_EXCLS
- while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+ while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
{
#include "nbnxn_kernel_simd_2xnn_inner.h"
cjind++;
else
{
#define CHECK_EXCLS
- while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+ while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
{
#include "nbnxn_kernel_simd_2xnn_inner.h"
cjind++;
ninner += cjind1 - cjind0;
/* Add accumulated i-forces to the force array */
-#if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
-#define gmx_load_pr4 _mm_load_ps
-#define gmx_store_pr4 _mm_store_ps
-#define gmx_add_pr4 _mm_add_ps
-#else
-#error "You need to define 4-width SIM macros for i-force reduction"
-#endif
- GMX_MM_TRANSPOSE_SUM4H_PR(fix_S0, fix_S2, fix_S);
+ fix_S = gmx_mm_transpose_sum4h_pr(fix_S0, fix_S2);
gmx_store_pr4(f+scix, gmx_add_pr4(fix_S, gmx_load_pr4(f+scix)));
- GMX_MM_TRANSPOSE_SUM4H_PR(fiy_S0, fiy_S2, fiy_S);
+ fiy_S = gmx_mm_transpose_sum4h_pr(fiy_S0, fiy_S2);
gmx_store_pr4(f+sciy, gmx_add_pr4(fiy_S, gmx_load_pr4(f+sciy)));
- GMX_MM_TRANSPOSE_SUM4H_PR(fiz_S0, fiz_S2, fiz_S);
+ fiz_S = gmx_mm_transpose_sum4h_pr(fiz_S0, fiz_S2);
gmx_store_pr4(f+sciz, gmx_add_pr4(fiz_S, gmx_load_pr4(f+sciz)));
#ifdef CALC_SHIFTFORCES
}
-#undef gmx_load_pr4
-#undef gmx_store_pr4
-#undef gmx_store_pr4
-
#undef CALC_SHIFTFORCES
#undef UNROLLI
#undef STRIDE
#undef TAB_FDV0
#undef NBFP_STRIDE
-
-#undef gmx_mm_hpr
-
-#undef gmx_load_hpr
-#undef gmx_load1_hpr
-#undef gmx_load1p1_pr
-#undef gmx_loaddh_pr
-#undef gmx_store_hpr
-#undef gmx_add_hpr
-#undef gmx_sub_hpr
-
-#undef gmx_sum4_hpr
#ifdef GMX_NBNXN_SIMD_4XN
-#include "nbnxn_kernel_simd_4xn.h"
+#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
+#define GMX_USE_HALF_WIDTH_SIMD_HERE
+#endif
+#include "gmx_simd_macros.h"
+#include "gmx_simd_vec.h"
-/* Include all flavors of the SSE or AVX 4xN kernel loops */
+#include "nbnxn_kernel_simd_4xn.h"
-#if !(GMX_NBNXN_SIMD_BITWIDTH == 128 || GMX_NBNXN_SIMD_BITWIDTH == 256)
-#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#if !(GMX_SIMD_WIDTH_HERE == 2 || GMX_SIMD_WIDTH_HERE == 4 || GMX_SIMD_WIDTH_HERE == 8)
+#error "unsupported SIMD width"
#endif
+
+/* Include all flavors of the SSE or AVX 4xN kernel loops */
+
/* Analytical reaction-field kernels */
#define CALC_COUL_RF
const real *VSvdw, const real *VSc,
real *Vvdw, real *Vc)
{
- const int simd_width = GMX_SIMD_WIDTH_HERE;
- const int unrollj_half = GMX_SIMD_WIDTH_HERE/2;
+ const int unrollj = GMX_SIMD_WIDTH_HERE;
+ const int unrollj_half = unrollj/2;
int ng_p2, i, j, j0, j1, c, s;
ng_p2 = (1<<ng_2log);
{
for (j0 = 0; j0 < ng; j0++)
{
- c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*simd_width;
+ c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*unrollj;
for (s = 0; s < unrollj_half; s++)
{
Vvdw[i*ng+j0] += VSvdw[c+0];
Vvdw[i*ng+j1] += VSvdw[c+1];
Vc [i*ng+j0] += VSc [c+0];
Vc [i*ng+j1] += VSc [c+1];
- c += simd_width + 2;
+ c += unrollj + 2;
}
}
}
* this can be faster when we have defined gmx_blendv_pr, i.e. an instruction
* that selects from two SIMD registers based on the contents of a third.
*/
-#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_HAVE_SIMD_BLENDV && !defined COUNT_PAIRS
+#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_SIMD_HAVE_BLENDV
/* With RF and tabulated Coulomb we replace cmp+and with sub+blendv.
* With gcc this is slower, except for RF on Sandy Bridge.
* Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
#ifdef CHECK_EXCLS
/* Interaction (non-exclusion) mask of all 1's or 0's */
- gmx_mm_pr int_S0;
- gmx_mm_pr int_S1;
- gmx_mm_pr int_S2;
- gmx_mm_pr int_S3;
+ gmx_mm_pb interact_S0;
+ gmx_mm_pb interact_S1;
+ gmx_mm_pb interact_S2;
+ gmx_mm_pb interact_S3;
#endif
gmx_mm_pr jx_S, jy_S, jz_S;
gmx_mm_pr rsq_S3, rinv_S3, rinvsq_S3;
#ifndef NBNXN_CUTOFF_USE_BLENDV
/* wco: within cut-off, mask of all 1's or 0's */
- gmx_mm_pr wco_S0;
- gmx_mm_pr wco_S1;
- gmx_mm_pr wco_S2;
- gmx_mm_pr wco_S3;
+ gmx_mm_pb wco_S0;
+ gmx_mm_pb wco_S1;
+ gmx_mm_pb wco_S2;
+ gmx_mm_pb wco_S3;
#endif
#ifdef VDW_CUTOFF_CHECK
- gmx_mm_pr wco_vdw_S0;
- gmx_mm_pr wco_vdw_S1;
+ gmx_mm_pb wco_vdw_S0;
+ gmx_mm_pb wco_vdw_S1;
#ifndef HALF_LJ
- gmx_mm_pr wco_vdw_S2;
- gmx_mm_pr wco_vdw_S3;
+ gmx_mm_pb wco_vdw_S2;
+ gmx_mm_pb wco_vdw_S3;
#endif
#endif
#ifdef CALC_COULOMB
ajz = ajy + STRIDE;
#ifdef CHECK_EXCLS
-#ifdef gmx_checkbitmask_epi32
+#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
{
- /* Integer mask set and operations, cast result to real */
+ /* Load integer topology exclusion interaction mask */
gmx_epi32 mask_pr_S = gmx_set1_epi32(l_cj[cjind].excl);
- int_S0 = gmx_castsi_pr(gmx_checkbitmask_epi32(mask_pr_S, mask_S0));
- int_S1 = gmx_castsi_pr(gmx_checkbitmask_epi32(mask_pr_S, mask_S1));
- int_S2 = gmx_castsi_pr(gmx_checkbitmask_epi32(mask_pr_S, mask_S2));
- int_S3 = gmx_castsi_pr(gmx_checkbitmask_epi32(mask_pr_S, mask_S3));
+ interact_S0 = gmx_checkbitmask_epi32(mask_pr_S, filter_S0);
+ interact_S1 = gmx_checkbitmask_epi32(mask_pr_S, filter_S1);
+ interact_S2 = gmx_checkbitmask_epi32(mask_pr_S, filter_S2);
+ interact_S3 = gmx_checkbitmask_epi32(mask_pr_S, filter_S3);
}
#else
+#ifdef GMX_SIMD_HAVE_CHECKBITMASK_PR
{
/* Integer mask set, cast to real and real mask operations */
gmx_mm_pr mask_pr_S = gmx_castsi_pr(gmx_set1_epi32(l_cj[cjind].excl));
- int_S0 = gmx_checkbitmask_pr(mask_pr_S, mask_S0);
- int_S1 = gmx_checkbitmask_pr(mask_pr_S, mask_S1);
- int_S2 = gmx_checkbitmask_pr(mask_pr_S, mask_S2);
- int_S3 = gmx_checkbitmask_pr(mask_pr_S, mask_S3);
+ interact_S0 = gmx_checkbitmask_pr(mask_pr_S, filter_S0);
+ interact_S1 = gmx_checkbitmask_pr(mask_pr_S, filter_S1);
+ interact_S2 = gmx_checkbitmask_pr(mask_pr_S, filter_S2);
+ interact_S3 = gmx_checkbitmask_pr(mask_pr_S, filter_S3);
}
+#else
+#error "No SIMD bitmask operation available"
#endif
#endif
+#endif /* CHECK_EXCLS */
/* load j atom coordinates */
jx_S = gmx_load_pr(x+ajx);
#if UNROLLJ == UNROLLI
if (cj == ci_sh)
{
- wco_S0 = gmx_and_pr(wco_S0, diag_S0);
- wco_S1 = gmx_and_pr(wco_S1, diag_S1);
- wco_S2 = gmx_and_pr(wco_S2, diag_S2);
- wco_S3 = gmx_and_pr(wco_S3, diag_S3);
+ wco_S0 = gmx_and_pb(wco_S0, diagonal_mask_S0);
+ wco_S1 = gmx_and_pb(wco_S1, diagonal_mask_S1);
+ wco_S2 = gmx_and_pb(wco_S2, diagonal_mask_S2);
+ wco_S3 = gmx_and_pb(wco_S3, diagonal_mask_S3);
}
#else
#if UNROLLJ < UNROLLI
if (cj == ci_sh*2)
{
- wco_S0 = gmx_and_pr(wco_S0, diag0_S0);
- wco_S1 = gmx_and_pr(wco_S1, diag0_S1);
- wco_S2 = gmx_and_pr(wco_S2, diag0_S2);
- wco_S3 = gmx_and_pr(wco_S3, diag0_S3);
+ wco_S0 = gmx_and_pb(wco_S0, diagonal_mask0_S0);
+ wco_S1 = gmx_and_pb(wco_S1, diagonal_mask0_S1);
+ wco_S2 = gmx_and_pb(wco_S2, diagonal_mask0_S2);
+ wco_S3 = gmx_and_pb(wco_S3, diagonal_mask0_S3);
}
if (cj == ci_sh*2 + 1)
{
- wco_S0 = gmx_and_pr(wco_S0, diag1_S0);
- wco_S1 = gmx_and_pr(wco_S1, diag1_S1);
- wco_S2 = gmx_and_pr(wco_S2, diag1_S2);
- wco_S3 = gmx_and_pr(wco_S3, diag1_S3);
+ wco_S0 = gmx_and_pb(wco_S0, diagonal_mask1_S0);
+ wco_S1 = gmx_and_pb(wco_S1, diagonal_mask1_S1);
+ wco_S2 = gmx_and_pb(wco_S2, diagonal_mask1_S2);
+ wco_S3 = gmx_and_pb(wco_S3, diagonal_mask1_S3);
}
#else
if (cj*2 == ci_sh)
{
- wco_S0 = gmx_and_pr(wco_S0, diag0_S0);
- wco_S1 = gmx_and_pr(wco_S1, diag0_S1);
- wco_S2 = gmx_and_pr(wco_S2, diag0_S2);
- wco_S3 = gmx_and_pr(wco_S3, diag0_S3);
+ wco_S0 = gmx_and_pb(wco_S0, diagonal_mask0_S0);
+ wco_S1 = gmx_and_pb(wco_S1, diagonal_mask0_S1);
+ wco_S2 = gmx_and_pb(wco_S2, diagonal_mask0_S2);
+ wco_S3 = gmx_and_pb(wco_S3, diagonal_mask0_S3);
}
else if (cj*2 + 1 == ci_sh)
{
- wco_S0 = gmx_and_pr(wco_S0, diag1_S0);
- wco_S1 = gmx_and_pr(wco_S1, diag1_S1);
- wco_S2 = gmx_and_pr(wco_S2, diag1_S2);
- wco_S3 = gmx_and_pr(wco_S3, diag1_S3);
+ wco_S0 = gmx_and_pb(wco_S0, diagonal_mask1_S0);
+ wco_S1 = gmx_and_pb(wco_S1, diagonal_mask1_S1);
+ wco_S2 = gmx_and_pb(wco_S2, diagonal_mask1_S2);
+ wco_S3 = gmx_and_pb(wco_S3, diagonal_mask1_S3);
}
#endif
#endif
#else /* EXCL_FORCES */
/* No exclusion forces: remove all excluded atom pairs from the list */
- wco_S0 = gmx_and_pr(wco_S0, int_S0);
- wco_S1 = gmx_and_pr(wco_S1, int_S1);
- wco_S2 = gmx_and_pr(wco_S2, int_S2);
- wco_S3 = gmx_and_pr(wco_S3, int_S3);
+ wco_S0 = gmx_and_pb(wco_S0, interact_S0);
+ wco_S1 = gmx_and_pb(wco_S1, interact_S1);
+ wco_S2 = gmx_and_pb(wco_S2, interact_S2);
+ wco_S3 = gmx_and_pb(wco_S3, interact_S3);
#endif
#endif
tmp = gmx_simd_align_real(tmpa);
for (i = 0; i < UNROLLI; i++)
{
- gmx_store_pr(tmp, i == 0 ? wco_S0 : (i == 1 ? wco_S1 : (i == 2 ? wco_S2 : wco_S3)));
+ gmx_store_pr(tmp, gmx_sub_pr(rc2_S, i == 0 ? rsq_S0 : (i == 1 ? rsq_S1 : (i == 2 ? rsq_S2 : rsq_S3))));
for (j = 0; j < UNROLLJ; j++)
{
- if (!(tmp[j] == 0))
+ if (tmp[j] >= 0)
{
npair++;
}
#ifdef CHECK_EXCLS
/* For excluded pairs add a small number to avoid r^-6 = NaN */
- rsq_S0 = gmx_add_pr(rsq_S0, gmx_andnot_pr(int_S0, avoid_sing_S));
- rsq_S1 = gmx_add_pr(rsq_S1, gmx_andnot_pr(int_S1, avoid_sing_S));
- rsq_S2 = gmx_add_pr(rsq_S2, gmx_andnot_pr(int_S2, avoid_sing_S));
- rsq_S3 = gmx_add_pr(rsq_S3, gmx_andnot_pr(int_S3, avoid_sing_S));
+ rsq_S0 = gmx_masknot_add_pr(interact_S0, rsq_S0, avoid_sing_S);
+ rsq_S1 = gmx_masknot_add_pr(interact_S1, rsq_S1, avoid_sing_S);
+ rsq_S2 = gmx_masknot_add_pr(interact_S2, rsq_S2, avoid_sing_S);
+ rsq_S3 = gmx_masknot_add_pr(interact_S3, rsq_S3, avoid_sing_S);
#endif
/* Calculate 1/r */
rinv_S2 = gmx_invsqrt_pr(rsq_S2);
rinv_S3 = gmx_invsqrt_pr(rsq_S3);
#else
- GMX_MM_INVSQRT2_PD(rsq_S0, rsq_S1, rinv_S0, rinv_S1);
- GMX_MM_INVSQRT2_PD(rsq_S2, rsq_S3, rinv_S2, rinv_S3);
+ gmx_mm_invsqrt2_pd(rsq_S0, rsq_S1, &rinv_S0, &rinv_S1);
+ gmx_mm_invsqrt2_pd(rsq_S2, rsq_S3, &rinv_S2, &rinv_S3);
#endif
#ifdef CALC_COULOMB
#ifdef CALC_LJ
#if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
- load_lj_pair_params(nbfp0, type, aj, c6_S0, c12_S0);
- load_lj_pair_params(nbfp1, type, aj, c6_S1, c12_S1);
+ load_lj_pair_params(nbfp0, type, aj, &c6_S0, &c12_S0);
+ load_lj_pair_params(nbfp1, type, aj, &c6_S1, &c12_S1);
#ifndef HALF_LJ
- load_lj_pair_params(nbfp2, type, aj, c6_S2, c12_S2);
- load_lj_pair_params(nbfp3, type, aj, c6_S3, c12_S3);
+ load_lj_pair_params(nbfp2, type, aj, &c6_S2, &c12_S2);
+ load_lj_pair_params(nbfp3, type, aj, &c6_S3, &c12_S3);
#endif
#endif /* not defined any LJ rule */
#ifdef EXCL_FORCES
/* Only add 1/r for non-excluded atom pairs */
- rinv_ex_S0 = gmx_blendzero_pr(rinv_S0, int_S0);
- rinv_ex_S1 = gmx_blendzero_pr(rinv_S1, int_S1);
- rinv_ex_S2 = gmx_blendzero_pr(rinv_S2, int_S2);
- rinv_ex_S3 = gmx_blendzero_pr(rinv_S3, int_S3);
+ rinv_ex_S0 = gmx_blendzero_pr(rinv_S0, interact_S0);
+ rinv_ex_S1 = gmx_blendzero_pr(rinv_S1, interact_S1);
+ rinv_ex_S2 = gmx_blendzero_pr(rinv_S2, interact_S2);
+ rinv_ex_S3 = gmx_blendzero_pr(rinv_S3, interact_S3);
#else
/* No exclusion forces, we always need 1/r */
#define rinv_ex_S0 rinv_S0
#ifdef CALC_COUL_RF
/* Electrostatic interactions */
- frcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_mul_pr(rsq_S0, mrc_3_S)));
- frcoul_S1 = gmx_mul_pr(qq_S1, gmx_add_pr(rinv_ex_S1, gmx_mul_pr(rsq_S1, mrc_3_S)));
- frcoul_S2 = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_mul_pr(rsq_S2, mrc_3_S)));
- frcoul_S3 = gmx_mul_pr(qq_S3, gmx_add_pr(rinv_ex_S3, gmx_mul_pr(rsq_S3, mrc_3_S)));
+ frcoul_S0 = gmx_mul_pr(qq_S0, gmx_madd_pr(rsq_S0, mrc_3_S, rinv_ex_S0));
+ frcoul_S1 = gmx_mul_pr(qq_S1, gmx_madd_pr(rsq_S1, mrc_3_S, rinv_ex_S1));
+ frcoul_S2 = gmx_mul_pr(qq_S2, gmx_madd_pr(rsq_S2, mrc_3_S, rinv_ex_S2));
+ frcoul_S3 = gmx_mul_pr(qq_S3, gmx_madd_pr(rsq_S3, mrc_3_S, rinv_ex_S3));
#ifdef CALC_ENERGIES
vcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_add_pr(gmx_mul_pr(rsq_S0, hrc_3_S), moh_rc_S)));
ewcorr_S1 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S1), beta_S);
ewcorr_S2 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S2), beta_S);
ewcorr_S3 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S3), beta_S);
- frcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_mul_pr(ewcorr_S0, brsq_S0)));
- frcoul_S1 = gmx_mul_pr(qq_S1, gmx_add_pr(rinv_ex_S1, gmx_mul_pr(ewcorr_S1, brsq_S1)));
- frcoul_S2 = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_mul_pr(ewcorr_S2, brsq_S2)));
- frcoul_S3 = gmx_mul_pr(qq_S3, gmx_add_pr(rinv_ex_S3, gmx_mul_pr(ewcorr_S3, brsq_S3)));
+ frcoul_S0 = gmx_mul_pr(qq_S0, gmx_madd_pr(ewcorr_S0, brsq_S0, rinv_ex_S0));
+ frcoul_S1 = gmx_mul_pr(qq_S1, gmx_madd_pr(ewcorr_S1, brsq_S1, rinv_ex_S1));
+ frcoul_S2 = gmx_mul_pr(qq_S2, gmx_madd_pr(ewcorr_S2, brsq_S2, rinv_ex_S2));
+ frcoul_S3 = gmx_mul_pr(qq_S3, gmx_madd_pr(ewcorr_S3, brsq_S3, rinv_ex_S3));
#ifdef CALC_ENERGIES
vc_sub_S0 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S0), beta_S);
ti_S1 = gmx_cvttpr_epi32(rs_S1);
ti_S2 = gmx_cvttpr_epi32(rs_S2);
ti_S3 = gmx_cvttpr_epi32(rs_S3);
-#ifdef GMX_HAVE_SIMD_FLOOR
+#ifdef GMX_SIMD_HAVE_FLOOR
/* SSE4.1 floor is faster than gmx_cvtepi32_ps int->float cast */
rf_S0 = gmx_floor_pr(rs_S0);
rf_S1 = gmx_floor_pr(rs_S1);
* Currently single precision uses FDV0, double F and V.
*/
#ifndef CALC_ENERGIES
- load_table_f(tab_coul_F, ti_S0, ti0, ctab0_S0, ctab1_S0);
- load_table_f(tab_coul_F, ti_S1, ti1, ctab0_S1, ctab1_S1);
- load_table_f(tab_coul_F, ti_S2, ti2, ctab0_S2, ctab1_S2);
- load_table_f(tab_coul_F, ti_S3, ti3, ctab0_S3, ctab1_S3);
+ load_table_f(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0);
+ load_table_f(tab_coul_F, ti_S1, ti1, &ctab0_S1, &ctab1_S1);
+ load_table_f(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2);
+ load_table_f(tab_coul_F, ti_S3, ti3, &ctab0_S3, &ctab1_S3);
#else
#ifdef TAB_FDV0
- load_table_f_v(tab_coul_F, ti_S0, ti0, ctab0_S0, ctab1_S0, ctabv_S0);
- load_table_f_v(tab_coul_F, ti_S1, ti1, ctab0_S1, ctab1_S1, ctabv_S1);
- load_table_f_v(tab_coul_F, ti_S2, ti2, ctab0_S2, ctab1_S2, ctabv_S2);
- load_table_f_v(tab_coul_F, ti_S3, ti3, ctab0_S3, ctab1_S3, ctabv_S3);
+ load_table_f_v(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0);
+ load_table_f_v(tab_coul_F, ti_S1, ti1, &ctab0_S1, &ctab1_S1, &ctabv_S1);
+ load_table_f_v(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
+ load_table_f_v(tab_coul_F, ti_S3, ti3, &ctab0_S3, &ctab1_S3, &ctabv_S3);
#else
- load_table_f_v(tab_coul_F, tab_coul_V, ti_S0, ti0, ctab0_S0, ctab1_S0, ctabv_S0);
- load_table_f_v(tab_coul_F, tab_coul_V, ti_S1, ti1, ctab0_S1, ctab1_S1, ctabv_S1);
- load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, ctab0_S2, ctab1_S2, ctabv_S2);
- load_table_f_v(tab_coul_F, tab_coul_V, ti_S3, ti3, ctab0_S3, ctab1_S3, ctabv_S3);
+ load_table_f_v(tab_coul_F, tab_coul_V, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0);
+ load_table_f_v(tab_coul_F, tab_coul_V, ti_S1, ti1, &ctab0_S1, &ctab1_S1, &ctabv_S1);
+ load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
+ load_table_f_v(tab_coul_F, tab_coul_V, ti_S3, ti3, &ctab0_S3, &ctab1_S3, &ctabv_S3);
#endif
#endif
fsub_S0 = gmx_add_pr(ctab0_S0, gmx_mul_pr(frac_S0, ctab1_S0));
#ifndef NO_SHIFT_EWALD
/* Add Ewald potential shift to vc_sub for convenience */
#ifdef CHECK_EXCLS
- vc_sub_S0 = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, int_S0));
- vc_sub_S1 = gmx_add_pr(vc_sub_S1, gmx_blendzero_pr(sh_ewald_S, int_S1));
- vc_sub_S2 = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, int_S2));
- vc_sub_S3 = gmx_add_pr(vc_sub_S3, gmx_blendzero_pr(sh_ewald_S, int_S3));
+ vc_sub_S0 = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, interact_S0));
+ vc_sub_S1 = gmx_add_pr(vc_sub_S1, gmx_blendzero_pr(sh_ewald_S, interact_S1));
+ vc_sub_S2 = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, interact_S2));
+ vc_sub_S3 = gmx_add_pr(vc_sub_S3, gmx_blendzero_pr(sh_ewald_S, interact_S3));
#else
vc_sub_S0 = gmx_add_pr(vc_sub_S0, sh_ewald_S);
vc_sub_S1 = gmx_add_pr(vc_sub_S1, sh_ewald_S);
rinvsix_S0 = gmx_mul_pr(rinvsq_S0, gmx_mul_pr(rinvsq_S0, rinvsq_S0));
rinvsix_S1 = gmx_mul_pr(rinvsq_S1, gmx_mul_pr(rinvsq_S1, rinvsq_S1));
#ifdef EXCL_FORCES
- rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, int_S0);
- rinvsix_S1 = gmx_blendzero_pr(rinvsix_S1, int_S1);
+ rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, interact_S0);
+ rinvsix_S1 = gmx_blendzero_pr(rinvsix_S1, interact_S1);
#endif
#ifndef HALF_LJ
rinvsix_S2 = gmx_mul_pr(rinvsq_S2, gmx_mul_pr(rinvsq_S2, rinvsq_S2));
rinvsix_S3 = gmx_mul_pr(rinvsq_S3, gmx_mul_pr(rinvsq_S3, rinvsq_S3));
#ifdef EXCL_FORCES
- rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, int_S2);
- rinvsix_S3 = gmx_blendzero_pr(rinvsix_S3, int_S3);
+ rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, interact_S2);
+ rinvsix_S3 = gmx_blendzero_pr(rinvsix_S3, interact_S3);
#endif
#endif
#ifdef VDW_CUTOFF_CHECK
sir6_S0 = gmx_mul_pr(sir2_S0, gmx_mul_pr(sir2_S0, sir2_S0));
sir6_S1 = gmx_mul_pr(sir2_S1, gmx_mul_pr(sir2_S1, sir2_S1));
#ifdef EXCL_FORCES
- sir6_S0 = gmx_blendzero_pr(sir6_S0, int_S0);
- sir6_S1 = gmx_blendzero_pr(sir6_S1, int_S1);
+ sir6_S0 = gmx_blendzero_pr(sir6_S0, interact_S0);
+ sir6_S1 = gmx_blendzero_pr(sir6_S1, interact_S1);
#endif
#ifndef HALF_LJ
sir6_S2 = gmx_mul_pr(sir2_S2, gmx_mul_pr(sir2_S2, sir2_S2));
sir6_S3 = gmx_mul_pr(sir2_S3, gmx_mul_pr(sir2_S3, sir2_S3));
#ifdef EXCL_FORCES
- sir6_S2 = gmx_blendzero_pr(sir6_S2, int_S2);
- sir6_S3 = gmx_blendzero_pr(sir6_S3, int_S3);
+ sir6_S2 = gmx_blendzero_pr(sir6_S2, interact_S2);
+ sir6_S3 = gmx_blendzero_pr(sir6_S3, interact_S3);
#endif
#endif
#ifdef VDW_CUTOFF_CHECK
#endif
#ifdef CHECK_EXCLS
/* The potential shift should be removed for excluded pairs */
- VLJ_S0 = gmx_blendzero_pr(VLJ_S0, int_S0);
- VLJ_S1 = gmx_blendzero_pr(VLJ_S1, int_S1);
+ VLJ_S0 = gmx_blendzero_pr(VLJ_S0, interact_S0);
+ VLJ_S1 = gmx_blendzero_pr(VLJ_S1, interact_S1);
#ifndef HALF_LJ
- VLJ_S2 = gmx_blendzero_pr(VLJ_S2, int_S2);
- VLJ_S3 = gmx_blendzero_pr(VLJ_S3, int_S3);
+ VLJ_S2 = gmx_blendzero_pr(VLJ_S2, interact_S2);
+ VLJ_S3 = gmx_blendzero_pr(VLJ_S3, interact_S3);
#endif
#endif
#ifndef ENERGY_GROUPS
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#if !(GMX_NBNXN_SIMD_BITWIDTH == 128 || GMX_NBNXN_SIMD_BITWIDTH == 256)
-#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
-#endif
-
-#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
-#define GMX_USE_HALF_WIDTH_SIMD_HERE
-#endif
-#include "gmx_simd_macros.h"
-
#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
#endif
#endif
+/* Decide the stride for the 2 LJ parameters */
+#ifdef GMX_X86_SSE2
+#ifdef GMX_DOUBLE
+#define NBFP_STRIDE 2
+#else
+#define NBFP_STRIDE 4
+#endif
+#else
+#if GMX_SIMD_WIDTH_HERE > 4
+#define NBFP_STRIDE 4
+#else
+#define NBFP_STRIDE GMX_SIMD_WIDTH_HERE
+#endif
+#endif
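A short sketch of how this stride is used when looking up the LJ parameters. The per-i-type row computation (type_i*ntype*stride) matches the kernel code; the assumption here is that element 0 of each type pair is c6 and element 1 is c12, with the remaining stride entries acting as padding for aligned SIMD loads.

    typedef float real_sketch;   /* stand-in for the GROMACS real type */

    static void
    get_lj_pair_sketch(const real_sketch *nbfp, int ntype, int stride,
                       int ti, int tj, real_sketch *c6, real_sketch *c12)
    {
        /* per-i-type row, as in: nbfp0 = nbfp_ptr + type[sci]*ntype*nbfp_stride */
        const real_sketch *row = nbfp + ti*ntype*stride;

        *c6  = row[tj*stride + 0];
        *c12 = row[tj*stride + 1];
    }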
-#define SIMD_MASK_ALL 0xffffffff
#include "nbnxn_kernel_simd_utils.h"
gmx_mm_pr fix_S2, fiy_S2, fiz_S2;
gmx_mm_pr fix_S3, fiy_S3, fiz_S3;
#if UNROLLJ >= 4
-#ifndef GMX_DOUBLE
- __m128 fix_S, fiy_S, fiz_S;
+ /* We use an i-force SIMD register width of 4 */
+#if UNROLLJ == 4
+#define gmx_mm_pr4 gmx_mm_pr
+#define gmx_load_pr4 gmx_load_pr
+#define gmx_store_pr4 gmx_store_pr
+#define gmx_add_pr4 gmx_add_pr
#else
- __m256d fix_S, fiy_S, fiz_S;
+ /* The pr4 stuff is defined in nbnxn_kernel_simd_utils.h */
#endif
+ gmx_mm_pr4 fix_S, fiy_S, fiz_S;
#else
- __m128d fix0_S, fiy0_S, fiz0_S;
- __m128d fix2_S, fiy2_S, fiz2_S;
+ /* We use an i-force SIMD register width of 2 */
+ gmx_mm_pr fix0_S, fiy0_S, fiz0_S;
+ gmx_mm_pr fix2_S, fiy2_S, fiz2_S;
#endif
- gmx_mm_pr diag_jmi_S;
+ gmx_mm_pr diagonal_jmi_S;
#if UNROLLI == UNROLLJ
- gmx_mm_pr diag_S0, diag_S1, diag_S2, diag_S3;
+ gmx_mm_pb diagonal_mask_S0, diagonal_mask_S1, diagonal_mask_S2, diagonal_mask_S3;
#else
- gmx_mm_pr diag0_S0, diag0_S1, diag0_S2, diag0_S3;
- gmx_mm_pr diag1_S0, diag1_S1, diag1_S2, diag1_S3;
+ gmx_mm_pb diagonal_mask0_S0, diagonal_mask0_S1, diagonal_mask0_S2, diagonal_mask0_S3;
+ gmx_mm_pb diagonal_mask1_S0, diagonal_mask1_S1, diagonal_mask1_S2, diagonal_mask1_S3;
#endif
-#ifdef gmx_checkbitmask_epi32
- gmx_epi32 mask_S0, mask_S1, mask_S2, mask_S3;
+ unsigned *excl_filter;
+#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
+ gmx_epi32 filter_S0, filter_S1, filter_S2, filter_S3;
#else
- gmx_mm_pr mask_S0, mask_S1, mask_S2, mask_S3;
+ gmx_mm_pr filter_S0, filter_S1, filter_S2, filter_S3;
#endif
gmx_mm_pr zero_S = gmx_set1_pr(0);
#ifndef TAB_FDV0
const real *tab_coul_V;
#endif
-#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#if GMX_SIMD_WIDTH_HERE >= 8 || (defined GMX_DOUBLE && GMX_SIMD_WIDTH_HERE >= 4)
+#define STORE_TABLE_INDICES
+#endif
+#ifdef STORE_TABLE_INDICES
int ti0_array[2*GMX_SIMD_WIDTH_HERE-1], *ti0;
int ti1_array[2*GMX_SIMD_WIDTH_HERE-1], *ti1;
int ti2_array[2*GMX_SIMD_WIDTH_HERE-1], *ti2;
int ti3_array[2*GMX_SIMD_WIDTH_HERE-1], *ti3;
+#else
+    /* Table indices are not used, but the load_table functions still take the argument */
+ int *ti0 = NULL, *ti1 = NULL, *ti2 = NULL, *ti3 = NULL;
#endif
#ifdef CALC_ENERGIES
gmx_mm_pr mhalfsp_S;
ljc = nbat->lj_comb;
#else
/* No combination rule used */
-#ifndef GMX_DOUBLE
+#if NBFP_STRIDE == 2
+ nbfp_ptr = nbat->nbfp;
+#else
+#if NBFP_STRIDE == 4
nbfp_ptr = nbat->nbfp_s4;
-#define NBFP_STRIDE 4
#else
- nbfp_ptr = nbat->nbfp;
-#define NBFP_STRIDE 2
+#error "Only NBFP_STRIDE 2 and 4 are currently supported"
+#endif
#endif
nbfp_stride = NBFP_STRIDE;
#endif
/* Load j-i for the first i */
- diag_jmi_S = gmx_load_pr(nbat->simd_4xn_diag);
+ diagonal_jmi_S = gmx_load_pr(nbat->simd_4xn_diagonal_j_minus_i);
/* Generate all the diagonal masks as comparison results */
#if UNROLLI == UNROLLJ
- diag_S0 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag_S1 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag_S2 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag_S3 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diagonal_mask_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
+ diagonal_mask_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
+ diagonal_mask_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
+ diagonal_mask_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
#else
#if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
- diag0_S0 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag0_S1 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag0_S2 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag0_S3 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diagonal_mask0_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
+ diagonal_mask0_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
+ diagonal_mask0_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
+ diagonal_mask0_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
#if UNROLLI == 2*UNROLLJ
/* Load j-i for the second half of the j-cluster */
- diag_jmi_S = gmx_load_pr(nbat->simd_4xn_diag+UNROLLJ);
+ diagonal_jmi_S = gmx_load_pr(nbat->simd_4xn_diagonal_j_minus_i + UNROLLJ);
#endif
- diag1_S0 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag1_S1 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag1_S2 = gmx_cmplt_pr(zero_S, diag_jmi_S);
- diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
- diag1_S3 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diagonal_mask1_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
+ diagonal_mask1_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
+ diagonal_mask1_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
+ diagonal_jmi_S = gmx_sub_pr(diagonal_jmi_S, one_S);
+ diagonal_mask1_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
#endif
#endif
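/* Editorial note, not part of this change: per SIMD lane the masks built
 * above encode the scalar condition used when the i- and j-cluster are the
 * same, i.e. a pair (i,j) within one cluster only interacts when j > i:
 *
 *     interact = (0 < (real)(j - i));
 *
 * diagonal_jmi_S holds j-i for the first i and is decremented by one per
 * further i-row, so the same compare against zero produces each row's mask.
 */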
/* Load masks for topology exclusion masking */
-#ifdef gmx_checkbitmask_epi32
- mask_S0 = gmx_load_si(nbat->simd_excl_mask + 0*GMX_NBNXN_SIMD_BITWIDTH/32);
- mask_S1 = gmx_load_si(nbat->simd_excl_mask + 1*GMX_NBNXN_SIMD_BITWIDTH/32);
- mask_S2 = gmx_load_si(nbat->simd_excl_mask + 2*GMX_NBNXN_SIMD_BITWIDTH/32);
- mask_S3 = gmx_load_si(nbat->simd_excl_mask + 3*GMX_NBNXN_SIMD_BITWIDTH/32);
+#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
+#define FILTER_STRIDE (GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE)
+#else
+#ifdef GMX_DOUBLE
+#define FILTER_STRIDE 2
+#else
+#define FILTER_STRIDE 1
+#endif
+#endif
+#if FILTER_STRIDE == 1
+ excl_filter = nbat->simd_exclusion_filter1;
#else
- mask_S0 = gmx_load_pr((real *)nbat->simd_excl_mask + 0*UNROLLJ);
- mask_S1 = gmx_load_pr((real *)nbat->simd_excl_mask + 1*UNROLLJ);
- mask_S2 = gmx_load_pr((real *)nbat->simd_excl_mask + 2*UNROLLJ);
- mask_S3 = gmx_load_pr((real *)nbat->simd_excl_mask + 3*UNROLLJ);
+ excl_filter = nbat->simd_exclusion_filter2;
+#endif
+ /* Here we cast the exclusion filters from unsigned * to int * or real *.
+ * Since we only check bits, the actual value they represent does not
+ * matter, as long as both filter and mask data are treated the same way.
+ */
+#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
+ filter_S0 = gmx_load_si((int *)excl_filter + 0*UNROLLJ*FILTER_STRIDE);
+ filter_S1 = gmx_load_si((int *)excl_filter + 1*UNROLLJ*FILTER_STRIDE);
+ filter_S2 = gmx_load_si((int *)excl_filter + 2*UNROLLJ*FILTER_STRIDE);
+ filter_S3 = gmx_load_si((int *)excl_filter + 3*UNROLLJ*FILTER_STRIDE);
+#else
+ filter_S0 = gmx_load_pr((real *)excl_filter + 0*UNROLLJ);
+ filter_S1 = gmx_load_pr((real *)excl_filter + 1*UNROLLJ);
+ filter_S2 = gmx_load_pr((real *)excl_filter + 2*UNROLLJ);
+ filter_S3 = gmx_load_pr((real *)excl_filter + 3*UNROLLJ);
#endif
+#undef FILTER_STRIDE
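/* Editorial sketch, not part of this change: the filters loaded above are
 * used for a per-lane bit test against the pair-list interaction mask.
 * In scalar form (lane is a hypothetical lane index):
 *
 *     unsigned int mask     = nbl->cj[cjind].excl;
 *     unsigned int filter   = excl_filter[lane];
 *     gmx_bool     interact = ((mask & filter) != 0);
 *
 * The kernels do this test on whole registers (gmx_checkbitmask_epi32 or
 * its real-register equivalent), which is why casting the filter data to
 * int or real is harmless: only the bit pattern is compared.
 */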
#ifdef CALC_COUL_TAB
-#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#ifdef STORE_TABLE_INDICES
/* Generate aligned table index pointers */
ti0 = gmx_simd_align_int(ti0_array);
ti1 = gmx_simd_align_int(ti1_array);
#define CALC_COULOMB
#define HALF_LJ
#define CHECK_EXCLS
- while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+ while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
{
#include "nbnxn_kernel_simd_4xn_inner.h"
cjind++;
{
#define CALC_COULOMB
#define CHECK_EXCLS
- while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+ while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
{
#include "nbnxn_kernel_simd_4xn_inner.h"
cjind++;
else
{
#define CHECK_EXCLS
- while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+ while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
{
#include "nbnxn_kernel_simd_4xn_inner.h"
cjind++;
/* Add accumulated i-forces to the force array */
#if UNROLLJ >= 4
-#ifndef GMX_DOUBLE
-#define gmx_load_pr4 _mm_load_ps
-#define gmx_store_pr4 _mm_store_ps
-#define gmx_add_pr4 _mm_add_ps
-#else
-#define gmx_load_pr4 _mm256_load_pd
-#define gmx_store_pr4 _mm256_store_pd
-#define gmx_add_pr4 _mm256_add_pd
-#endif
- GMX_MM_TRANSPOSE_SUM4_PR(fix_S0, fix_S1, fix_S2, fix_S3, fix_S);
+ fix_S = gmx_mm_transpose_sum4_pr(fix_S0, fix_S1, fix_S2, fix_S3);
gmx_store_pr4(f+scix, gmx_add_pr4(fix_S, gmx_load_pr4(f+scix)));
- GMX_MM_TRANSPOSE_SUM4_PR(fiy_S0, fiy_S1, fiy_S2, fiy_S3, fiy_S);
+ fiy_S = gmx_mm_transpose_sum4_pr(fiy_S0, fiy_S1, fiy_S2, fiy_S3);
gmx_store_pr4(f+sciy, gmx_add_pr4(fiy_S, gmx_load_pr4(f+sciy)));
- GMX_MM_TRANSPOSE_SUM4_PR(fiz_S0, fiz_S1, fiz_S2, fiz_S3, fiz_S);
+ fiz_S = gmx_mm_transpose_sum4_pr(fiz_S0, fiz_S1, fiz_S2, fiz_S3);
gmx_store_pr4(f+sciz, gmx_add_pr4(fiz_S, gmx_load_pr4(f+sciz)));
#ifdef CALC_SHIFTFORCES
fshift[ish3+2] += SUM_SIMD4(shf);
#endif
#else
- GMX_MM_TRANSPOSE_SUM2_PD(fix_S0, fix_S1, fix0_S);
- _mm_store_pd(f+scix, _mm_add_pd(fix0_S, _mm_load_pd(f+scix)));
- GMX_MM_TRANSPOSE_SUM2_PD(fix_S2, fix_S3, fix2_S);
- _mm_store_pd(f+scix+2, _mm_add_pd(fix2_S, _mm_load_pd(f+scix+2)));
+ fix0_S = gmx_mm_transpose_sum2_pr(fix_S0, fix_S1);
+ gmx_store_pr(f+scix, gmx_add_pr(fix0_S, gmx_load_pr(f+scix)));
+ fix2_S = gmx_mm_transpose_sum2_pr(fix_S2, fix_S3);
+ gmx_store_pr(f+scix+2, gmx_add_pr(fix2_S, gmx_load_pr(f+scix+2)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiy_S0, fiy_S1, fiy0_S);
- _mm_store_pd(f+sciy, _mm_add_pd(fiy0_S, _mm_load_pd(f+sciy)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiy_S2, fiy_S3, fiy2_S);
- _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_S, _mm_load_pd(f+sciy+2)));
+ fiy0_S = gmx_mm_transpose_sum2_pr(fiy_S0, fiy_S1);
+ gmx_store_pr(f+sciy, gmx_add_pr(fiy0_S, gmx_load_pr(f+sciy)));
+ fiy2_S = gmx_mm_transpose_sum2_pr(fiy_S2, fiy_S3);
+ gmx_store_pr(f+sciy+2, gmx_add_pr(fiy2_S, gmx_load_pr(f+sciy+2)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiz_S0, fiz_S1, fiz0_S);
- _mm_store_pd(f+sciz, _mm_add_pd(fiz0_S, _mm_load_pd(f+sciz)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiz_S2, fiz_S3, fiz2_S);
- _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_S, _mm_load_pd(f+sciz+2)));
+ fiz0_S = gmx_mm_transpose_sum2_pr(fiz_S0, fiz_S1);
+ gmx_store_pr(f+sciz, gmx_add_pr(fiz0_S, gmx_load_pr(f+sciz)));
+ fiz2_S = gmx_mm_transpose_sum2_pr(fiz_S2, fiz_S3);
+ gmx_store_pr(f+sciz+2, gmx_add_pr(fiz2_S, gmx_load_pr(f+sciz+2)));
#ifdef CALC_SHIFTFORCES
- _mm_store_pd(shf, _mm_add_pd(fix0_S, fix2_S));
+ gmx_store_pr(shf, gmx_add_pr(fix0_S, fix2_S));
fshift[ish3+0] += shf[0] + shf[1];
- _mm_store_pd(shf, _mm_add_pd(fiy0_S, fiy2_S));
+ gmx_store_pr(shf, gmx_add_pr(fiy0_S, fiy2_S));
fshift[ish3+1] += shf[0] + shf[1];
- _mm_store_pd(shf, _mm_add_pd(fiz0_S, fiz2_S));
+ gmx_store_pr(shf, gmx_add_pr(fiz0_S, fiz2_S));
fshift[ish3+2] += shf[0] + shf[1];
#endif
#endif
}
+#if UNROLLJ == 4
+#undef gmx_mm_pr4
#undef gmx_load_pr4
#undef gmx_store_pr4
#undef gmx_add_pr4
+#endif
+
+#undef STORE_TABLE_INDICES
#undef CALC_SHIFTFORCES
* energy group pair energy storage
*/
-#ifdef GMX_X86_SSE2
-
-/* Transpose 2 double precision registers */
-#define GMX_MM_TRANSPOSE2_OP_PD(in0, in1, out0, out1) \
- { \
- out0 = _mm_unpacklo_pd(in0, in1); \
- out1 = _mm_unpackhi_pd(in0, in1); \
- }
-#if GMX_NBNXN_SIMD_BITWIDTH == 128 || !defined GMX_DOUBLE
-/* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
-#define GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(in0, in1, in2, in3, out0, out1) \
- { \
- __m128 _c01, _c23; \
- _c01 = _mm_movelh_ps(in0, in1); \
- _c23 = _mm_movelh_ps(in2, in3); \
- out0 = _mm_shuffle_ps(_c01, _c23, _MM_SHUFFLE(2, 0, 2, 0)); \
- out1 = _mm_shuffle_ps(_c01, _c23, _MM_SHUFFLE(3, 1, 3, 1)); \
- }
+/* Include SIMD architecture specific versions of the 4/5 functions above */
+#ifdef GMX_SIMD_REFERENCE_PLAIN_C
+#include "nbnxn_kernel_simd_utils_ref.h"
#else
-/* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
-#define GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(in0, in1, in2, in3, out0, out1) \
- { \
- __m256d _c01, _c23; \
- _c01 = _mm256_shuffle_pd(in0, in1, _MM_SHUFFLE(1, 0, 1, 0)); \
- _c23 = _mm256_shuffle_pd(in2, in3, _MM_SHUFFLE(1, 0, 1, 0)); \
- out0 = _mm256_shuffle_pd(_c01, _c23, _MM_SHUFFLE(2, 0, 2, 0)); \
- out1 = _mm256_shuffle_pd(_c01, _c23, _MM_SHUFFLE(3, 1, 3, 1)); \
- }
-#endif
-
-/* Collect element 2 of the 4 inputs to out */
-#define GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(in0, in1, in2, in3, out) \
- { \
- __m128 _c01, _c23; \
- _c01 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 2, 3, 2)); \
- _c23 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 2, 3, 2)); \
- out = _mm_shuffle_ps(_c01, _c23, _MM_SHUFFLE(2, 0, 2, 0)); \
- }
-
-#if GMX_NBNXN_SIMD_BITWIDTH == 128
-#ifndef GMX_DOUBLE
-/* Sum the elements within each input register and store the sums in out */
-#define GMX_MM_TRANSPOSE_SUM4_PR(in0, in1, in2, in3, out) \
- { \
- _MM_TRANSPOSE4_PS(in0, in1, in2, in3); \
- in0 = _mm_add_ps(in0, in1); \
- in2 = _mm_add_ps(in2, in3); \
- out = _mm_add_ps(in0, in2); \
- }
+#ifdef GMX_X86_SSE2
+/* Include x86 SSE2 compatible SIMD functions */
+#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
+#ifdef GMX_DOUBLE
+#include "nbnxn_kernel_simd_utils_x86_256d.h"
#else
-/* Sum the elements within each input register and store the sums in out */
-#define GMX_MM_TRANSPOSE_SUM2_PD(in0, in1, out) \
- { \
- GMX_MM_TRANSPOSE2_PD(in0, in1); \
- out = _mm_add_pd(in0, in1); \
- }
+#include "nbnxn_kernel_simd_utils_x86_256s.h"
#endif
#else
-#ifndef GMX_DOUBLE
-/* Sum the elements within each input register and store the sums in out */
-#define GMX_MM_TRANSPOSE_SUM4_PR(in0, in1, in2, in3, out) \
- { \
- in0 = _mm256_hadd_ps(in0, in1); \
- in2 = _mm256_hadd_ps(in2, in3); \
- in1 = _mm256_hadd_ps(in0, in2); \
- out = _mm_add_ps(_mm256_castps256_ps128(in1), _mm256_extractf128_ps(in1, 1)); \
- }
-/* Sum the elements of halfs of each input register and store sums in out */
-#define GMX_MM_TRANSPOSE_SUM4H_PR(in0, in2, out) \
- { \
- in0 = _mm256_hadd_ps(in0, _mm256_setzero_ps()); \
- in2 = _mm256_hadd_ps(in2, _mm256_setzero_ps()); \
- in0 = _mm256_hadd_ps(in0, in2); \
- in2 = _mm256_permute_ps(in0, _MM_SHUFFLE(2, 3, 0, 1)); \
- out = _mm_add_ps(_mm256_castps256_ps128(in0), _mm256_extractf128_ps(in2, 1)); \
- }
+#ifdef GMX_DOUBLE
+#include "nbnxn_kernel_simd_utils_x86_128d.h"
#else
-/* Sum the elements within each input register and store the sums in out */
-#define GMX_MM_TRANSPOSE_SUM4_PR(in0, in1, in2, in3, out) \
- { \
- in0 = _mm256_hadd_pd(in0, in1); \
- in2 = _mm256_hadd_pd(in2, in3); \
- out = _mm256_add_pd(_mm256_permute2f128_pd(in0, in2, 0x20), _mm256_permute2f128_pd(in0, in2, 0x31)); \
- }
-#endif
-#endif
-
-#if GMX_NBNXN_SIMD_BITWIDTH == 128
-
-static inline __m128
-gmx_mm128_invsqrt_ps_single(__m128 x)
-{
- const __m128 half = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
- const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);
-
- __m128 lu = _mm_rsqrt_ps(x);
-
- return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
-}
-
-/* Do 2 double precision invsqrt operations.
- * Doing the SIMD rsqrt and the first Newton Raphson iteration
- * in single precision gives full double precision accuracy.
- * The speed is more than double that of two gmx_mm_invsqrt_pd calls.
- */
-#define GMX_MM128_INVSQRT2_PD(in0, in1, out0, out1) \
- { \
- const __m128d half = _mm_set1_pd(0.5); \
- const __m128d three = _mm_set1_pd(3.0); \
- __m128 s, ir; \
- __m128d lu0, lu1; \
- \
- s = _mm_movelh_ps(_mm_cvtpd_ps(in0), _mm_cvtpd_ps(in1)); \
- ir = gmx_mm128_invsqrt_ps_single(s); \
- lu0 = _mm_cvtps_pd(ir); \
- lu1 = _mm_cvtps_pd(_mm_movehl_ps(ir, ir)); \
- out0 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu0, lu0), in0)), lu0)); \
- out1 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu1, lu1), in1)), lu1)); \
- }
-
-#define GMX_MM_INVSQRT2_PD GMX_MM128_INVSQRT2_PD
-
-#endif
-
-#if GMX_NBNXN_SIMD_BITWIDTH == 256
-
-static inline __m256
-gmx_mm256_invsqrt_ps_single(__m256 x)
-{
- const __m256 half = _mm256_set_ps(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5);
- const __m256 three = _mm256_set_ps(3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0);
-
- __m256 lu = _mm256_rsqrt_ps(x);
-
- return _mm256_mul_ps(half, _mm256_mul_ps(_mm256_sub_ps(three, _mm256_mul_ps(_mm256_mul_ps(lu, lu), x)), lu));
-}
-
-/* Do 4 double precision invsqrt operations.
- * Doing the SIMD rsqrt and the first Newton Raphson iteration
- * in single precision gives full double precision accuracy.
- */
-#define GMX_MM256_INVSQRT2_PD(in0, in1, out0, out1) \
- { \
- const __m256d half = _mm256_set1_pd(0.5); \
- const __m256d three = _mm256_set1_pd(3.0); \
- __m256 s, ir; \
- __m256d lu0, lu1; \
- \
- s = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(in0)), _mm256_cvtpd_ps(in1), 1); \
- ir = gmx_mm256_invsqrt_ps_single(s); \
- lu0 = _mm256_cvtps_pd(_mm256_castps256_ps128(ir)); \
- lu1 = _mm256_cvtps_pd(_mm256_extractf128_ps(ir, 1)); \
- out0 = _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu0, lu0), in0)), lu0)); \
- out1 = _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu1, lu1), in1)), lu1)); \
- }
-
-#define GMX_MM_INVSQRT2_PD GMX_MM256_INVSQRT2_PD
-
-#endif
-
-/* Force and energy table load and interpolation routines */
-
-#if GMX_NBNXN_SIMD_BITWIDTH == 128 && !defined GMX_DOUBLE
-
-#define load_lj_pair_params(nbfp, type, aj, c6_SSE, c12_SSE) \
- { \
- gmx_mm_pr clj_SSE[UNROLLJ]; \
- int p; \
- \
- for (p = 0; p < UNROLLJ; p++) \
- { \
- /* Here we load 4 aligned floats, but we need just 2 */ \
- clj_SSE[p] = gmx_load_pr(nbfp+type[aj+p]*NBFP_STRIDE); \
- } \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[0], clj_SSE[1], clj_SSE[2], clj_SSE[3], c6_SSE, c12_SSE); \
- }
-
-#endif
-
-#if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
-
-/* Put two 128-bit 4-float registers into one 256-bit 8-float register */
-#define GMX_2_MM_TO_M256(in0, in1, out) \
- { \
- out = _mm256_insertf128_ps(_mm256_castps128_ps256(in0), in1, 1); \
- }
-
-#define load_lj_pair_params(nbfp, type, aj, c6_SSE, c12_SSE) \
- { \
- __m128 clj_SSE[UNROLLJ], c6t_SSE[2], c12t_SSE[2]; \
- int p; \
- \
- for (p = 0; p < UNROLLJ; p++) \
- { \
- /* Here we load 4 aligned floats, but we need just 2 */ \
- clj_SSE[p] = _mm_load_ps(nbfp+type[aj+p]*NBFP_STRIDE); \
- } \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[0], clj_SSE[1], clj_SSE[2], clj_SSE[3], c6t_SSE[0], c12t_SSE[0]); \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[4], clj_SSE[5], clj_SSE[6], clj_SSE[7], c6t_SSE[1], c12t_SSE[1]); \
- \
- GMX_2_MM_TO_M256(c6t_SSE[0], c6t_SSE[1], c6_SSE); \
- GMX_2_MM_TO_M256(c12t_SSE[0], c12t_SSE[1], c12_SSE); \
- }
-
-#define load_lj_pair_params2(nbfp0, nbfp1, type, aj, c6_SSE, c12_SSE) \
- { \
- __m128 clj_SSE0[UNROLLJ], clj_SSE1[UNROLLJ], c6t_SSE[2], c12t_SSE[2]; \
- int p; \
- \
- for (p = 0; p < UNROLLJ; p++) \
- { \
- /* Here we load 4 aligned floats, but we need just 2 */ \
- clj_SSE0[p] = _mm_load_ps(nbfp0+type[aj+p]*NBFP_STRIDE); \
- } \
- for (p = 0; p < UNROLLJ; p++) \
- { \
- /* Here we load 4 aligned floats, but we need just 2 */ \
- clj_SSE1[p] = _mm_load_ps(nbfp1+type[aj+p]*NBFP_STRIDE); \
- } \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE0[0], clj_SSE0[1], clj_SSE0[2], clj_SSE0[3], c6t_SSE[0], c12t_SSE[0]); \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE1[0], clj_SSE1[1], clj_SSE1[2], clj_SSE1[3], c6t_SSE[1], c12t_SSE[1]); \
- \
- GMX_2_MM_TO_M256(c6t_SSE[0], c6t_SSE[1], c6_SSE); \
- GMX_2_MM_TO_M256(c12t_SSE[0], c12t_SSE[1], c12_SSE); \
- }
-
+#include "nbnxn_kernel_simd_utils_x86_128s.h"
#endif
-
-#if GMX_NBNXN_SIMD_BITWIDTH == 128 && defined GMX_DOUBLE
-
-#define load_lj_pair_params(nbfp, type, aj, c6_SSE, c12_SSE) \
- { \
- gmx_mm_pr clj_SSE[UNROLLJ]; \
- int p; \
- \
- for (p = 0; p < UNROLLJ; p++) \
- { \
- clj_SSE[p] = gmx_load_pr(nbfp+type[aj+p]*NBFP_STRIDE); \
- } \
- GMX_MM_TRANSPOSE2_OP_PD(clj_SSE[0], clj_SSE[1], c6_SSE, c12_SSE); \
- }
-
#endif
-
-#if GMX_NBNXN_SIMD_BITWIDTH == 256 && defined GMX_DOUBLE
-
-#define load_lj_pair_params(nbfp, type, aj, c6_SSE, c12_SSE) \
- { \
- __m128d clj_SSE[UNROLLJ], c6t_SSE[2], c12t_SSE[2]; \
- int p; \
- \
- for (p = 0; p < UNROLLJ; p++) \
- { \
- clj_SSE[p] = _mm_load_pd(nbfp+type[aj+p]*NBFP_STRIDE); \
- } \
- GMX_MM_TRANSPOSE2_OP_PD(clj_SSE[0], clj_SSE[1], c6t_SSE[0], c12t_SSE[0]); \
- GMX_MM_TRANSPOSE2_OP_PD(clj_SSE[2], clj_SSE[3], c6t_SSE[1], c12t_SSE[1]); \
- GMX_2_M128D_TO_M256D(c6t_SSE[0], c6t_SSE[1], c6_SSE); \
- GMX_2_M128D_TO_M256D(c12t_SSE[0], c12t_SSE[1], c12_SSE); \
- }
-
#endif
-
-
-/* The load_table functions below are performance critical.
- * The routines issue UNROLLI*UNROLLJ _mm_load_ps calls.
- * As these all have latencies, scheduling is crucial.
- * The Intel compilers and CPUs seem to do a good job at this.
- * But AMD CPUs perform significantly worse with gcc than with icc.
- * Performance is improved a bit by using the extract function UNROLLJ times,
- * instead of doing an _mm_store_si128 for every i-particle.
- * This is only faster when we use FDV0 formatted tables, where we also need
- * to multiple the index by 4, which can be done by a SIMD bit shift.
- * With single precision AVX, 8 extracts are much slower than 1 store.
- * Because of this, the load_table_f macro always takes the ti parameter,
- * but it is only used with AVX.
- */
-
-#if GMX_NBNXN_SIMD_BITWIDTH == 128 && !defined GMX_DOUBLE
-
-#define load_table_f(tab_coul_FDV0, ti_SSE, ti, ctab0_SSE, ctab1_SSE) \
- { \
- int idx[4]; \
- __m128 ctab_SSE[4]; \
- \
- /* Table has 4 entries, left-shift index by 2 */ \
- ti_SSE = _mm_slli_epi32(ti_SSE, 2); \
- /* Without SSE4.1 the extract macro needs an immediate: unroll */ \
- idx[0] = gmx_mm_extract_epi32(ti_SSE, 0); \
- ctab_SSE[0] = _mm_load_ps(tab_coul_FDV0+idx[0]); \
- idx[1] = gmx_mm_extract_epi32(ti_SSE, 1); \
- ctab_SSE[1] = _mm_load_ps(tab_coul_FDV0+idx[1]); \
- idx[2] = gmx_mm_extract_epi32(ti_SSE, 2); \
- ctab_SSE[2] = _mm_load_ps(tab_coul_FDV0+idx[2]); \
- idx[3] = gmx_mm_extract_epi32(ti_SSE, 3); \
- ctab_SSE[3] = _mm_load_ps(tab_coul_FDV0+idx[3]); \
- \
- /* Shuffle the force table entries to a convenient order */ \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[0], ctab_SSE[1], ctab_SSE[2], ctab_SSE[3], ctab0_SSE, ctab1_SSE); \
- }
-
-#define load_table_f_v(tab_coul_FDV0, ti_SSE, ti, ctab0_SSE, ctab1_SSE, ctabv_SSE) \
- { \
- int idx[4]; \
- __m128 ctab_SSE[4]; \
- \
- /* Table has 4 entries, left-shift index by 2 */ \
- ti_SSE = _mm_slli_epi32(ti_SSE, 2); \
- /* Without SSE4.1 the extract macro needs an immediate: unroll */ \
- idx[0] = gmx_mm_extract_epi32(ti_SSE, 0); \
- ctab_SSE[0] = _mm_load_ps(tab_coul_FDV0+idx[0]); \
- idx[1] = gmx_mm_extract_epi32(ti_SSE, 1); \
- ctab_SSE[1] = _mm_load_ps(tab_coul_FDV0+idx[1]); \
- idx[2] = gmx_mm_extract_epi32(ti_SSE, 2); \
- ctab_SSE[2] = _mm_load_ps(tab_coul_FDV0+idx[2]); \
- idx[3] = gmx_mm_extract_epi32(ti_SSE, 3); \
- ctab_SSE[3] = _mm_load_ps(tab_coul_FDV0+idx[3]); \
- \
- /* Shuffle the force table entries to a convenient order */ \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[0], ctab_SSE[1], ctab_SSE[2], ctab_SSE[3], ctab0_SSE, ctab1_SSE); \
- /* Shuffle the energy table entries to a convenient order */ \
- GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(ctab_SSE[0], ctab_SSE[1], ctab_SSE[2], ctab_SSE[3], ctabv_SSE); \
- }
-
#endif
-#if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
-
-#define load_table_f(tab_coul_FDV0, ti_SSE, ti, ctab0_SSE, ctab1_SSE) \
- { \
- __m128 ctab_SSE[8], ctabt_SSE[4]; \
- int j; \
- \
- /* Bit shifting would be faster, but AVX doesn't support that */ \
- _mm256_store_si256((__m256i *)ti, ti_SSE); \
- for (j = 0; j < 8; j++) \
- { \
- ctab_SSE[j] = _mm_load_ps(tab_coul_FDV0+ti[j]*4); \
- } \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[0], ctab_SSE[1], ctab_SSE[2], ctab_SSE[3], ctabt_SSE[0], ctabt_SSE[2]); \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[4], ctab_SSE[5], ctab_SSE[6], ctab_SSE[7], ctabt_SSE[1], ctabt_SSE[3]); \
- \
- GMX_2_MM_TO_M256(ctabt_SSE[0], ctabt_SSE[1], ctab0_SSE); \
- GMX_2_MM_TO_M256(ctabt_SSE[2], ctabt_SSE[3], ctab1_SSE); \
- }
-#define load_table_f_v(tab_coul_FDV0, ti_SSE, ti, ctab0_SSE, ctab1_SSE, ctabv_SSE) \
- { \
- __m128 ctab_SSE[8], ctabt_SSE[4], ctabvt_SSE[2]; \
- int j; \
- \
- /* Bit shifting would be faster, but AVX doesn't support that */ \
- _mm256_store_si256((__m256i *)ti, ti_SSE); \
- for (j = 0; j < 8; j++) \
- { \
- ctab_SSE[j] = _mm_load_ps(tab_coul_FDV0+ti[j]*4); \
- } \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[0], ctab_SSE[1], ctab_SSE[2], ctab_SSE[3], ctabt_SSE[0], ctabt_SSE[2]); \
- GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(ctab_SSE[4], ctab_SSE[5], ctab_SSE[6], ctab_SSE[7], ctabt_SSE[1], ctabt_SSE[3]); \
- \
- GMX_2_MM_TO_M256(ctabt_SSE[0], ctabt_SSE[1], ctab0_SSE); \
- GMX_2_MM_TO_M256(ctabt_SSE[2], ctabt_SSE[3], ctab1_SSE); \
- \
- GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(ctab_SSE[0], ctab_SSE[1], ctab_SSE[2], ctab_SSE[3], ctabvt_SSE[0]); \
- GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(ctab_SSE[4], ctab_SSE[5], ctab_SSE[6], ctab_SSE[7], ctabvt_SSE[1]); \
- \
- GMX_2_MM_TO_M256(ctabvt_SSE[0], ctabvt_SSE[1], ctabv_SSE); \
- }
-
-#endif
-
-#if GMX_NBNXN_SIMD_BITWIDTH == 128 && defined GMX_DOUBLE
-
-#define load_table_f(tab_coul_F, ti_SSE, ti, ctab0_SSE, ctab1_SSE) \
- { \
- int idx[2]; \
- __m128d ctab_SSE[2]; \
- \
- /* Without SSE4.1 the extract macro needs an immediate: unroll */ \
- idx[0] = gmx_mm_extract_epi32(ti_SSE, 0); \
- ctab_SSE[0] = _mm_loadu_pd(tab_coul_F+idx[0]); \
- idx[1] = gmx_mm_extract_epi32(ti_SSE, 1); \
- ctab_SSE[1] = _mm_loadu_pd(tab_coul_F+idx[1]); \
- \
- /* Shuffle the force table entries to a convenient order */ \
- GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[0], ctab_SSE[1], ctab0_SSE, ctab1_SSE); \
- /* The second force table entry should contain the difference */ \
- ctab1_SSE = _mm_sub_pd(ctab1_SSE, ctab0_SSE); \
- }
-
-#define load_table_f_v(tab_coul_F, tab_coul_V, ti_SSE, ti, ctab0_SSE, ctab1_SSE, ctabv_SSE) \
- { \
- int idx[2]; \
- __m128d ctab_SSE[4]; \
- \
- /* Without SSE4.1 the extract macro needs an immediate: unroll */ \
- idx[0] = gmx_mm_extract_epi32(ti_SSE, 0); \
- ctab_SSE[0] = _mm_loadu_pd(tab_coul_F+idx[0]); \
- idx[1] = gmx_mm_extract_epi32(ti_SSE, 1); \
- ctab_SSE[1] = _mm_loadu_pd(tab_coul_F+idx[1]); \
- \
- /* Shuffle the force table entries to a convenient order */ \
- GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[0], ctab_SSE[1], ctab0_SSE, ctab1_SSE); \
- /* The second force table entry should contain the difference */ \
- ctab1_SSE = _mm_sub_pd(ctab1_SSE, ctab0_SSE); \
- \
- ctab_SSE[2] = _mm_loadu_pd(tab_coul_V+idx[0]); \
- ctab_SSE[3] = _mm_loadu_pd(tab_coul_V+idx[1]); \
- \
- /* Shuffle the energy table entries to a single register */ \
- ctabv_SSE = _mm_shuffle_pd(ctab_SSE[2], ctab_SSE[3], _MM_SHUFFLE2(0, 0)); \
- }
-
-#endif
-
-#if GMX_NBNXN_SIMD_BITWIDTH == 256 && defined GMX_DOUBLE
-
-/* Put two 128-bit 2-double registers into one 256-bit 4-ouble register */
-#define GMX_2_M128D_TO_M256D(in0, in1, out) \
- { \
- out = _mm256_insertf128_pd(_mm256_castpd128_pd256(in0), in1, 1); \
- }
-
-#define load_table_f(tab_coul_F, ti_SSE, ti, ctab0_SSE, ctab1_SSE) \
- { \
- __m128d ctab_SSE[4], tr_SSE[4]; \
- int j; \
- \
- _mm_store_si128((__m128i *)ti, ti_SSE); \
- for (j = 0; j < 4; j++) \
- { \
- ctab_SSE[j] = _mm_loadu_pd(tab_coul_F+ti[j]); \
- } \
- /* Shuffle the force table entries to a convenient order */ \
- GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[0], ctab_SSE[1], tr_SSE[0], tr_SSE[1]); \
- GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[2], ctab_SSE[3], tr_SSE[2], tr_SSE[3]); \
- GMX_2_M128D_TO_M256D(tr_SSE[0], tr_SSE[2], ctab0_SSE); \
- GMX_2_M128D_TO_M256D(tr_SSE[1], tr_SSE[3], ctab1_SSE); \
- /* The second force table entry should contain the difference */ \
- ctab1_SSE = _mm256_sub_pd(ctab1_SSE, ctab0_SSE); \
- }
-
-#define load_table_f_v(tab_coul_F, tab_coul_V, ti_SSE, ti, ctab0_SSE, ctab1_SSE, ctabv_SSE) \
- { \
- __m128d ctab_SSE[8], tr_SSE[4]; \
- int j; \
- \
- _mm_store_si128((__m128i *)ti, ti_SSE); \
- for (j = 0; j < 4; j++) \
- { \
- ctab_SSE[j] = _mm_loadu_pd(tab_coul_F+ti[j]); \
- } \
- /* Shuffle the force table entries to a convenient order */ \
- GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[0], ctab_SSE[1], tr_SSE[0], tr_SSE[1]); \
- GMX_MM_TRANSPOSE2_OP_PD(ctab_SSE[2], ctab_SSE[3], tr_SSE[2], tr_SSE[3]); \
- GMX_2_M128D_TO_M256D(tr_SSE[0], tr_SSE[2], ctab0_SSE); \
- GMX_2_M128D_TO_M256D(tr_SSE[1], tr_SSE[3], ctab1_SSE); \
- /* The second force table entry should contain the difference */ \
- ctab1_SSE = _mm256_sub_pd(ctab1_SSE, ctab0_SSE); \
- \
- for (j = 0; j < 4; j++) \
- { \
- ctab_SSE[4+j] = _mm_loadu_pd(tab_coul_V+ti[j]); \
- } \
- /* Shuffle the energy table entries to a single register */ \
- GMX_2_M128D_TO_M256D(_mm_shuffle_pd(ctab_SSE[4], ctab_SSE[5], _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(ctab_SSE[6], ctab_SSE[7], _MM_SHUFFLE2(0, 0)), ctabv_SSE); \
- }
-
-#endif
-
-
-/* Add energy register to possibly multiple terms in the energy array.
- * This function is the same for SSE/AVX single/double.
- */
-static inline void add_ener_grp(gmx_mm_pr e_SSE, real *v, const int *offset_jj)
+#ifdef UNROLLJ
+/* Add energy register to possibly multiple terms in the energy array */
+static inline void add_ener_grp(gmx_mm_pr e_S, real *v, const int *offset_jj)
{
int jj;
*/
for (jj = 0; jj < (UNROLLJ/2); jj++)
{
- gmx_mm_pr v_SSE;
+ gmx_mm_pr v_S;
- v_SSE = gmx_load_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE);
- gmx_store_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE, gmx_add_pr(v_SSE, e_SSE));
+ v_S = gmx_load_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE);
+ gmx_store_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE, gmx_add_pr(v_S, e_S));
}
}
+#endif
-#if defined GMX_X86_AVX_256 && GMX_SIMD_WIDTH_HERE == 8 && defined gmx_mm_hpr
-/* As add_ener_grp above, but for two groups of UNROLLJ/2 stored in
+#if defined GMX_NBNXN_SIMD_2XNN && defined UNROLLJ
+/* As add_ener_grp, but for two groups of UNROLLJ/2 stored in
* a single SIMD register.
*/
-static inline void add_ener_grp_halves(gmx_mm_pr e_SSE,
- real *v0, real *v1, const int *offset_jj)
+static inline void
+add_ener_grp_halves(gmx_mm_pr e_S, real *v0, real *v1, const int *offset_jj)
{
- gmx_mm_hpr e_SSE0, e_SSE1;
+ gmx_mm_hpr e_S0, e_S1;
int jj;
- e_SSE0 = _mm256_extractf128_ps(e_SSE, 0);
- e_SSE1 = _mm256_extractf128_ps(e_SSE, 1);
+ gmx_pr_to_2hpr(e_S, &e_S0, &e_S1);
for (jj = 0; jj < (UNROLLJ/2); jj++)
{
- gmx_mm_hpr v_SSE;
+ gmx_mm_hpr v_S;
- gmx_load_hpr(v_SSE, v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
- gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_SSE, e_SSE0));
+ gmx_load_hpr(&v_S, v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
+ gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_S, e_S0));
}
for (jj = 0; jj < (UNROLLJ/2); jj++)
{
- gmx_mm_hpr v_SSE;
+ gmx_mm_hpr v_S;
- gmx_load_hpr(v_SSE, v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
- gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_SSE, e_SSE1));
+ gmx_load_hpr(&v_S, v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
+ gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_S, e_S1));
}
}
#endif
-#endif /* GMX_X86_SSE2 */
-
#endif /* _nbnxn_kernel_sse_utils_h_ */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef _nbnxn_kernel_simd_utils_ref_h_
+#define _nbnxn_kernel_simd_utils_ref_h_
+
+/* This file contains all functions/macros for the SIMD kernels
+ * which have explicit dependencies on the j-cluster size and/or SIMD-width.
+ * The functionality which depends on the j-cluster size is:
+ * LJ-parameter lookup
+ * force table lookup
+ * energy group pair energy storage
+ */
+
+
+#if GMX_SIMD_WIDTH_HERE > 4
+/* The 4xn kernel operates on 4-wide i-force registers */
+
+/* float/double SIMD register type */
+typedef struct {
+ real r[4];
+} gmx_mm_pr4;
+
+static gmx_inline gmx_mm_pr4
+gmx_load_pr4(const real *r)
+{
+ gmx_mm_pr4 a;
+ int i;
+
+ for (i = 0; i < 4; i++)
+ {
+ a.r[i] = r[i];
+ }
+
+ return a;
+}
+
+static gmx_inline void
+gmx_store_pr4(real *dest, gmx_mm_pr4 src)
+{
+ int i;
+
+ for (i = 0; i < 4; i++)
+ {
+ dest[i] = src.r[i];
+ }
+}
+
+static gmx_inline gmx_mm_pr4
+gmx_add_pr4(gmx_mm_pr4 a, gmx_mm_pr4 b)
+{
+ gmx_mm_pr4 c;
+ int i;
+
+ for (i = 0; i < 4; i++)
+ {
+ c.r[i] = a.r[i] + b.r[i];
+ }
+
+ return c;
+}
+
+#endif
+
+
+#ifdef GMX_NBNXN_SIMD_2XNN
+
+/* Half-width operations are required for the 2xnn kernels */
+
+/* Half-width SIMD real type */
+/* float/double SIMD register type */
+typedef struct {
+ real r[GMX_SIMD_WIDTH_HERE/2];
+} gmx_mm_hpr;
+
+/* Half-width SIMD operations */
+
+/* Load reals at half-width aligned pointer b into half-width SIMD register a */
+static gmx_inline void
+gmx_load_hpr(gmx_mm_hpr *a, const real *b)
+{
+ int i;
+
+ for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+ {
+ a->r[i] = b[i];
+ }
+}
+
+/* Set all entries in half-width SIMD register *a to b */
+static gmx_inline void
+gmx_set1_hpr(gmx_mm_hpr *a, real b)
+{
+ int i;
+
+ for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+ {
+ a->r[i] = b;
+ }
+}
+
+/* Load one real at b and one real at b+1 into halves of a, respectively */
+static gmx_inline void
+gmx_load1p1_pr(gmx_simd_ref_pr *a, const real *b)
+{
+ int i;
+
+ for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+ {
+ a->r[ i] = b[0];
+ a->r[GMX_SIMD_WIDTH_HERE/2 + i] = b[1];
+ }
+}
+
+/* Load reals at half-width aligned pointer b into two halves of a */
+static gmx_inline void
+gmx_loaddh_pr(gmx_simd_ref_pr *a, const real *b)
+{
+ int i;
+
+ for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+ {
+ a->r[i] = b[i];
+ a->r[GMX_SIMD_WIDTH_HERE/2 + i] = b[i];
+ }
+}
+
+/* Store half-width SIMD register b into half width aligned memory a */
+static gmx_inline void
+gmx_store_hpr(real *a, gmx_mm_hpr b)
+{
+ int i;
+
+ for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+ {
+ a[i] = b.r[i];
+ }
+}
+
+static gmx_inline gmx_mm_hpr
+gmx_add_hpr(gmx_mm_hpr a, gmx_mm_hpr b)
+{
+ gmx_mm_hpr c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+ {
+ c.r[i] = a.r[i] + b.r[i];
+ }
+
+ return c;
+}
+
+static gmx_inline gmx_mm_hpr
+gmx_sub_hpr(gmx_mm_hpr a, gmx_mm_hpr b)
+{
+ gmx_mm_hpr c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+ {
+ c.r[i] = a.r[i] - b.r[i];
+ }
+
+ return c;
+}
+
+/* Sum over 4 half SIMD registers */
+static gmx_inline gmx_mm_hpr
+gmx_sum4_hpr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
+{
+ gmx_mm_hpr c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+ {
+ c.r[i] =
+ a.r[i] +
+ a.r[GMX_SIMD_WIDTH_HERE/2+i] +
+ b.r[i] +
+ b.r[GMX_SIMD_WIDTH_HERE/2+i];
+ }
+
+ return c;
+}
+
+/* Sum the elements of each half of the two input registers and return the four sums */
+static gmx_inline gmx_mm_pr4
+gmx_mm_transpose_sum4h_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
+{
+ gmx_mm_pr4 sum;
+ int i;
+
+ sum.r[0] = 0;
+ sum.r[1] = 0;
+ sum.r[2] = 0;
+ sum.r[3] = 0;
+
+ for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+ {
+ sum.r[0] += a.r[i];
+ sum.r[1] += a.r[GMX_SIMD_WIDTH_HERE/2+i];
+ sum.r[2] += b.r[i];
+ sum.r[3] += b.r[GMX_SIMD_WIDTH_HERE/2+i];
+ }
+
+ return sum;
+}
+
+static gmx_inline void
+gmx_pr_to_2hpr(gmx_simd_ref_pr a, gmx_mm_hpr *b, gmx_mm_hpr *c)
+{
+ int i;
+
+ for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+ {
+ b->r[i] = a.r[i];
+ c->r[i] = a.r[GMX_SIMD_WIDTH_HERE/2 + i];
+ }
+}
+static gmx_inline void
+gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_ref_pr *c)
+{
+ int i;
+
+ for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+ {
+ c->r[i] = a.r[i];
+ c->r[GMX_SIMD_WIDTH_HERE/2 + i] = b.r[i];
+ }
+}
+
+#endif /* GMX_NBNXN_SIMD_2XNN */
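/* Editorial usage sketch, not part of this header: the 2xnn kernels keep
 * two i-rows in one full-width register, and splitting/re-joining the two
 * halves goes through the helpers above, e.g.
 *
 *     gmx_mm_hpr lo, hi;
 *     gmx_pr_to_2hpr(e_S, &lo, &hi);    split a full register into halves
 *     gmx_2hpr_to_pr(lo, hi, &e_S);     and join two halves again
 *
 * as done for the energy accumulation in add_ener_grp_halves().
 */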
+
+
+#ifndef TAB_FDV0
+static gmx_inline void
+load_table_f(const real *tab_coul_F, gmx_simd_ref_epi32 ti_S, int *ti,
+ gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S)
+{
+ int i;
+
+ for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
+ {
+ ctab0_S->r[i] = tab_coul_F[ti_S.r[i]];
+ ctab1_S->r[i] = tab_coul_F[ti_S.r[i]+1];
+ }
+
+ *ctab1_S = gmx_sub_pr(*ctab1_S, *ctab0_S);
+}
+
+static gmx_inline void
+load_table_f_v(const real *tab_coul_F, const real *tab_coul_V,
+ gmx_simd_ref_epi32 ti_S, int *ti,
+ gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S,
+ gmx_simd_ref_pr *ctabv_S)
+{
+ int i;
+
+ load_table_f(tab_coul_F, ti_S, ti, ctab0_S, ctab1_S);
+
+ for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
+ {
+ ctabv_S->r[i] = tab_coul_V[ti_S.r[i]];
+ }
+}
+#endif
+
+#ifdef TAB_FDV0
+static gmx_inline void
+load_table_f(const real *tab_coul_FDV0, gmx_simd_ref_epi32 ti_S, int *ti,
+ gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S)
+{
+ int i;
+
+ for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
+ {
+ ctab0_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4];
+ ctab1_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4+1];
+ }
+}
+
+static gmx_inline void
+load_table_f_v(const real *tab_coul_FDV0,
+ gmx_simd_ref_epi32 ti_S, int *ti,
+ gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S,
+ gmx_simd_ref_pr *ctabv_S)
+{
+ int i;
+
+ load_table_f(tab_coul_FDV0, ti_S, ti, ctab0_S, ctab1_S);
+
+ for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
+ {
+ ctabv_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4+2];
+ }
+}
+#endif
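/* Editorial note, inferred from the loads above and not part of this header:
 * with TAB_FDV0 the Coulomb table packs 4 reals per table point i, which is
 * why the index is multiplied by 4:
 *
 *     tab_coul_FDV0[4*i + 0]  force F[i]
 *     tab_coul_FDV0[4*i + 1]  difference F[i+1] - F[i]
 *     tab_coul_FDV0[4*i + 2]  potential V[i]
 *     tab_coul_FDV0[4*i + 3]  padding (the trailing 0 in FDV0)
 *
 * The non-FDV0 variant above loads F[i] and F[i+1] separately and forms the
 * difference itself with gmx_sub_pr.
 */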
+
+/* Sum the elements within each input register and store the sums in out.
+ * Note that 4/8-way SIMD requires gmx_mm_transpose_sum4_pr instead.
+ */
+#if GMX_SIMD_WIDTH_HERE == 2
+static gmx_inline gmx_simd_ref_pr
+gmx_mm_transpose_sum2_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1)
+{
+ gmx_simd_ref_pr sum;
+
+ sum.r[0] = in0.r[0] + in0.r[1];
+ sum.r[1] = in1.r[0] + in1.r[1];
+
+ return sum;
+}
+#endif
+
+#if GMX_SIMD_WIDTH_HERE >= 4
+#if GMX_SIMD_WIDTH_HERE == 4
+static gmx_inline gmx_simd_ref_pr
+#else
+static gmx_inline gmx_mm_pr4
+#endif
+gmx_mm_transpose_sum4_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1,
+ gmx_simd_ref_pr in2, gmx_simd_ref_pr in3)
+{
+#if GMX_SIMD_WIDTH_HERE == 4
+ gmx_simd_ref_pr sum;
+#else
+ gmx_mm_pr4 sum;
+#endif
+ int i;
+
+ sum.r[0] = 0;
+ sum.r[1] = 0;
+ sum.r[2] = 0;
+ sum.r[3] = 0;
+
+ for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
+ {
+ sum.r[0] += in0.r[i];
+ sum.r[1] += in1.r[i];
+ sum.r[2] += in2.r[i];
+ sum.r[3] += in3.r[i];
+ }
+
+ return sum;
+}
+#endif
+
+#ifdef GMX_DOUBLE
+/* In double precision it can be faster to first calculate single precision
+ * square roots for two double precision registers at once and then use
+ * double precision Newton-Raphson iteration to reach full double precision.
+ * For this reference code we just use a plain-C sqrt.
+ */
+static gmx_inline void
+gmx_mm_invsqrt2_pd(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1,
+ gmx_simd_ref_pr *out0, gmx_simd_ref_pr *out1)
+{
+    *out0 = gmx_invsqrt_pr(in0);
+    *out1 = gmx_invsqrt_pr(in1);
+}
+#endif
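/* Editorial sketch, not part of this header: the faster scheme mentioned in
 * the comment above refines a single precision rsqrt estimate with
 * Newton-Raphson steps,
 *
 *     lu_new = 0.5*lu*(3 - x*lu*lu);
 *
 * each step roughly doubling the number of correct bits; the x86 utils
 * headers further down in this change do the estimate plus one step in
 * single precision and a final step in double precision to reach full
 * double precision accuracy.
 */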
+
+#ifdef NBFP_STRIDE
+static gmx_inline void
+load_lj_pair_params(const real *nbfp, const int *type, int aj,
+ gmx_simd_ref_pr *c6_S, gmx_simd_ref_pr *c12_S)
+{
+ int i;
+
+ for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
+ {
+ c6_S->r[i] = nbfp[type[aj+i]*NBFP_STRIDE];
+ c12_S->r[i] = nbfp[type[aj+i]*NBFP_STRIDE+1];
+ }
+}
+
+#ifdef GMX_NBNXN_SIMD_2XNN
+static gmx_inline void
+load_lj_pair_params2(const real *nbfp0, const real *nbfp1,
+ const int *type, int aj,
+ gmx_simd_ref_pr *c6_S, gmx_simd_ref_pr *c12_S)
+{
+ int i;
+
+ for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
+ {
+ c6_S->r[i] = nbfp0[type[aj+i]*NBFP_STRIDE];
+ c6_S->r[GMX_SIMD_WIDTH_HERE/2 + i] = nbfp1[type[aj+i]*NBFP_STRIDE];
+ c12_S->r[i] = nbfp0[type[aj+i]*NBFP_STRIDE+1];
+ c12_S->r[GMX_SIMD_WIDTH_HERE/2 + i] = nbfp1[type[aj+i]*NBFP_STRIDE+1];
+ }
+}
+#endif
+#endif
+
+#endif /* _nbnxn_kernel_simd_utils_ref_h_ */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef _nbnxn_kernel_simd_utils_x86_128d_h_
+#define _nbnxn_kernel_simd_utils_x86_128d_h_
+
+/* This file contains all functions/macros for the SIMD kernels
+ * which have explicit dependencies on the j-cluster size and/or SIMD-width.
+ * The functionality which depends on the j-cluster size is:
+ * LJ-parameter lookup
+ * force table lookup
+ * energy group pair energy storage
+ */
+
+/* Transpose 2 double precision registers */
+static gmx_inline void
+gmx_mm_transpose2_op_pd(__m128d in0, __m128d in1,
+ __m128d *out0, __m128d *out1)
+{
+ *out0 = _mm_unpacklo_pd(in0, in1);
+ *out1 = _mm_unpackhi_pd(in0, in1);
+}
+
+/* Sum the elements within each input register and store the sums in out */
+static gmx_inline __m128d
+gmx_mm_transpose_sum2_pr(__m128d in0, __m128d in1)
+{
+ __m128d tr0, tr1;
+
+ gmx_mm_transpose2_op_pd(in0, in1, &tr0, &tr1);
+
+ return _mm_add_pd(tr0, tr1);
+}
+
+static inline __m128
+gmx_mm128_invsqrt_ps_single(__m128 x)
+{
+ const __m128 half = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
+ const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);
+
+ __m128 lu = _mm_rsqrt_ps(x);
+
+ return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
+}
+
+/* Do 2 double precision invsqrt operations.
+ * Doing the SIMD rsqrt and the first Newton Raphson iteration
+ * in single precision gives full double precision accuracy.
+ */
+static gmx_inline void
+gmx_mm_invsqrt2_pd(__m128d in0, __m128d in1,
+ __m128d *out0, __m128d *out1)
+{
+ const __m128d half = _mm_set1_pd(0.5);
+ const __m128d three = _mm_set1_pd(3.0);
+ __m128 s, ir;
+ __m128d lu0, lu1;
+
+ s = _mm_movelh_ps(_mm_cvtpd_ps(in0), _mm_cvtpd_ps(in1));
+ ir = gmx_mm128_invsqrt_ps_single(s);
+ lu0 = _mm_cvtps_pd(ir);
+ lu1 = _mm_cvtps_pd(_mm_movehl_ps(ir, ir));
+ *out0 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu0, lu0), in0)), lu0));
+ *out1 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu1, lu1), in1)), lu1));
+}
+
+static gmx_inline void
+load_lj_pair_params(const real *nbfp, const int *type, int aj,
+ __m128d *c6_S, __m128d *c12_S)
+{
+ __m128d clj_S[UNROLLJ];
+ int p;
+
+ for (p = 0; p < UNROLLJ; p++)
+ {
+ clj_S[p] = _mm_load_pd(nbfp+type[aj+p]*NBFP_STRIDE);
+ }
+ gmx_mm_transpose2_op_pd(clj_S[0], clj_S[1], c6_S, c12_S);
+}
+
+/* The load_table functions below are performance critical.
+ * The routines issue UNROLLI*UNROLLJ _mm_load_ps calls.
+ * As these all have latencies, scheduling is crucial.
+ * The Intel compilers and CPUs seem to do a good job at this.
+ * But AMD CPUs perform significantly worse with gcc than with icc.
+ * Performance is improved a bit by using the extract function UNROLLJ times,
+ * instead of doing an _mm_store_si128 for every i-particle.
+ * This is only faster when we use FDV0 formatted tables, where we also need
+ * to multiply the index by 4, which can be done by a SIMD bit shift.
+ * With single precision AVX, 8 extracts are much slower than 1 store.
+ * Because of this, the load_table_f macro always takes the ti parameter,
+ * but it is only used with AVX.
+ */
+
+static gmx_inline void
+load_table_f(const real *tab_coul_F, gmx_epi32 ti_S, int *ti,
+ __m128d *ctab0_S, __m128d *ctab1_S)
+{
+ int idx[2];
+ __m128d ctab_S[2];
+
+ /* Without SSE4.1 the extract macro needs an immediate: unroll */
+ idx[0] = gmx_mm_extract_epi32(ti_S, 0);
+ ctab_S[0] = _mm_loadu_pd(tab_coul_F+idx[0]);
+ idx[1] = gmx_mm_extract_epi32(ti_S, 1);
+ ctab_S[1] = _mm_loadu_pd(tab_coul_F+idx[1]);
+
+ /* Shuffle the force table entries to a convenient order */
+ gmx_mm_transpose2_op_pd(ctab_S[0], ctab_S[1], ctab0_S, ctab1_S);
+ /* The second force table entry should contain the difference */
+ *ctab1_S = _mm_sub_pd(*ctab1_S, *ctab0_S);
+}
+
+static gmx_inline void
+load_table_f_v(const real *tab_coul_F, const real *tab_coul_V,
+ gmx_epi32 ti_S, int *ti,
+ __m128d *ctab0_S, __m128d *ctab1_S, __m128d *ctabv_S)
+{
+ int idx[2];
+ __m128d ctab_S[4];
+
+ /* Without SSE4.1 the extract macro needs an immediate: unroll */
+ idx[0] = gmx_mm_extract_epi32(ti_S, 0);
+ ctab_S[0] = _mm_loadu_pd(tab_coul_F+idx[0]);
+ idx[1] = gmx_mm_extract_epi32(ti_S, 1);
+ ctab_S[1] = _mm_loadu_pd(tab_coul_F+idx[1]);
+
+ /* Shuffle the force table entries to a convenient order */
+ gmx_mm_transpose2_op_pd(ctab_S[0], ctab_S[1], ctab0_S, ctab1_S);
+ /* The second force table entry should contain the difference */
+ *ctab1_S = _mm_sub_pd(*ctab1_S, *ctab0_S);
+
+ ctab_S[2] = _mm_loadu_pd(tab_coul_V+idx[0]);
+ ctab_S[3] = _mm_loadu_pd(tab_coul_V+idx[1]);
+
+ /* Shuffle the energy table entries to a single register */
+ *ctabv_S = _mm_shuffle_pd(ctab_S[2], ctab_S[3], _MM_SHUFFLE2(0, 0));
+}
+
+#endif /* _nbnxn_kernel_simd_utils_x86_128d_h_ */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef _nbnxn_kernel_simd_utils_x86_128s_h_
+#define _nbnxn_kernel_simd_utils_x86_128s_h_
+
+/* This file contains all functions/macros for the SIMD kernels
+ * which have explicit dependencies on the j-cluster size and/or SIMD-width.
+ * The functionality which depends on the j-cluster size is:
+ * LJ-parameter lookup
+ * force table lookup
+ * energy group pair energy storage
+ */
+
+/* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
+static gmx_inline void
+gmx_shuffle_4_ps_fil01_to_2_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3,
+ __m128 *out0, __m128 *out1)
+{
+ __m128 _c01, _c23;
+
+ _c01 = _mm_movelh_ps(in0, in1);
+ _c23 = _mm_movelh_ps(in2, in3);
+ *out0 = _mm_shuffle_ps(_c01, _c23, _MM_SHUFFLE(2, 0, 2, 0));
+ *out1 = _mm_shuffle_ps(_c01, _c23, _MM_SHUFFLE(3, 1, 3, 1));
+}
+
+/* Collect element 2 of the 4 inputs to out */
+static gmx_inline __m128
+gmx_shuffle_4_ps_fil2_to_1_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3)
+{
+ __m128 _c01, _c23;
+
+ _c01 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 2, 3, 2));
+ _c23 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 2, 3, 2));
+
+ return _mm_shuffle_ps(_c01, _c23, _MM_SHUFFLE(2, 0, 2, 0));
+}
+
+/* Sum the elements within each input register and store the sums in out */
+static gmx_inline __m128
+gmx_mm_transpose_sum4_pr(__m128 in0, __m128 in1,
+ __m128 in2, __m128 in3)
+{
+ _MM_TRANSPOSE4_PS(in0, in1, in2, in3);
+ in0 = _mm_add_ps(in0, in1);
+ in2 = _mm_add_ps(in2, in3);
+
+ return _mm_add_ps(in0, in2);
+}
+
+static gmx_inline void
+load_lj_pair_params(const real *nbfp, const int *type, int aj,
+ __m128 *c6_S, __m128 *c12_S)
+{
+ __m128 clj_S[UNROLLJ];
+ int p;
+
+ for (p = 0; p < UNROLLJ; p++)
+ {
+ /* Here we load 4 aligned floats, but we need just 2 */
+ clj_S[p] = gmx_load_pr(nbfp+type[aj+p]*NBFP_STRIDE);
+ }
+ gmx_shuffle_4_ps_fil01_to_2_ps(clj_S[0], clj_S[1], clj_S[2], clj_S[3], c6_S, c12_S);
+}
+
+/* The load_table functions below are performance critical.
+ * The routines issue UNROLLI*UNROLLJ _mm_load_ps calls.
+ * As these all have latencies, scheduling is crucial.
+ * The Intel compilers and CPUs seem to do a good job at this.
+ * But AMD CPUs perform significantly worse with gcc than with icc.
+ * Performance is improved a bit by using the extract function UNROLLJ times,
+ * instead of doing an _mm_store_si128 for every i-particle.
+ * This is only faster when we use FDV0 formatted tables, where we also need
+ * to multiply the index by 4, which can be done by a SIMD bit shift.
+ * With single precision AVX, 8 extracts are much slower than 1 store.
+ * Because of this, the load_table_f macro always takes the ti parameter,
+ * but it is only used with AVX.
+ */
+
+static gmx_inline void
+load_table_f(const real *tab_coul_FDV0, gmx_epi32 ti_S, int *ti,
+ __m128 *ctab0_S, __m128 *ctab1_S)
+{
+ int idx[4];
+ __m128 ctab_S[4];
+
+ /* Table has 4 entries, left-shift index by 2 */
+ ti_S = _mm_slli_epi32(ti_S, 2);
+ /* Without SSE4.1 the extract macro needs an immediate: unroll */
+ idx[0] = gmx_mm_extract_epi32(ti_S, 0);
+ ctab_S[0] = _mm_load_ps(tab_coul_FDV0+idx[0]);
+ idx[1] = gmx_mm_extract_epi32(ti_S, 1);
+ ctab_S[1] = _mm_load_ps(tab_coul_FDV0+idx[1]);
+ idx[2] = gmx_mm_extract_epi32(ti_S, 2);
+ ctab_S[2] = _mm_load_ps(tab_coul_FDV0+idx[2]);
+ idx[3] = gmx_mm_extract_epi32(ti_S, 3);
+ ctab_S[3] = _mm_load_ps(tab_coul_FDV0+idx[3]);
+
+ /* Shuffle the force table entries to a convenient order */
+ gmx_shuffle_4_ps_fil01_to_2_ps(ctab_S[0], ctab_S[1], ctab_S[2], ctab_S[3], ctab0_S, ctab1_S);
+}
+
+static gmx_inline void
+load_table_f_v(const real *tab_coul_FDV0, gmx_epi32 ti_S, int *ti,
+ __m128 *ctab0_S, __m128 *ctab1_S, __m128 *ctabv_S)
+{
+ int idx[4];
+ __m128 ctab_S[4];
+
+ /* Table has 4 entries, left-shift index by 2 */
+ ti_S = _mm_slli_epi32(ti_S, 2);
+ /* Without SSE4.1 the extract macro needs an immediate: unroll */
+ idx[0] = gmx_mm_extract_epi32(ti_S, 0);
+ ctab_S[0] = _mm_load_ps(tab_coul_FDV0+idx[0]);
+ idx[1] = gmx_mm_extract_epi32(ti_S, 1);
+ ctab_S[1] = _mm_load_ps(tab_coul_FDV0+idx[1]);
+ idx[2] = gmx_mm_extract_epi32(ti_S, 2);
+ ctab_S[2] = _mm_load_ps(tab_coul_FDV0+idx[2]);
+ idx[3] = gmx_mm_extract_epi32(ti_S, 3);
+ ctab_S[3] = _mm_load_ps(tab_coul_FDV0+idx[3]);
+
+ /* Shuffle the force table entries to a convenient order */
+ gmx_shuffle_4_ps_fil01_to_2_ps(ctab_S[0], ctab_S[1], ctab_S[2], ctab_S[3], ctab0_S, ctab1_S);
+
+ *ctabv_S = gmx_shuffle_4_ps_fil2_to_1_ps(ctab_S[0], ctab_S[1], ctab_S[2], ctab_S[3]);
+}
+
+#endif /* _nbnxn_kernel_simd_utils_x86_128s_h_ */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef _nbnxn_kernel_simd_utils_x86_256d_h_
+#define _nbnxn_kernel_simd_utils_x86_256d_h_
+
+/* This file contains all functions/macros for the SIMD kernels
+ * which have explicit dependencies on the j-cluster size and/or SIMD-width.
+ * The functionality which depends on the j-cluster size is:
+ * LJ-parameter lookup
+ * force table lookup
+ * energy group pair energy storage
+ */
+
+/* Transpose 2 double precision registers */
+static gmx_inline void
+gmx_mm_transpose2_op_pd(__m128d in0, __m128d in1,
+ __m128d *out0, __m128d *out1)
+{
+ *out0 = _mm_unpacklo_pd(in0, in1);
+ *out1 = _mm_unpackhi_pd(in0, in1);
+}
+
+/* Sum the elements within each input register and store the sums in out */
+static gmx_inline __m256d
+gmx_mm_transpose_sum4_pr(__m256d in0, __m256d in1,
+ __m256d in2, __m256d in3)
+{
+ in0 = _mm256_hadd_pd(in0, in1);
+ in2 = _mm256_hadd_pd(in2, in3);
+
+ return _mm256_add_pd(_mm256_permute2f128_pd(in0, in2, 0x20), _mm256_permute2f128_pd(in0, in2, 0x31));
+}
+
+static gmx_inline __m256
+gmx_mm256_invsqrt_ps_single(__m256 x)
+{
+ const __m256 half = _mm256_set_ps(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5);
+ const __m256 three = _mm256_set_ps(3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0);
+
+ __m256 lu = _mm256_rsqrt_ps(x);
+
+ return _mm256_mul_ps(half, _mm256_mul_ps(_mm256_sub_ps(three, _mm256_mul_ps(_mm256_mul_ps(lu, lu), x)), lu));
+}
+
+/* Put two 128-bit 4-float registers into one 256-bit 8-float register */
+static gmx_inline __m256
+gmx_2_m128_to_m256(__m128 in0, __m128 in1)
+{
+ return _mm256_insertf128_ps(_mm256_castps128_ps256(in0), in1, 1);
+}
+
+/* Put two 128-bit 2-double registers into one 256-bit 4-double register */
+static gmx_inline __m256d
+gmx_2_m128d_to_m256d(__m128d in0, __m128d in1)
+{
+ return _mm256_insertf128_pd(_mm256_castpd128_pd256(in0), in1, 1);
+}
+
+/* Do 2 double precision invsqrt operations.
+ * Doing the SIMD rsqrt and the first Newton Raphson iteration
+ * in single precision gives full double precision accuracy.
+ */
+static gmx_inline void
+gmx_mm_invsqrt2_pd(__m256d in0, __m256d in1,
+ __m256d *out0, __m256d *out1)
+{
+ const __m256d half = _mm256_set1_pd(0.5);
+ const __m256d three = _mm256_set1_pd(3.0);
+ __m256 s, ir;
+ __m256d lu0, lu1;
+
+ s = gmx_2_m128_to_m256(_mm256_cvtpd_ps(in0), _mm256_cvtpd_ps(in1));
+ ir = gmx_mm256_invsqrt_ps_single(s);
+ lu0 = _mm256_cvtps_pd(_mm256_castps256_ps128(ir));
+ lu1 = _mm256_cvtps_pd(_mm256_extractf128_ps(ir, 1));
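+ /* One Newton-Raphson iteration in double precision: lu' = 0.5*lu*(3 - x*lu*lu) */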
+ *out0 = _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu0, lu0), in0)), lu0));
+ *out1 = _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu1, lu1), in1)), lu1));
+}
+
+static gmx_inline void
+load_lj_pair_params(const real *nbfp, const int *type, int aj,
+ __m256d *c6_S, __m256d *c12_S)
+{
+ __m128d clj_S[UNROLLJ], c6t_S[2], c12t_S[2];
+ int p;
+
+ for (p = 0; p < UNROLLJ; p++)
+ {
+ clj_S[p] = _mm_load_pd(nbfp+type[aj+p]*NBFP_STRIDE);
+ }
+ gmx_mm_transpose2_op_pd(clj_S[0], clj_S[1], &c6t_S[0], &c12t_S[0]);
+ gmx_mm_transpose2_op_pd(clj_S[2], clj_S[3], &c6t_S[1], &c12t_S[1]);
+ *c6_S = gmx_2_m128d_to_m256d(c6t_S[0], c6t_S[1]);
+ *c12_S = gmx_2_m128d_to_m256d(c12t_S[0], c12t_S[1]);
+}
+
+static gmx_inline void
+load_table_f(const real *tab_coul_F, __m128i ti_S, int *ti,
+ __m256d *ctab0_S, __m256d *ctab1_S)
+{
+ __m128d ctab_S[4], tr_S[4];
+ int j;
+
+ _mm_store_si128((__m128i *)ti, ti_S);
+ for (j = 0; j < 4; j++)
+ {
+ ctab_S[j] = _mm_loadu_pd(tab_coul_F+ti[j]);
+ }
+ /* Shuffle the force table entries to a convenient order */
+ gmx_mm_transpose2_op_pd(ctab_S[0], ctab_S[1], &tr_S[0], &tr_S[1]);
+ gmx_mm_transpose2_op_pd(ctab_S[2], ctab_S[3], &tr_S[2], &tr_S[3]);
+ *ctab0_S = gmx_2_m128d_to_m256d(tr_S[0], tr_S[2]);
+ *ctab1_S = gmx_2_m128d_to_m256d(tr_S[1], tr_S[3]);
+ /* The second force table entry should contain the difference */
+ *ctab1_S = _mm256_sub_pd(*ctab1_S, *ctab0_S);
+}
+
+static gmx_inline void
+load_table_f_v(const real *tab_coul_F, const real *tab_coul_V,
+ __m128i ti_S, int *ti,
+ __m256d *ctab0_S, __m256d *ctab1_S, __m256d *ctabv_S)
+{
+ __m128d ctab_S[8], tr_S[4];
+ int j;
+
+ _mm_store_si128((__m128i *)ti, ti_S);
+ for (j = 0; j < 4; j++)
+ {
+ ctab_S[j] = _mm_loadu_pd(tab_coul_F+ti[j]);
+ }
+ /* Shuffle the force table entries to a convenient order */
+ gmx_mm_transpose2_op_pd(ctab_S[0], ctab_S[1], &tr_S[0], &tr_S[1]);
+ gmx_mm_transpose2_op_pd(ctab_S[2], ctab_S[3], &tr_S[2], &tr_S[3]);
+ *ctab0_S = gmx_2_m128d_to_m256d(tr_S[0], tr_S[2]);
+ *ctab1_S = gmx_2_m128d_to_m256d(tr_S[1], tr_S[3]);
+ /* The second force table entry should contain the difference */
+ *ctab1_S = _mm256_sub_pd(*ctab1_S, *ctab0_S);
+
+ for (j = 0; j < 4; j++)
+ {
+ ctab_S[4+j] = _mm_loadu_pd(tab_coul_V+ti[j]);
+ }
+ /* Shuffle the energy table entries to a single register */
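+ /* _MM_SHUFFLE2(0, 0) selects element 0 of each loaded pair, i.e. the energy at index ti[j] */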
+ *ctabv_S = gmx_2_m128d_to_m256d(_mm_shuffle_pd(ctab_S[4], ctab_S[5], _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(ctab_S[6], ctab_S[7], _MM_SHUFFLE2(0, 0)));
+}
+
+#endif /* _nbnxn_kernel_simd_utils_x86_256d_h_ */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef _nbnxn_kernel_simd_utils_x86_256s_h_
+#define _nbnxn_kernel_simd_utils_x86_256s_h_
+
+/* This file contains all functions/macros for the SIMD kernels
+ * which have explicit dependencies on the j-cluster size and/or SIMD-width.
+ * The functionality which depends on the j-cluster size is:
+ * LJ-parameter lookup
+ * force table lookup
+ * energy group pair energy storage
+ */
+
+
+/* The 4xn kernel operates on 4-wide i-force registers */
+#define gmx_mm_pr4 __m128
+#define gmx_load_pr4 _mm_load_ps
+#define gmx_store_pr4 _mm_store_ps
+#define gmx_add_pr4 _mm_add_ps
+
+
+/* Half-width operations are required for the 2xnn kernels */
+
+/* Half-width SIMD real type */
+#define gmx_mm_hpr __m128
+
+/* Half-width SIMD operations */
+/* Load reals at half-width aligned pointer b into half-width SIMD register *a */
+#define gmx_load_hpr(a, b) *(a) = _mm_load_ps(b)
+/* Set all entries in half-width SIMD register *a to b */
+#define gmx_set1_hpr(a, b) *(a) = _mm_set1_ps(b)
+/* Broadcast the real at b to the lower half of a and the real at b+1 to the upper half */
+#define gmx_load1p1_pr(a, b) *(a) = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load1_ps(b)), _mm_load1_ps(b+1), 0x1)
+/* Load reals at half-width aligned pointer b into two halves of a */
+#define gmx_loaddh_pr(a, b) *(a) = gmx_mm256_load4_ps(b)
+/* Store half-width SIMD register b into half-width aligned memory a */
+#define gmx_store_hpr(a, b) _mm_store_ps(a, b)
+#define gmx_add_hpr _mm_add_ps
+#define gmx_sub_hpr _mm_sub_ps
+/* Sum over 4 half SIMD registers */
+#define gmx_sum4_hpr gmx_mm256_sum4h_m128
+
+static gmx_inline void
+gmx_pr_to_2hpr(gmx_mm_pr a, gmx_mm_hpr *b, gmx_mm_hpr *c)
+{
+ *b = _mm256_extractf128_ps(a, 0);
+ *c = _mm256_extractf128_ps(a, 1);
+}
+
+/* Store half width SIMD registers a and b in full width register *c */
+static gmx_inline void
+gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_mm_pr *c)
+{
+ *c = _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 0x1);
+}
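+
+/* Minimal usage sketch of the half-width conventions above (a hypothetical
+ * helper, not used by the kernels): split a full-width register and sum
+ * its two halves into one half-width register.
+ */
+static gmx_inline gmx_mm_hpr
+gmx_sum_2halves_hpr(gmx_mm_pr a)
+{
+    gmx_mm_hpr lo, hi;
+
+    gmx_pr_to_2hpr(a, &lo, &hi);
+
+    return gmx_add_hpr(lo, hi);
+}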
+
+/* Collect elements 0 and 1 of the 4 inputs into out0 and out1, respectively */
+static gmx_inline void
+gmx_shuffle_4_ps_fil01_to_2_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3,
+ __m128 *out0, __m128 *out1)
+{
+ __m128 _c01, _c23;
+
+ _c01 = _mm_movelh_ps(in0, in1);
+ _c23 = _mm_movelh_ps(in2, in3);
+ *out0 = _mm_shuffle_ps(_c01, _c23, _MM_SHUFFLE(2, 0, 2, 0));
+ *out1 = _mm_shuffle_ps(_c01, _c23, _MM_SHUFFLE(3, 1, 3, 1));
+}
+
+/* Collect element 2 of the 4 inputs into the returned register */
+static gmx_inline __m128
+gmx_shuffle_4_ps_fil2_to_1_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3)
+{
+ __m128 _c01, _c23;
+
+ _c01 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 2, 3, 2));
+ _c23 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 2, 3, 2));
+
+ return _mm_shuffle_ps(_c01, _c23, _MM_SHUFFLE(2, 0, 2, 0));
+}
+
+/* Sum the elements within each input register and return the sums */
+static gmx_inline __m128
+gmx_mm_transpose_sum4_pr(__m256 in0, __m256 in1,
+ __m256 in2, __m256 in3)
+{
+ in0 = _mm256_hadd_ps(in0, in1);
+ in2 = _mm256_hadd_ps(in2, in3);
+ in1 = _mm256_hadd_ps(in0, in2);
+
+ return _mm_add_ps(_mm256_castps256_ps128(in1),
+ _mm256_extractf128_ps(in1, 1));
+}
+
+/* Sum the elements of the halves of each input register and return the sums */
+static gmx_inline __m128
+gmx_mm_transpose_sum4h_pr(__m256 in0, __m256 in2)
+{
+ in0 = _mm256_hadd_ps(in0, _mm256_setzero_ps());
+ in2 = _mm256_hadd_ps(in2, _mm256_setzero_ps());
+ in0 = _mm256_hadd_ps(in0, in2);
+ in2 = _mm256_permute_ps(in0, _MM_SHUFFLE(2, 3, 0, 1));
+
+ return _mm_add_ps(_mm256_castps256_ps128(in0), _mm256_extractf128_ps(in2, 1));
+}
+
+/* Put two 128-bit 4-float registers into one 256-bit 8-float register */
+static gmx_inline __m256
+gmx_2_mm_to_m256(__m128 in0, __m128 in1)
+{
+ return _mm256_insertf128_ps(_mm256_castps128_ps256(in0), in1, 1);
+}
+
+#if UNROLLJ == 8
+static gmx_inline void
+load_lj_pair_params(const real *nbfp, const int *type, int aj,
+ __m256 *c6_S, __m256 *c12_S)
+{
+ __m128 clj_S[UNROLLJ], c6t_S[2], c12t_S[2];
+ int p;
+
+ for (p = 0; p < UNROLLJ; p++)
+ {
+ /* Here we load 4 aligned floats, but we need just 2 */
+ clj_S[p] = _mm_load_ps(nbfp+type[aj+p]*NBFP_STRIDE);
+ }
+ gmx_shuffle_4_ps_fil01_to_2_ps(clj_S[0], clj_S[1], clj_S[2], clj_S[3],
+ &c6t_S[0], &c12t_S[0]);
+ gmx_shuffle_4_ps_fil01_to_2_ps(clj_S[4], clj_S[5], clj_S[6], clj_S[7],
+ &c6t_S[1], &c12t_S[1]);
+
+ *c6_S = gmx_2_mm_to_m256(c6t_S[0], c6t_S[1]);
+ *c12_S = gmx_2_mm_to_m256(c12t_S[0], c12t_S[1]);
+}
+#endif
+
+#if UNROLLJ == 4
+static gmx_inline void
+load_lj_pair_params2(const real *nbfp0, const real *nbfp1,
+ const int *type, int aj,
+ __m256 *c6_S, __m256 *c12_S)
+{
+ __m128 clj_S0[UNROLLJ], clj_S1[UNROLLJ], c6t_S[2], c12t_S[2];
+ int p;
+
+ for (p = 0; p < UNROLLJ; p++)
+ {
+ /* Here we load 4 aligned floats, but we need just 2 */
+ clj_S0[p] = _mm_load_ps(nbfp0+type[aj+p]*NBFP_STRIDE);
+ }
+ for (p = 0; p < UNROLLJ; p++)
+ {
+ /* Here we load 4 aligned floats, but we need just 2 */
+ clj_S1[p] = _mm_load_ps(nbfp1+type[aj+p]*NBFP_STRIDE);
+ }
+ gmx_shuffle_4_ps_fil01_to_2_ps(clj_S0[0], clj_S0[1], clj_S0[2], clj_S0[3],
+ &c6t_S[0], &c12t_S[0]);
+ gmx_shuffle_4_ps_fil01_to_2_ps(clj_S1[0], clj_S1[1], clj_S1[2], clj_S1[3],
+ &c6t_S[1], &c12t_S[1]);
+
+ *c6_S = gmx_2_mm_to_m256(c6t_S[0], c6t_S[1]);
+ *c12_S = gmx_2_mm_to_m256(c12t_S[0], c12t_S[1]);
+}
+#endif
+
+
+/* The load_table functions below are performance critical.
+ * The routines issue UNROLLI*UNROLLJ _mm_load_ps calls.
+ * As these all have latencies, scheduling is crucial.
+ * The Intel compilers and CPUs seem to do a good job at this.
+ * But AMD CPUs perform significantly worse with gcc than with icc.
+ * Performance is improved a bit by using the extract function UNROLLJ times,
+ * instead of doing an _mm_store_si128 for every i-particle.
+ * This is only faster when we use FDV0 formatted tables, where we also need
+ * to multiply the index by 4, which can be done by a SIMD bit shift.
+ * With single precision AVX, 8 extracts are much slower than 1 store.
+ * Because of this, the load_table_f macro always takes the ti parameter,
+ * but it is only used with AVX.
+ */
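+/* A sketch of the assumed FDV0 table layout (4 floats per table index i):
+ *   tab_coul_FDV0[4*i+0] : F[i]            (force)
+ *   tab_coul_FDV0[4*i+1] : F[i+1] - F[i]   (force difference for interpolation)
+ *   tab_coul_FDV0[4*i+2] : V[i]            (energy)
+ *   tab_coul_FDV0[4*i+3] : 0               (padding, so one aligned 4-float load fetches all)
+ */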
+
+static gmx_inline void
+load_table_f(const real *tab_coul_FDV0, gmx_epi32 ti_S, int *ti,
+ __m256 *ctab0_S, __m256 *ctab1_S)
+{
+ __m128 ctab_S[8], ctabt_S[4];
+ int j;
+
+ /* Shifting the index with SIMD would be faster, but AVX lacks 256-bit integer shifts */
+ _mm256_store_si256((__m256i *)ti, ti_S);
+ for (j = 0; j < 8; j++)
+ {
+ ctab_S[j] = _mm_load_ps(tab_coul_FDV0+ti[j]*4);
+ }
+ gmx_shuffle_4_ps_fil01_to_2_ps(ctab_S[0], ctab_S[1], ctab_S[2], ctab_S[3],
+ &ctabt_S[0], &ctabt_S[2]);
+ gmx_shuffle_4_ps_fil01_to_2_ps(ctab_S[4], ctab_S[5], ctab_S[6], ctab_S[7],
+ &ctabt_S[1], &ctabt_S[3]);
+
+ *ctab0_S = gmx_2_mm_to_m256(ctabt_S[0], ctabt_S[1]);
+ *ctab1_S = gmx_2_mm_to_m256(ctabt_S[2], ctabt_S[3]);
+}
+
+static gmx_inline void
+load_table_f_v(const real *tab_coul_FDV0, gmx_epi32 ti_S, int *ti,
+ __m256 *ctab0_S, __m256 *ctab1_S, __m256 *ctabv_S)
+{
+ __m128 ctab_S[8], ctabt_S[4], ctabvt_S[2];
+ int j;
+
+ /* Shifting the index with SIMD would be faster, but AVX lacks 256-bit integer shifts */
+ _mm256_store_si256((__m256i *)ti, ti_S);
+ for (j = 0; j < 8; j++)
+ {
+ ctab_S[j] = _mm_load_ps(tab_coul_FDV0+ti[j]*4);
+ }
+ gmx_shuffle_4_ps_fil01_to_2_ps(ctab_S[0], ctab_S[1], ctab_S[2], ctab_S[3],
+ &ctabt_S[0], &ctabt_S[2]);
+ gmx_shuffle_4_ps_fil01_to_2_ps(ctab_S[4], ctab_S[5], ctab_S[6], ctab_S[7],
+ &ctabt_S[1], &ctabt_S[3]);
+
+ *ctab0_S = gmx_2_mm_to_m256(ctabt_S[0], ctabt_S[1]);
+ *ctab1_S = gmx_2_mm_to_m256(ctabt_S[2], ctabt_S[3]);
+
+ ctabvt_S[0] = gmx_shuffle_4_ps_fil2_to_1_ps(ctab_S[0], ctab_S[1],
+ ctab_S[2], ctab_S[3]);
+ ctabvt_S[1] = gmx_shuffle_4_ps_fil2_to_1_ps(ctab_S[4], ctab_S[5],
+ ctab_S[6], ctab_S[7]);
+
+ *ctabv_S = gmx_2_mm_to_m256(ctabvt_S[0], ctabvt_S[1]);
+}
+
+#endif /* _nbnxn_kernel_simd_utils_x86_256s_h_ */
#include "vec.h"
#include "pbc.h"
#include "nbnxn_consts.h"
+/* nbnxn_internal.h includes gmx_simd_macros.h */
#include "nbnxn_internal.h"
+#ifdef GMX_NBNXN_SIMD
+#include "gmx_simd_vec.h"
+#endif
#include "nbnxn_atomdata.h"
#include "nbnxn_search.h"
#include "gmx_cyclecounter.h"
#define X_IND_CJ_J8(cj) ((cj)*STRIDE_P8)
/* The j-cluster size is matched to the SIMD width */
-#if GMX_NBNXN_SIMD_BITWIDTH == 128
-#ifdef GMX_DOUBLE
+#if GMX_SIMD_WIDTH_HERE == 2
#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J2(ci)
#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J2(ci)
#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J2(cj)
#else
-#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J4(ci)
-#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J4(ci)
-#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J4(cj)
-#endif
-#else
-#if GMX_NBNXN_SIMD_BITWIDTH == 256
-#ifdef GMX_DOUBLE
+#if GMX_SIMD_WIDTH_HERE == 4
#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J4(ci)
#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J4(ci)
#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J4(cj)
#else
+#if GMX_SIMD_WIDTH_HERE == 8
#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J8(ci)
#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J8(ci)
#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J8(cj)
#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J4(ci)
#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci)
#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj)
-#endif
+#else
+#if GMX_SIMD_WIDTH_HERE == 16
+#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J8(ci)
+#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J8(ci)
+#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J8(cj)
#else
#error "unsupported GMX_NBNXN_SIMD_WIDTH"
#endif
#endif
+#endif
+#endif
#endif /* GMX_NBNXN_SIMD */
-/* Interaction masks for 4xN atom interactions.
- * Bit i*CJ_SIZE + j tells if atom i and j interact.
- */
-/* All interaction mask is the same for all kernels */
-#define NBNXN_INT_MASK_ALL 0xffffffff
-/* 4x4 kernel diagonal mask */
-#define NBNXN_INT_MASK_DIAG 0x08ce
-/* 4x2 kernel diagonal masks */
-#define NBNXN_INT_MASK_DIAG_J2_0 0x0002
-#define NBNXN_INT_MASK_DIAG_J2_1 0x002F
-/* 4x8 kernel diagonal masks */
-#define NBNXN_INT_MASK_DIAG_J8_0 0xf0f8fcfe
-#define NBNXN_INT_MASK_DIAG_J8_1 0x0080c0e0
-
-
#ifdef NBNXN_SEARCH_BB_SSE
/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
#define NBNXN_BBXXXX
int cj_size = 0;
#ifdef GMX_NBNXN_SIMD
- nbnxn_simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
+ nbnxn_simd_width = GMX_SIMD_WIDTH_HERE;
#endif
switch (nb_kernel_type)
bb[BBU_Z] = R2F_U(zh);
}
-#ifdef NBNXN_SEARCH_BB_SSE
-
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4_halves(int na, const real *x,
float *bb, float *bbj)
{
+#ifndef NBNXN_SEARCH_BB_SSE
+ int i;
+#endif
+
calc_bounding_box_x_x4(min(na, 2), x, bbj);
if (na > 2)
/* Set the "empty" bounding box to the same as the first one,
* so we don't need to treat special cases in the rest of the code.
*/
+#ifdef NBNXN_SEARCH_BB_SSE
_mm_store_ps(bbj+NNBSBB_B, _mm_load_ps(bbj));
_mm_store_ps(bbj+NNBSBB_B+NNBSBB_C, _mm_load_ps(bbj+NNBSBB_C));
+#else
+ for (i = 0; i < NNBSBB_B; i++)
+ {
+ bbj[NNBSBB_B + i] = bbj[i];
+ }
+#endif
}
+#ifdef NBNXN_SEARCH_BB_SSE
_mm_store_ps(bb, _mm_min_ps(_mm_load_ps(bbj),
_mm_load_ps(bbj+NNBSBB_B)));
_mm_store_ps(bb+NNBSBB_C, _mm_max_ps(_mm_load_ps(bbj+NNBSBB_C),
_mm_load_ps(bbj+NNBSBB_B+NNBSBB_C)));
+#else
+ for (i = 0; i < NNBSBB_C; i++)
+ {
+ bb[ i] = min(bbj[ i], bbj[NNBSBB_B + i]);
+ bb[NNBSBB_C + i] = max(bbj[NNBSBB_C + i], bbj[NNBSBB_B + NNBSBB_C + i]);
+ }
+#endif
}
+#ifdef NBNXN_SEARCH_BB_SSE
+
/* Coordinate order xyz, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
{
#endif /* NBNXN_SEARCH_SSE_SINGLE */
-#ifdef NBNXN_SEARCH_BB_SSE
/* Combines pairs of consecutive bounding boxes */
static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const float *bb)
{
int i, j, sc2, nc2, c2;
- __m128 min_SSE, max_SSE;
for (i = 0; i < grid->ncx*grid->ncy; i++)
{
nc2 = (grid->cxy_na[i]+3)>>(2+1);
for (c2 = sc2; c2 < sc2+nc2; c2++)
{
+#ifdef NBNXN_SEARCH_BB_SSE
+ __m128 min_SSE, max_SSE;
+
min_SSE = _mm_min_ps(_mm_load_ps(bb+(c2*4+0)*NNBSBB_C),
_mm_load_ps(bb+(c2*4+2)*NNBSBB_C));
max_SSE = _mm_max_ps(_mm_load_ps(bb+(c2*4+1)*NNBSBB_C),
_mm_load_ps(bb+(c2*4+3)*NNBSBB_C));
_mm_store_ps(grid->bbj+(c2*2+0)*NNBSBB_C, min_SSE);
_mm_store_ps(grid->bbj+(c2*2+1)*NNBSBB_C, max_SSE);
+#else
+ for (j = 0; j < NNBSBB_C; j++)
+ {
+ grid->bbj[(c2*2+0)*NNBSBB_C+j] = min(bb[(c2*4+0)*NNBSBB_C+j],
+ bb[(c2*4+2)*NNBSBB_C+j]);
+ grid->bbj[(c2*2+1)*NNBSBB_C+j] = max(bb[(c2*4+1)*NNBSBB_C+j],
+ bb[(c2*4+3)*NNBSBB_C+j]);
+ }
+#endif
}
if (((grid->cxy_na[i]+3)>>2) & 1)
{
}
}
-#endif
-
/* Prints the average bb size, used for debug output */
static void print_bbsizes_simple(FILE *fp,
offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
bb_ptr = grid->bb + offset;
-#if defined GMX_DOUBLE && defined NBNXN_SEARCH_BB_SSE
+#if defined GMX_NBNXN_SIMD && GMX_SIMD_WIDTH_HERE == 2
if (2*grid->na_cj == grid->na_c)
{
calc_bounding_box_x_x4_halves(na, nbat->x+X4_IND_A(a0), bb_ptr,
}
}
-#ifdef NBNXN_SEARCH_BB_SSE
if (grid->bSimple && nbat->XFormat == nbatX8)
{
combine_bounding_box_pairs(grid, grid->bb);
}
-#endif
if (!grid->bSimple)
{
}
}
-#ifdef NBNXN_SEARCH_BB_SSE
if (grid->bSimple && nbat->XFormat == nbatX8)
{
combine_bounding_box_pairs(grid, grid->bb_simple);
}
-#endif
}
void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
#ifdef NBNXN_SEARCH_BB_SSE
/* SSE code for bb distance for bb format xyz0 */
-static float subc_bb_dist2_sse(int na_c,
- int si, const float *bb_i_ci,
+static float subc_bb_dist2_sse(int si, const float *bb_i_ci,
int csj, const float *bb_j_all)
{
const float *bb_i, *bb_j;
return FALSE;
}
+#ifdef NBNXN_SEARCH_SSE_SINGLE
+/* When we make separate single/double precision SIMD vector operation
+ * include files, this function should be moved there (also using FMA).
+ */
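+/* A hypothetical FMA3 form of the function below would be:
+ *   return _mm_fmadd_ps(x, x, _mm_fmadd_ps(y, y, _mm_mul_ps(z, z)));
+ */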
+static inline __m128
+gmx_mm_calc_rsq_ps(__m128 x, __m128 y, __m128 z)
+{
+ return _mm_add_ps( _mm_add_ps( _mm_mul_ps(x, x), _mm_mul_ps(y, y) ), _mm_mul_ps(z, z) );
+}
+#endif
+
/* SSE function which determines if any atom pair between two cells,
* both with 8 atoms, is within distance sqrt(rl2).
+ * Not performance critical, so only uses plain SSE.
*/
static gmx_bool subc_in_range_sse8(int na_c,
int si, const real *x_i,
for (t = 0; t < WARP_SIZE; t++)
{
/* Turn all interaction bits on */
- excl->pair[t] = NBNXN_INT_MASK_ALL;
+ excl->pair[t] = NBNXN_INTERACTION_MASK_ALL;
}
}
j = nbl->ci[i].cj_ind_start;
while (j < nbl->ci[i].cj_ind_end &&
- nbl->cj[j].excl != NBNXN_INT_MASK_ALL)
+ nbl->cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
{
npexcl++;
j++;
/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
static unsigned int get_imask(gmx_bool rdiag, int ci, int cj)
{
- return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
+ return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
}
-/* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
-static unsigned int get_imask_simd128(gmx_bool rdiag, int ci, int cj)
+/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
+static unsigned int get_imask_simd_j2(gmx_bool rdiag, int ci, int cj)
{
-#ifndef GMX_DOUBLE /* cj-size = 4 */
- return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
-#else /* cj-size = 2 */
- return (rdiag && ci*2 == cj ? NBNXN_INT_MASK_DIAG_J2_0 :
- (rdiag && ci*2+1 == cj ? NBNXN_INT_MASK_DIAG_J2_1 :
- NBNXN_INT_MASK_ALL));
-#endif
+ return (rdiag && ci*2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0 :
+ (rdiag && ci*2+1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1 :
+ NBNXN_INTERACTION_MASK_ALL));
}
-/* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
-static unsigned int get_imask_simd256(gmx_bool rdiag, int ci, int cj)
+/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
+static unsigned int get_imask_simd_j4(gmx_bool rdiag, int ci, int cj)
{
-#ifndef GMX_DOUBLE /* cj-size = 8 */
- return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 :
- (rdiag && ci == cj*2+1 ? NBNXN_INT_MASK_DIAG_J8_1 :
- NBNXN_INT_MASK_ALL));
-#else /* cj-size = 4 */
- return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
-#endif
+ return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
+}
+
+/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
+static unsigned int get_imask_simd_j8(gmx_bool rdiag, int ci, int cj)
+{
+ return (rdiag && ci == cj*2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0 :
+ (rdiag && ci == cj*2+1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1 :
+ NBNXN_INTERACTION_MASK_ALL));
}
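+
+/* Illustrative sketch (a hypothetical helper, not used by the kernels) of how
+ * such diagonal masks can be constructed, assuming bit i*cj_size + j flags
+ * the pair of i-atom i and j-atom j and is set when the global j-atom index
+ * is larger than the global i-atom index. The offsets give the positions of
+ * the clusters in global atom indices.
+ */
+static unsigned int make_diag_mask(int ci_size, int cj_size,
+                                   int i_atom_offset, int j_atom_offset)
+{
+    unsigned int mask = 0;
+    int          i, j;
+
+    for (i = 0; i < ci_size; i++)
+    {
+        for (j = 0; j < cj_size; j++)
+        {
+            if (j_atom_offset + j > i_atom_offset + i)
+            {
+                mask |= (1U << (i*cj_size + j));
+            }
+        }
+    }
+
+    return mask;
+}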
#ifdef GMX_NBNXN_SIMD
-#if GMX_NBNXN_SIMD_BITWIDTH == 128
-#define get_imask_simd_4xn get_imask_simd128
-#else
-#if GMX_NBNXN_SIMD_BITWIDTH == 256
-#define get_imask_simd_4xn get_imask_simd256
-#define get_imask_simd_2xnn get_imask_simd128
-#else
-#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#if GMX_SIMD_WIDTH_HERE == 2
+#define get_imask_simd_4xn get_imask_simd_j2
+#endif
+#if GMX_SIMD_WIDTH_HERE == 4
+#define get_imask_simd_4xn get_imask_simd_j4
+#endif
+#if GMX_SIMD_WIDTH_HERE == 8
+#define get_imask_simd_4xn get_imask_simd_j8
+#define get_imask_simd_2xnn get_imask_simd_j4
#endif
+#if GMX_SIMD_WIDTH_HERE == 16
+#define get_imask_simd_2xnn get_imask_simd_j8
#endif
#endif
jnew = 0;
for (j = 0; j < ncj; j++)
{
- if (cj[j].excl != NBNXN_INT_MASK_ALL)
+ if (cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
{
work->cj[jnew++] = cj[j];
}
}
/* Check if there are exclusions at all or not just the first entry */
if (!((jnew == 0) ||
- (jnew == 1 && cj[0].excl != NBNXN_INT_MASK_ALL)))
+ (jnew == 1 && cj[0].excl != NBNXN_INTERACTION_MASK_ALL)))
{
for (j = 0; j < ncj; j++)
{
- if (cj[j].excl == NBNXN_INT_MASK_ALL)
+ if (cj[j].excl == NBNXN_INTERACTION_MASK_ALL)
{
work->cj[jnew++] = cj[j];
}
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#if GMX_NBNXN_SIMD_BITWIDTH != 256
-#error "unsupported SIMD width"
-#endif
-
-#include "gmx_simd_macros.h"
-
-/* Define a few macros for half-width SIMD */
-#if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
-/* Half-width SIMD real type */
-#define gmx_mm_hpr __m128
-/* Half-width SIMD operations */
-/* Load reals at half-width aligned pointer b into half-width SIMD register a */
-#define gmx_load_hpr(a,b) a = _mm_load_ps(b)
-#define gmx_set1_hpr _mm_set1_ps
-/* Load reals at half-width aligned pointer b into two halves of a */
-#define gmx_loaddh_pr(a, b) a = gmx_mm256_load4_ps(b)
-/* Store half width SIMD registers b and c in ful width register a */
-#define gmx_2hpr_to_pr(a, b, c) a = _mm256_insertf128_ps(_mm256_castps128_ps256(b), c, 0x1)
-#else
-#error "Half-width SIMD macros are not yet defined"
-#endif
+/* Get the half-width SIMD operations from the kernel utils files */
+#include "nbnxn_kernels/nbnxn_kernel_simd_utils.h"
#if GMX_SIMD_WIDTH_HERE >= 2*NBNXN_CPU_CLUSTER_I_SIZE
gmx_mm_hpr a_S;
gmx_mm_pr a_a_S;
- gmx_load_hpr(a_S, a);
+ gmx_load_hpr(&a_S, a);
- gmx_2hpr_to_pr(a_a_S, a_S, a_S);
+ gmx_2hpr_to_pr(a_S, a_S, &a_a_S);
return a_a_S;
}
gmx_mm_hpr a0_S, a1_S;
gmx_mm_pr a0_a1_S;
- a0_S = gmx_set1_hpr(a[0] + shift);
- a1_S = gmx_set1_hpr(a[1] + shift);
+ gmx_set1_hpr(&a0_S, a[0] + shift);
+ gmx_set1_hpr(&a1_S, a[1] + shift);
- gmx_2hpr_to_pr(a0_a1_S, a0_S, a1_S);
+ gmx_2hpr_to_pr(a0_S, a1_S, &a0_a1_S);
return a0_a1_S;
}
x_ci->iz_SSE2 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 2, shz);
}
-#ifndef GMX_HAVE_SIMD_ANYTRUE
+#ifndef GMX_SIMD_HAVE_ANYTRUE
/* Fallback function in case gmx_anytrue_pb is not present */
static gmx_inline gmx_bool
-gmx_anytrue_2xn_pr(gmx_mm_pr bool_S)
+gmx_anytrue_2xn_pb(gmx_mm_pb bool_S)
{
real bools_array[2*GMX_SIMD_WIDTH_HERE], *bools;
gmx_bool any;
bools = gmx_simd_align_real(bools_array);
- gmx_store_pr(bools, bool_S);
+ gmx_store_pb(bools, bool_S);
any = FALSE;
for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++)
gmx_mm_pr rsq_SSE0;
gmx_mm_pr rsq_SSE2;
- gmx_mm_pr wco_SSE0;
- gmx_mm_pr wco_SSE2;
- gmx_mm_pr wco_any_SSE;
+ gmx_mm_pb wco_SSE0;
+ gmx_mm_pb wco_SSE2;
+ gmx_mm_pb wco_any_SSE;
gmx_mm_pr rc2_SSE;
InRange = FALSE;
while (!InRange && cjf <= cjl)
{
- d2 = subc_bb_dist2_sse(4, 0, bb_ci, cjf, gridj->bbj);
+#ifdef NBNXN_SEARCH_BB_SSE
+ d2 = subc_bb_dist2_sse(0, bb_ci, cjf, gridj->bbj);
+#else
+ d2 = subc_bb_dist2(0, bb_ci, cjf, gridj->bbj);
+#endif
*ndistc += 2;
/* Check if the distance is within the distance where
wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
- wco_any_SSE = gmx_or_pr(wco_SSE0, wco_SSE2);
+ wco_any_SSE = gmx_or_pb(wco_SSE0, wco_SSE2);
-#ifdef GMX_HAVE_SIMD_ANYTRUE
- InRange = gmx_anytrue_pr(wco_any_SSE);
+#ifdef GMX_SIMD_HAVE_ANYTRUE
+ InRange = gmx_anytrue_pb(wco_any_SSE);
#else
- InRange = gmx_anytrue_2xn_pr(wco_any_SSE);
+ InRange = gmx_anytrue_2xn_pb(wco_any_SSE);
#endif
*ndistc += 2*GMX_SIMD_WIDTH_HERE;
InRange = FALSE;
while (!InRange && cjl > cjf)
{
- d2 = subc_bb_dist2_sse(4, 0, bb_ci, cjl, gridj->bbj);
+#ifdef NBNXN_SEARCH_BB_SSE
+ d2 = subc_bb_dist2_sse(0, bb_ci, cjl, gridj->bbj);
+#else
+ d2 = subc_bb_dist2(0, bb_ci, cjl, gridj->bbj);
+#endif
*ndistc += 2;
/* Check if the distance is within the distance where
wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
- wco_any_SSE = gmx_or_pr(wco_SSE0, wco_SSE2);
+ wco_any_SSE = gmx_or_pb(wco_SSE0, wco_SSE2);
-#ifdef GMX_HAVE_SIMD_ANYTRUE
- InRange = gmx_anytrue_pr(wco_any_SSE);
+#ifdef GMX_SIMD_HAVE_ANYTRUE
+ InRange = gmx_anytrue_pb(wco_any_SSE);
#else
- InRange = gmx_anytrue_2xn_pr(wco_any_SSE);
+ InRange = gmx_anytrue_2xn_pb(wco_any_SSE);
#endif
*ndistc += 2*GMX_SIMD_WIDTH_HERE;
}
#undef STRIDE_S
-
-#undef gmx_mm_hpr
-#undef gmx_load_hpr
-#undef gmx_set1_hpr
-#undef gmx_2hpr_to_pr
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#if !(GMX_NBNXN_SIMD_BITWIDTH == 128 || GMX_NBNXN_SIMD_BITWIDTH == 256)
-#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
-#endif
-
-#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
-#define GMX_USE_HALF_WIDTH_SIMD_HERE
-#endif
-#include "gmx_simd_macros.h"
#if GMX_SIMD_WIDTH_HERE >= NBNXN_CPU_CLUSTER_I_SIZE
#define STRIDE_S (GMX_SIMD_WIDTH_HERE)
x_ci->iz_SSE3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz);
}
-#ifndef GMX_HAVE_SIMD_ANYTRUE
+#ifndef GMX_SIMD_HAVE_ANYTRUE
/* Fallback function in case gmx_anytrue_pb is not present */
static gmx_inline gmx_bool
-gmx_anytrue_4xn_pr(gmx_mm_pr bool_S)
+gmx_anytrue_4xn_pb(gmx_mm_pb bool_S)
{
real bools_array[2*GMX_SIMD_WIDTH_HERE], *bools;
gmx_bool any;
bools = gmx_simd_align_real(bools_array);
- gmx_store_pr(bools, bool_S);
+ gmx_store_pb(bools, bool_S);
any = FALSE;
for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++)
gmx_mm_pr rsq_SSE2;
gmx_mm_pr rsq_SSE3;
- gmx_mm_pr wco_SSE0;
- gmx_mm_pr wco_SSE1;
- gmx_mm_pr wco_SSE2;
- gmx_mm_pr wco_SSE3;
- gmx_mm_pr wco_any_SSE01, wco_any_SSE23, wco_any_SSE;
+ gmx_mm_pb wco_SSE0;
+ gmx_mm_pb wco_SSE1;
+ gmx_mm_pb wco_SSE2;
+ gmx_mm_pb wco_SSE3;
+ gmx_mm_pb wco_any_SSE01, wco_any_SSE23, wco_any_SSE;
gmx_mm_pr rc2_SSE;
InRange = FALSE;
while (!InRange && cjf <= cjl)
{
- d2 = subc_bb_dist2_sse(4, 0, bb_ci, cjf, gridj->bbj);
+#ifdef NBNXN_SEARCH_BB_SSE
+ d2 = subc_bb_dist2_sse(0, bb_ci, cjf, gridj->bbj);
+#else
+ d2 = subc_bb_dist2(0, bb_ci, cjf, gridj->bbj);
+#endif
*ndistc += 2;
/* Check if the distance is within the distance where
wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
wco_SSE3 = gmx_cmplt_pr(rsq_SSE3, rc2_SSE);
- wco_any_SSE01 = gmx_or_pr(wco_SSE0, wco_SSE1);
- wco_any_SSE23 = gmx_or_pr(wco_SSE2, wco_SSE3);
- wco_any_SSE = gmx_or_pr(wco_any_SSE01, wco_any_SSE23);
+ wco_any_SSE01 = gmx_or_pb(wco_SSE0, wco_SSE1);
+ wco_any_SSE23 = gmx_or_pb(wco_SSE2, wco_SSE3);
+ wco_any_SSE = gmx_or_pb(wco_any_SSE01, wco_any_SSE23);
-#ifdef GMX_HAVE_SIMD_ANYTRUE
- InRange = gmx_anytrue_pr(wco_any_SSE);
+#ifdef GMX_SIMD_HAVE_ANYTRUE
+ InRange = gmx_anytrue_pb(wco_any_SSE);
#else
- InRange = gmx_anytrue_4xn_pr(wco_any_SSE);
+ InRange = gmx_anytrue_4xn_pb(wco_any_SSE);
#endif
*ndistc += 4*GMX_SIMD_WIDTH_HERE;
InRange = FALSE;
while (!InRange && cjl > cjf)
{
- d2 = subc_bb_dist2_sse(4, 0, bb_ci, cjl, gridj->bbj);
+#ifdef NBNXN_SEARCH_BB_SSE
+ d2 = subc_bb_dist2_sse(0, bb_ci, cjl, gridj->bbj);
+#else
+ d2 = subc_bb_dist2(0, bb_ci, cjl, gridj->bbj);
+#endif
*ndistc += 2;
/* Check if the distance is within the distance where
wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
wco_SSE3 = gmx_cmplt_pr(rsq_SSE3, rc2_SSE);
- wco_any_SSE01 = gmx_or_pr(wco_SSE0, wco_SSE1);
- wco_any_SSE23 = gmx_or_pr(wco_SSE2, wco_SSE3);
- wco_any_SSE = gmx_or_pr(wco_any_SSE01, wco_any_SSE23);
+ wco_any_SSE01 = gmx_or_pb(wco_SSE0, wco_SSE1);
+ wco_any_SSE23 = gmx_or_pb(wco_SSE2, wco_SSE3);
+ wco_any_SSE = gmx_or_pb(wco_any_SSE01, wco_any_SSE23);
-#ifdef GMX_HAVE_SIMD_ANYTRUE
- InRange = gmx_anytrue_pr(wco_any_SSE);
+#ifdef GMX_SIMD_HAVE_ANYTRUE
+ InRange = gmx_anytrue_pb(wco_any_SSE);
#else
- InRange = gmx_anytrue_4xn_pr(wco_any_SSE);
+ InRange = gmx_anytrue_4xn_pb(wco_any_SSE);
#endif
*ndistc += 4*GMX_SIMD_WIDTH_HERE;
}
#undef STRIDE_S
-#undef GMX_USE_HALF_WIDTH_SIMD_HERE
+
#include "gmx_cyclecounter.h"
#include "gmx_omp.h"
-/* Single precision, with SSE2 or higher available */
-#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
-#include "gmx_x86_simd_single.h"
+/* Include the SIMD macro file and then check for support */
+#include "gmx_simd_macros.h"
+#if defined GMX_HAVE_SIMD_MACROS && defined GMX_SIMD_HAVE_EXP
+/* Turn on SIMD intrinsics for PME solve */
+#define PME_SIMD
+#endif
-#define PME_SSE
+/* SIMD spread+gather only in single precision with SSE2 or higher available.
+ * We might want to switch to use gmx_simd_macros.h, but this is somewhat
+ * complicated, as we use unaligned and/or 4-wide only loads.
+ */
+#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
+#define PME_SSE_SPREAD_GATHER
+#include <emmintrin.h>
/* Some old AMD processors could have problems with unaligned loads+stores */
#ifndef GMX_FAHCORE
#define PME_SSE_UNALIGNED
typedef struct {
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
/* Masks for SSE aligned spreading and gathering */
__m128 mask_SSE0[6], mask_SSE1[6];
#else
switch (order)
{
case 4:
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
#ifdef PME_SSE_UNALIGNED
#define PME_SPREAD_SSE_ORDER4
#else
#endif
break;
case 5:
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
#define PME_SPREAD_SSE_ALIGNED
#define PME_ORDER 5
#include "pme_sse_single.h"
static void set_grid_alignment(int *pmegrid_nz, int pme_order)
{
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
if (pme_order == 5
#ifndef PME_SSE_UNALIGNED
|| pme_order == 4
static void set_gridsize_alignment(int *gridsize, int pme_order)
{
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
#ifndef PME_SSE_UNALIGNED
if (pme_order == 4)
{
srenew(work->mhy, work->nalloc);
srenew(work->mhz, work->nalloc);
srenew(work->m2, work->nalloc);
- /* Allocate an aligned pointer for SSE operations, including 3 extra
- * elements at the end since SSE operates on 4 elements at a time.
+ /* Allocate an aligned pointer for SIMD operations, including extra
+ * elements at the end for padding.
*/
+#ifdef PME_SIMD
+#define ALIGN_HERE GMX_SIMD_WIDTH_HERE
+#else
+/* Without SIMD any non-zero alignment works, so we simply use 4 */
+#define ALIGN_HERE 4
+#endif
sfree_aligned(work->denom);
sfree_aligned(work->tmp1);
sfree_aligned(work->eterm);
- snew_aligned(work->denom, work->nalloc+3, 16);
- snew_aligned(work->tmp1, work->nalloc+3, 16);
- snew_aligned(work->eterm, work->nalloc+3, 16);
+ snew_aligned(work->denom, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
+ snew_aligned(work->tmp1, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
+ snew_aligned(work->eterm, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
srenew(work->m2inv, work->nalloc);
}
}
}
-#ifdef PME_SSE
-/* Calculate exponentials through SSE in float precision */
+#ifdef PME_SIMD
+/* Calculate exponentials through SIMD */
inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
{
{
- const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
- __m128 f_sse;
- __m128 lu;
- __m128 tmp_d1, d_inv, tmp_r, tmp_e;
+ const gmx_mm_pr two = gmx_set1_pr(2.0);
+ gmx_mm_pr f_simd;
+ gmx_mm_pr lu;
+ gmx_mm_pr tmp_d1, d_inv, tmp_r, tmp_e;
int kx;
- f_sse = _mm_load1_ps(&f);
- for (kx = 0; kx < end; kx += 4)
+ f_simd = gmx_load1_pr(&f);
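+ /* Each iteration computes e[kx..] = f*exp(r[kx..])/d[kx..] for one SIMD width of elements */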
+ for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE)
{
- tmp_d1 = _mm_load_ps(d_aligned+kx);
- lu = _mm_rcp_ps(tmp_d1);
- d_inv = _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, tmp_d1)));
- tmp_r = _mm_load_ps(r_aligned+kx);
- tmp_r = gmx_mm_exp_ps(tmp_r);
- tmp_e = _mm_mul_ps(f_sse, d_inv);
- tmp_e = _mm_mul_ps(tmp_e, tmp_r);
- _mm_store_ps(e_aligned+kx, tmp_e);
+ tmp_d1 = gmx_load_pr(d_aligned+kx);
+ d_inv = gmx_inv_pr(tmp_d1);
+ tmp_r = gmx_load_pr(r_aligned+kx);
+ tmp_r = gmx_exp_pr(tmp_r);
+ tmp_e = gmx_mul_pr(f_simd, d_inv);
+ tmp_e = gmx_mul_pr(tmp_e, tmp_r);
+ gmx_store_pr(e_aligned+kx, tmp_e);
}
}
}
switch (order)
{
case 4:
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
#ifdef PME_SSE_UNALIGNED
#define PME_GATHER_F_SSE_ORDER4
#else
#endif
break;
case 5:
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
#define PME_GATHER_F_SSE_ALIGNED
#define PME_ORDER 5
#include "pme_sse_single.h"
{
pme_spline_work_t *work;
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
float tmp[8];
__m128 zero_SSE;
int of, i;