#define gmx_store_pb gmx_simd_ref_store_pb
#endif
-/* For topology exclusion pair checking we need: ((a & b) ? True : False)
- * when we do a bit-wise and between a and b.
- * When integer SIMD operations are present, we use gmx_checkbitmask_epi32(a, b)
- * Otherwise we do all operations, except for the set1, in reals.
- */
-
-#define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
-#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
-#define gmx_set1_epi32 gmx_simd_ref_set1_epi32
-#define gmx_load_si gmx_simd_ref_load_si
-#define gmx_checkbitmask_epi32 gmx_simd_ref_checkbitmask_epi32
-#endif
-
-/* #define GMX_SIMD_HAVE_CHECKBITMASK_PR */
-#ifdef GMX_SIMD_HAVE_CHECKBITMASK_PR
-#define gmx_castsi_pr gmx_simd_ref_castsi_pr
-/* As gmx_checkbitmask_epi32, but operates on reals. In double precision two
- * identical 32-bit masks are set in one double and one or both can be used.
- */
-#define gmx_checkbitmask_pr gmx_simd_ref_checkbitmask_pr
-#endif
-
/* Conversions only used for PME table lookup */
#define gmx_cvttpr_epi32 gmx_simd_ref_cvttpr_epi32
#define gmx_cvtepi32_pr gmx_simd_ref_cvtepi32_pr
#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb _mm_movemask_ps
-#define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
-#define gmx_set1_epi32 _mm_set1_epi32
-#define gmx_load_si(i) _mm_load_si128((__m128i *) (i))
-#define gmx_checkbitmask_epi32(m0, m1) gmx_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()))
-
#define gmx_cvttpr_epi32 _mm_cvttps_epi32
#define gmx_cvtepi32_pr _mm_cvtepi32_ps
#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb _mm_movemask_pd
-#define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
-#define gmx_set1_epi32 _mm_set1_epi32
-#define gmx_load_si(i) _mm_load_si128((__m128i *) (i))
-#define gmx_checkbitmask_epi32(m0, m1) gmx_mm_castsi128_pd(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()))
-
#define gmx_cvttpr_epi32 _mm_cvttpd_epi32
#define gmx_cvtepi32_pr _mm_cvtepi32_pd
#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb _mm256_movemask_ps
-#define GMX_SIMD_HAVE_CHECKBITMASK_PR
-#define gmx_set1_epi32 _mm256_set1_epi32
-#define gmx_castsi_pr _mm256_castsi256_ps
-/* With <= 16 bits used the cast and conversion should not be required,
- * since only mantissa bits are set and that would give a non-zero float,
- * but with the Intel compiler this does not work correctly.
- */
-#define gmx_checkbitmask_pr(m0, m1) _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(m0, m1))), _mm256_setzero_ps(), 0x0c)
-
#define gmx_cvttpr_epi32 _mm256_cvttps_epi32
#define gmx_rsqrt_pr _mm256_rsqrt_ps
#define gmx_acos_pr gmx_mm256_acos_ps
#define gmx_atan2_pr gmx_mm256_atan2_ps
-#else
+#else /* ifndef GMX_DOUBLE */
#define GMX_SIMD_WIDTH_HERE 4
#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb _mm256_movemask_pd
-#define GMX_SIMD_HAVE_CHECKBITMASK_PR
-#define gmx_set1_epi32 _mm256_set1_epi32
-#define gmx_castsi_pr _mm256_castsi256_pd
-/* With <= 16 bits used the cast and conversion should not be required,
- * since only mantissa bits are set and that would give a non-zero float,
- * but with the Intel compiler this does not work correctly.
- * Because AVX does not have int->double conversion, we convert via float.
- */
-#define gmx_checkbitmask_pr(m0, m1) _mm256_cmp_pd(_mm256_castps_pd(_mm256_cvtepi32_ps(_mm256_castpd_si256(_mm256_and_pd(m0, m1)))), _mm256_setzero_pd(), 0x0c)
-
#define gmx_cvttpr_epi32 _mm256_cvttpd_epi32
#define gmx_rsqrt_pr(r) _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r)))
#define gmx_acos_pr gmx_mm256_acos_pd
#define gmx_atan2_pr gmx_mm256_atan2_pd
-#endif /* GMX_DOUBLE */
+#endif /* ifndef GMX_DOUBLE */
#endif /* 128- or 256-bit x86 SIMD */
}
};
-
-/* For topology exclusion pair checking we need: ((a & b) ? True : False)
- * when we do a bit-wise and between a and b.
- * When integer SIMD operations are present, we use gmx_checkbitmask_epi32(a, b)
- * Otherwise we do all operations, except for the set1, in reals.
- */
-
-/* Integer set and cast are only used for nbnxn exclusion masks */
-static gmx_inline gmx_simd_ref_epi32
-gmx_simd_ref_set1_epi32(int src)
-{
- gmx_simd_ref_epi32 a;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- a.r[i] = src;
- }
-
- return a;
-}
-
-static gmx_inline gmx_simd_ref_epi32
-gmx_simd_ref_load_si(const int *src)
-{
- gmx_simd_ref_epi32 a;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- a.r[i] = src[i];
- }
-
- return a;
-}
-
-/* If the same bit is set in both input masks, return TRUE, else FALSE.
- * This function is only called with a single bit set in b.
- */
-static gmx_inline gmx_simd_ref_pb
-gmx_simd_ref_checkbitmask_epi32(gmx_simd_ref_epi32 a, gmx_simd_ref_epi32 b)
-{
- gmx_simd_ref_pb c;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- c.r[i] = ((a.r[i] & b.r[i]) != 0);
- }
-
- return c;
-}
-
-
/* Conversions only used for PME table lookup */
static gmx_inline gmx_simd_ref_epi32
gmx_simd_ref_cvttpr_epi32(gmx_simd_ref_pr a)
#error "unsupported SIMD width"
#endif
+#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
+
+#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
+#define UNROLLJ (GMX_SIMD_WIDTH_HERE/2)
+
+/* The stride of all the atom data arrays is equal to half the SIMD width */
+#define STRIDE (GMX_SIMD_WIDTH_HERE/2)
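+/* For example, 8-wide AVX-256 single precision gives the 2x(4+4) kernel
+ * layout with UNROLLJ = STRIDE = 4.
+ */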
+
+#if GMX_SIMD_WIDTH_HERE == 8
+#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
+#else
+#if GMX_SIMD_WIDTH_HERE == 16
+/* This is getting ridiculous; SIMD horizontal adds would help,
+ * but this is not performance critical (only used to reduce energies).
+ */
+#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7]+x[8]+x[9]+x[10]+x[11]+x[12]+x[13]+x[14]+x[15])
+#else
+#error "unsupported kernel configuration"
+#endif
+#endif
+
+
+#include "nbnxn_kernel_simd_utils.h"
+
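+/* Compute the per-lane interaction masks for one j-cluster: broadcast the
+ * packed topology exclusion bits and test them against the pre-loaded
+ * filter bits of the two pairs of i-rows.
+ */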
+static inline void
+gmx_load_simd_2xnn_interactions(int excl,
+ gmx_exclfilter filter_S0,
+ gmx_exclfilter filter_S2,
+ gmx_mm_pb *interact_S0,
+ gmx_mm_pb *interact_S2)
+{
+ /* Load integer topology exclusion interaction mask */
+ gmx_exclfilter mask_pr_S = gmx_load1_exclfilter(excl);
+ *interact_S0 = gmx_checkbitmask_pb(mask_pr_S, filter_S0);
+ *interact_S2 = gmx_checkbitmask_pb(mask_pr_S, filter_S2);
+}
/* Include all flavors of the SSE or AVX 2x(N+N) kernel loops */
ajz = ajy + STRIDE;
#ifdef CHECK_EXCLS
-#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
- {
- /* Load integer topology exclusion interaction mask */
- gmx_epi32 mask_pr_S = gmx_set1_epi32(l_cj[cjind].excl);
-
- interact_S0 = gmx_checkbitmask_epi32(mask_pr_S, filter_S0);
- interact_S2 = gmx_checkbitmask_epi32(mask_pr_S, filter_S2);
- }
-#else
-#ifdef GMX_SIMD_HAVE_CHECKBITMASK_PR
- {
- /* Integer mask set, cast to real and real mask operations */
- gmx_mm_pr mask_pr_S = gmx_castsi_pr(gmx_set1_epi32(l_cj[cjind].excl));
-
- interact_S0 = gmx_checkbitmask_pr(mask_pr_S, filter_S0);
- interact_S2 = gmx_checkbitmask_pr(mask_pr_S, filter_S2);
- }
-#else
-#error "No SIMD bitmask operation available"
-#endif
-#endif
+ gmx_load_simd_2xnn_interactions(l_cj[cjind].excl, filter_S0, filter_S2, &interact_S0, &interact_S2);
#endif /* CHECK_EXCLS */
/* load j atom coordinates */
#endif
#endif
#else /* EXCL_FORCES */
- /* No exclusion forces: remove all excluded atom pairs from the list */
+ /* No exclusion forces: remove all excluded atom pairs from the list */
wco_S0 = gmx_and_pb(wco_S0, interact_S0);
wco_S2 = gmx_and_pb(wco_S2, interact_S2);
#endif
*/
-#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
-
-#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
-#define UNROLLJ (GMX_SIMD_WIDTH_HERE/2)
-
-/* The stride of all the atom data arrays is equal to half the SIMD width */
-#define STRIDE (GMX_SIMD_WIDTH_HERE/2)
-
-#if GMX_SIMD_WIDTH_HERE == 8
-#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
-#else
-#if GMX_SIMD_WIDTH_HERE == 16
-/* This is getting ridiculous, SIMD horizontal adds would help,
- * but this is not performance critical (only used to reduce energies)
- */
-#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7]+x[8]+x[9]+x[10]+x[11]+x[12]+x[13]+x[14]+x[15])
-#else
-#error "unsupported kernel configuration"
-#endif
-#endif
-
-
-#if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
-/* AVX-256 single precision 2x(4+4) kernel,
- * we can do half SIMD-width aligned FDV0 table loads.
- */
-#define TAB_FDV0
-#endif
-
-/* Currently stride 4 for the 2 LJ parameters is hard coded */
-#define NBFP_STRIDE 4
-
-
-#include "nbnxn_kernel_simd_utils.h"
-
/* All functionality defines are set here, except for:
* CALC_ENERGIES, ENERGY_GROUPS which are defined before.
* CHECK_EXCLS, which is set just before including the inner loop contents.
const real *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
real facel;
real *nbfp_ptr;
- int nbfp_stride;
int n, ci, ci_sh;
int ish, ish3;
gmx_bool do_LJ, half_LJ, do_coul;
gmx_mm_pb diagonal_mask1_S0, diagonal_mask1_S2;
#endif
- unsigned *excl_filter;
-#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
- gmx_epi32 filter_S0, filter_S2;
-#else
- gmx_mm_pr filter_S0, filter_S2;
-#endif
+ unsigned *exclusion_filter;
+ gmx_exclfilter filter_S0, filter_S2;
gmx_mm_pr zero_S = gmx_set1_pr(0);
ljc = nbat->lj_comb;
#else
/* No combination rule used */
-#if NBFP_STRIDE == 2
- nbfp_ptr = nbat->nbfp;
-#else
-#if NBFP_STRIDE == 4
- nbfp_ptr = nbat->nbfp_s4;
-#else
-#error "Only NBFP_STRIDE 2 and 4 are currently supported"
-#endif
-#endif
- nbfp_stride = NBFP_STRIDE;
+ nbfp_ptr = (4 == nbfp_stride) ? nbat->nbfp_s4 : nbat->nbfp;
#endif
/* Load j-i for the first i */
#endif
#endif
- /* Load masks for topology exclusion masking */
-#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
-#define FILTER_STRIDE (GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE)
-#else
-#ifdef GMX_DOUBLE
-#define FILTER_STRIDE 2
-#else
-#define FILTER_STRIDE 1
-#endif
-#endif
-#if FILTER_STRIDE == 1
- excl_filter = nbat->simd_exclusion_filter1;
-#else
- excl_filter = nbat->simd_exclusion_filter2;
-#endif
- /* Here we cast the exclusion filters from unsigned * to int * or real *.
- * Since we only check bits, the actual value they represent does not
- * matter, as long as both filter and mask data are treated the same way.
+ /* Load masks for topology exclusion masking. filter_stride is
+ static const, so the conditional will be optimized away. */
+ if (1 == filter_stride)
+ {
+ exclusion_filter = nbat->simd_exclusion_filter1;
+ }
+ else /* (2 == filter_stride) */
+ {
+ exclusion_filter = nbat->simd_exclusion_filter2;
+ }
+
+ /* Here we cast the exclusion masks from unsigned * to int * or
+ * real *. Since we only check bits, the actual value they
+ * represent does not matter, as long as both mask and exclusion
+ * info are treated the same way.
*/
-#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
- filter_S0 = gmx_load_si((int *)excl_filter + 0*2*UNROLLJ*FILTER_STRIDE);
- filter_S2 = gmx_load_si((int *)excl_filter + 1*2*UNROLLJ*FILTER_STRIDE);
-#else
- filter_S0 = gmx_load_pr((real *)excl_filter + 0*2*UNROLLJ);
- filter_S2 = gmx_load_pr((real *)excl_filter + 1*2*UNROLLJ);
-#endif
-#undef FILTER_STRIDE
+ filter_S0 = gmx_load_exclusion_filter(exclusion_filter + 0*2*UNROLLJ*filter_stride);
+ filter_S2 = gmx_load_exclusion_filter(exclusion_filter + 1*2*UNROLLJ*filter_stride);
#ifdef CALC_COUL_TAB
/* Generate aligned table index pointers */
#undef CALC_SHIFTFORCES
-
-#undef UNROLLI
-#undef UNROLLJ
-#undef STRIDE
-#undef TAB_FDV0
-#undef NBFP_STRIDE
#error "unsupported SIMD width"
#endif
+#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
+
+#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
+#define UNROLLJ GMX_SIMD_WIDTH_HERE
+
+/* The stride of all the atom data arrays is max(UNROLLI,UNROLLJ) */
+#if GMX_SIMD_WIDTH_HERE >= UNROLLI
+#define STRIDE GMX_SIMD_WIDTH_HERE
+#else
+#define STRIDE UNROLLI
+#endif
+
+#if GMX_SIMD_WIDTH_HERE == 2
+#define SUM_SIMD(x) (x[0]+x[1])
+#else
+#if GMX_SIMD_WIDTH_HERE == 4
+#define SUM_SIMD(x) SUM_SIMD4(x)
+#else
+#if GMX_SIMD_WIDTH_HERE == 8
+#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
+#else
+#error "unsupported kernel configuration"
+#endif
+#endif
+#endif
+
+
+#include "nbnxn_kernel_simd_utils.h"
+
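+/* Compute the per-lane interaction masks for one j-cluster: broadcast the
+ * packed topology exclusion bits and test them against the pre-loaded
+ * filter bits of each of the four i-rows.
+ */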
+static inline void
+gmx_load_simd_4xn_interactions(int excl,
+ gmx_exclfilter filter_S0,
+ gmx_exclfilter filter_S1,
+ gmx_exclfilter filter_S2,
+ gmx_exclfilter filter_S3,
+ gmx_mm_pb *interact_S0,
+ gmx_mm_pb *interact_S1,
+ gmx_mm_pb *interact_S2,
+ gmx_mm_pb *interact_S3)
+{
+ /* Load integer interaction mask */
+ gmx_exclfilter mask_pr_S = gmx_load1_exclfilter(excl);
+ *interact_S0 = gmx_checkbitmask_pb(mask_pr_S, filter_S0);
+ *interact_S1 = gmx_checkbitmask_pb(mask_pr_S, filter_S1);
+ *interact_S2 = gmx_checkbitmask_pb(mask_pr_S, filter_S2);
+ *interact_S3 = gmx_checkbitmask_pb(mask_pr_S, filter_S3);
+}
/* Include all flavors of the SSE or AVX 4xN kernel loops */
ajz = ajy + STRIDE;
#ifdef CHECK_EXCLS
-#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
- {
- /* Load integer topology exclusion interaction mask */
- gmx_epi32 mask_pr_S = gmx_set1_epi32(l_cj[cjind].excl);
-
- interact_S0 = gmx_checkbitmask_epi32(mask_pr_S, filter_S0);
- interact_S1 = gmx_checkbitmask_epi32(mask_pr_S, filter_S1);
- interact_S2 = gmx_checkbitmask_epi32(mask_pr_S, filter_S2);
- interact_S3 = gmx_checkbitmask_epi32(mask_pr_S, filter_S3);
- }
-#else
-#ifdef GMX_SIMD_HAVE_CHECKBITMASK_PR
- {
- /* Integer mask set, cast to real and real mask operations */
- gmx_mm_pr mask_pr_S = gmx_castsi_pr(gmx_set1_epi32(l_cj[cjind].excl));
-
- interact_S0 = gmx_checkbitmask_pr(mask_pr_S, filter_S0);
- interact_S1 = gmx_checkbitmask_pr(mask_pr_S, filter_S1);
- interact_S2 = gmx_checkbitmask_pr(mask_pr_S, filter_S2);
- interact_S3 = gmx_checkbitmask_pr(mask_pr_S, filter_S3);
- }
-#else
-#error "No SIMD bitmask operation available"
-#endif
-#endif
+ gmx_load_simd_4xn_interactions(l_cj[cjind].excl, filter_S0, filter_S1, filter_S2, filter_S3, &interact_S0, &interact_S1, &interact_S2, &interact_S3);
#endif /* CHECK_EXCLS */
/* load j atom coordinates */
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
-
-#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
-#define UNROLLJ GMX_SIMD_WIDTH_HERE
-
-/* The stride of all the atom data arrays is max(UNROLLI,UNROLLJ) */
-#if GMX_SIMD_WIDTH_HERE >= UNROLLI
-#define STRIDE GMX_SIMD_WIDTH_HERE
-#else
-#define STRIDE UNROLLI
-#endif
-
-#if GMX_SIMD_WIDTH_HERE == 2
-#define SUM_SIMD(x) (x[0]+x[1])
-#else
-#if GMX_SIMD_WIDTH_HERE == 4
-#define SUM_SIMD(x) SUM_SIMD4(x)
-#else
-#if GMX_SIMD_WIDTH_HERE == 8
-#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
-#else
-#error "unsupported kernel configuration"
-#endif
-#endif
-#endif
-
-
-/* Decide if we should use the FDV0 table layout */
-#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
-/* With full AVX-256 SIMD, half SIMD-width table loads are optimal */
-#if GMX_SIMD_WIDTH_HERE/2 == 4
-#define TAB_FDV0
-#endif
-#else
-/* We use the FDV0 table layout when we can use aligned table loads */
-#if GMX_SIMD_WIDTH_HERE == 4
-#define TAB_FDV0
-#endif
-#endif
-
-/* Decide the stride for the 2 LJ parameters */
-#ifdef GMX_X86_SSE2
-#ifdef GMX_DOUBLE
-#define NBFP_STRIDE 2
-#else
-#define NBFP_STRIDE 4
-#endif
-#else
-#if GMX_SIMD_WIDTH_HERE > 4
-#define NBFP_STRIDE 4
-#else
-#define NBFP_STRIDE GMX_SIMD_WIDTH_HERE
-#endif
-#endif
-
-
-#include "nbnxn_kernel_simd_utils.h"
-
/* All functionality defines are set here, except for:
* CALC_ENERGIES, ENERGY_GROUPS which are defined before.
* CHECK_EXCLS, which is set just before including the inner loop contents.
const real *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
real facel;
real *nbfp_ptr;
- int nbfp_stride;
int n, ci, ci_sh;
int ish, ish3;
gmx_bool do_LJ, half_LJ, do_coul;
gmx_mm_pb diagonal_mask1_S0, diagonal_mask1_S1, diagonal_mask1_S2, diagonal_mask1_S3;
#endif
- unsigned *excl_filter;
-#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
- gmx_epi32 filter_S0, filter_S1, filter_S2, filter_S3;
-#else
- gmx_mm_pr filter_S0, filter_S1, filter_S2, filter_S3;
-#endif
+ unsigned *exclusion_filter;
+ gmx_exclfilter filter_S0, filter_S1, filter_S2, filter_S3;
gmx_mm_pr zero_S = gmx_set1_pr(0);
ljc = nbat->lj_comb;
#else
/* No combination rule used */
-#if NBFP_STRIDE == 2
- nbfp_ptr = nbat->nbfp;
-#else
-#if NBFP_STRIDE == 4
- nbfp_ptr = nbat->nbfp_s4;
-#else
-#error "Only NBFP_STRIDE 2 and 4 are currently supported"
-#endif
-#endif
- nbfp_stride = NBFP_STRIDE;
+ nbfp_ptr = (4 == nbfp_stride) ? nbat->nbfp_s4 : nbat->nbfp;
#endif
/* Load j-i for the first i */
#endif
#endif
- /* Load masks for topology exclusion masking */
-#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
-#define FILTER_STRIDE (GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE)
-#else
-#ifdef GMX_DOUBLE
-#define FILTER_STRIDE 2
-#else
-#define FILTER_STRIDE 1
-#endif
-#endif
-#if FILTER_STRIDE == 1
- excl_filter = nbat->simd_exclusion_filter1;
-#else
- excl_filter = nbat->simd_exclusion_filter2;
-#endif
+ /* Load masks for topology exclusion masking. filter_stride is
+ static const, so the conditional will be optimized away. */
+ if (1 == filter_stride)
+ {
+ exclusion_filter = nbat->simd_exclusion_filter1;
+ }
+ else /* (2 == filter_stride) */
+ {
+ exclusion_filter = nbat->simd_exclusion_filter2;
+ }
+
/* Here we cast the exclusion filters from unsigned * to int * or real *.
* Since we only check bits, the actual value they represent does not
* matter, as long as both filter and mask data are treated the same way.
*/
-#ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
- filter_S0 = gmx_load_si((int *)excl_filter + 0*UNROLLJ*FILTER_STRIDE);
- filter_S1 = gmx_load_si((int *)excl_filter + 1*UNROLLJ*FILTER_STRIDE);
- filter_S2 = gmx_load_si((int *)excl_filter + 2*UNROLLJ*FILTER_STRIDE);
- filter_S3 = gmx_load_si((int *)excl_filter + 3*UNROLLJ*FILTER_STRIDE);
-#else
- filter_S0 = gmx_load_pr((real *)excl_filter + 0*UNROLLJ);
- filter_S1 = gmx_load_pr((real *)excl_filter + 1*UNROLLJ);
- filter_S2 = gmx_load_pr((real *)excl_filter + 2*UNROLLJ);
- filter_S3 = gmx_load_pr((real *)excl_filter + 3*UNROLLJ);
-#endif
-#undef FILTER_STRIDE
+ filter_S0 = gmx_load_exclusion_filter(exclusion_filter + 0*UNROLLJ*filter_stride);
+ filter_S1 = gmx_load_exclusion_filter(exclusion_filter + 1*UNROLLJ*filter_stride);
+ filter_S2 = gmx_load_exclusion_filter(exclusion_filter + 2*UNROLLJ*filter_stride);
+ filter_S3 = gmx_load_exclusion_filter(exclusion_filter + 3*UNROLLJ*filter_stride);
#ifdef CALC_COUL_TAB
/* Generate aligned table index pointers */
#endif
#undef CALC_SHIFTFORCES
-
-#undef UNROLLI
-#undef UNROLLJ
-#undef STRIDE
-#undef TAB_FDV0
-#undef NBFP_STRIDE
-
-#undef GMX_USE_HALF_WIDTH_SIMD_HERE
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#ifndef _nbnxn_kernel_sse_utils_h_
-#define _nbnxn_kernel_sse_utils_h_
+#ifndef _nbnxn_kernel_simd_utils_h_
+#define _nbnxn_kernel_simd_utils_h_
-/* This files contains all functions/macros for the SIMD kernels
- * which have explicit dependencies on the j-cluster size and/or SIMD-width.
+/*! \brief Provides hardware-specific utility routines for the SIMD kernels.
+ *
+ * Defines all functions, typedefs, constants and macros that have
+ * explicit dependencies on the j-cluster size, precision, or SIMD
+ * width. This includes handling diagonal, Newton and topology
+ * exclusions.
+ *
* The functionality which depends on the j-cluster size is:
* LJ-parameter lookup
* force table lookup
* energy group pair energy storage
*/
+#if !defined GMX_NBNXN_SIMD_2XNN && !defined GMX_NBNXN_SIMD_4XN
+#error "Must define an NBNxN kernel flavour before including NBNxN kernel utility functions"
+#endif
-/* Include SIMD architecture specific versions of the 4/5 functions above */
#ifdef GMX_SIMD_REFERENCE_PLAIN_C
+
#include "nbnxn_kernel_simd_utils_ref.h"
-#else
+
+#else /* GMX_SIMD_REFERENCE_PLAIN_C */
+
#ifdef GMX_X86_SSE2
/* Include x86 SSE2 compatible SIMD functions */
+/* Set the stride for the lookup of the two LJ parameters from their
+ (padded) array. Only strides of 2 and 4 are currently supported. */
+#if defined GMX_NBNXN_SIMD_2XNN
+static const int nbfp_stride = 4;
+#elif defined GMX_DOUBLE
+static const int nbfp_stride = 2;
+#else
+static const int nbfp_stride = 4;
+#endif
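+
+/* Illustration: with nbfp_stride 4 the two LJ parameters of each atom-type
+ * pair are stored with padding as {c6, c12, pad, pad}, so the kernels can
+ * issue one aligned 4-wide load per pair and use only the first two values;
+ * with stride 2 only {c6, c12} are stored.
+ */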
+
/* Align a stack-based thread-local working array. Table loads on
* full-width AVX_256 use the array, but other implementations do
* not. */
#endif
}
-
#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
+
+/* With full AVX-256 SIMD, half SIMD-width table loads are optimal */
+#if GMX_SIMD_WIDTH_HERE == 8
+#define TAB_FDV0
+#endif
+
+/*
+Berk, 2xnn.c had the following code, but I think it is safe to remove now, given the code immediately above.
+
+#if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
+/ * AVX-256 single precision 2x(4+4) kernel,
+ * we can do half SIMD-width aligned FDV0 table loads.
+ * /
+#define TAB_FDV0
+#endif
+*/
+
#ifdef GMX_DOUBLE
#include "nbnxn_kernel_simd_utils_x86_256d.h"
-#else
+#else /* GMX_DOUBLE */
#include "nbnxn_kernel_simd_utils_x86_256s.h"
+#endif /* GMX_DOUBLE */
+
+#else /* defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */
+
+/* We use the FDV0 table layout when we can use aligned table loads */
+#if GMX_SIMD_WIDTH_HERE == 4
+#define TAB_FDV0
#endif
-#else
+
#ifdef GMX_DOUBLE
#include "nbnxn_kernel_simd_utils_x86_128d.h"
-#else
+#else /* GMX_DOUBLE */
#include "nbnxn_kernel_simd_utils_x86_128s.h"
+#endif /* GMX_DOUBLE */
+
+#endif /* defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */
+
+#else /* GMX_X86_SSE2 */
+
+#if GMX_SIMD_WIDTH_HERE > 4
+static const int nbfp_stride = 4;
+#else
+static const int nbfp_stride = GMX_SIMD_WIDTH_HERE;
#endif
-#endif
-#endif
-#endif
+
+#endif /* GMX_X86_SSE2 */
+#endif /* GMX_SIMD_REFERENCE_PLAIN_C */
#ifdef UNROLLJ
}
#endif
-#endif /* _nbnxn_kernel_sse_utils_h_ */
+#endif /* _nbnxn_kernel_simd_utils_h_ */
#ifndef _nbnxn_kernel_simd_utils_ref_h_
#define _nbnxn_kernel_simd_utils_ref_h_
-/* This files contains all functions/macros for the SIMD kernels
- * which have explicit dependencies on the j-cluster size and/or SIMD-width.
- * The functionality which depends on the j-cluster size is:
- * LJ-parameter lookup
- * force table lookup
- * energy group pair energy storage
- */
-
+typedef gmx_simd_ref_epi32 gmx_simd_ref_exclfilter;
+#define gmx_exclfilter gmx_simd_ref_exclfilter
+static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE;
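+/* filter_stride is the number of unsigned exclusion-filter values stored
+ * per real SIMD lane: 1 when the integer and real SIMD widths match, or 2
+ * when two identical 32-bit masks are packed into each double-width
+ * element.
+ */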
#if GMX_SIMD_WIDTH_HERE > 4
/* The 4xn kernel operates on 4-wide i-force registers */
}
#endif
-#ifdef NBFP_STRIDE
static gmx_inline void
load_lj_pair_params(const real *nbfp, const int *type, int aj,
gmx_simd_ref_pr *c6_S, gmx_simd_ref_pr *c12_S)
for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
{
- c6_S->r[i] = nbfp[type[aj+i]*NBFP_STRIDE];
- c12_S->r[i] = nbfp[type[aj+i]*NBFP_STRIDE+1];
+ c6_S->r[i] = nbfp[type[aj+i]*nbfp_stride];
+ c12_S->r[i] = nbfp[type[aj+i]*nbfp_stride+1];
}
}
for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
{
- c6_S->r[i] = nbfp0[type[aj+i]*NBFP_STRIDE];
- c6_S->r[GMX_SIMD_WIDTH_HERE/2 + i] = nbfp1[type[aj+i]*NBFP_STRIDE];
- c12_S->r[i] = nbfp0[type[aj+i]*NBFP_STRIDE+1];
- c12_S->r[GMX_SIMD_WIDTH_HERE/2 + i] = nbfp1[type[aj+i]*NBFP_STRIDE+1];
+ c6_S->r[i] = nbfp0[type[aj+i]*nbfp_stride];
+ c6_S->r[GMX_SIMD_WIDTH_HERE/2 + i] = nbfp1[type[aj+i]*nbfp_stride];
+ c12_S->r[i] = nbfp0[type[aj+i]*nbfp_stride+1];
+ c12_S->r[GMX_SIMD_WIDTH_HERE/2 + i] = nbfp1[type[aj+i]*nbfp_stride+1];
}
}
#endif
-#endif
+
+/* Code for loading exclusions and converting them into interactions.
+ The x86 code might use either integer- or real-type SIMD, but the
+ reference code does not need to know which. */
+
+#define gmx_load1_exclfilter(e) gmx_simd_ref_load1_exclfilter(e)
+#define gmx_load_exclusion_filter(e) gmx_simd_ref_load_exclusion_filter((const unsigned *) (e))
+#define gmx_checkbitmask_pb(m0, m1) gmx_simd_ref_checkbitmask_pb(m0, m1)
+
+static gmx_inline gmx_simd_ref_exclfilter
+gmx_simd_ref_load1_exclfilter(int src)
+{
+ gmx_simd_ref_exclfilter a;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ a.r[i] = src;
+ }
+
+ return a;
+}
+
+static gmx_inline gmx_simd_ref_exclfilter
+gmx_simd_ref_load_exclusion_filter(const unsigned *src)
+{
+ gmx_simd_ref_exclfilter a;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ a.r[i] = src[i];
+ }
+
+ return a;
+}
+
+/* For topology exclusion-pair checking we need: ((a & b) ? True :
+ * False). The x86 implementations use hardware-suitable integer-
+ * and/or real-valued SIMD operations and a bit-wise "and" to do
+ * this. The reference implementation normally uses logical operations
+ * for logic, but in this case the i- and j-atom exclusion masks
+ * computed during searching are intended to be combined with a
+ * bit-wise "and".
+ *
+ * If the same bit is set in both input masks, return TRUE, else
+ * FALSE. This function is only called with a single bit set in b.
+ */
+static gmx_inline gmx_simd_ref_pb
+gmx_simd_ref_checkbitmask_pb(gmx_simd_ref_exclfilter a, gmx_simd_ref_exclfilter b)
+{
+ gmx_simd_ref_pb c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ {
+ c.r[i] = ((a.r[i] & b.r[i]) != 0);
+ }
+
+ return c;
+}
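+
+/* Illustration (reference width 4): if a broadcasts the exclusion word 0x6
+ * and b holds one distinct bit per lane, {0x1, 0x2, 0x4, 0x8}, then the
+ * result is {FALSE, TRUE, TRUE, FALSE}.
+ */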
#endif /* _nbnxn_kernel_simd_utils_ref_h_ */
* energy group pair energy storage
*/
+#define gmx_exclfilter gmx_epi32
+static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE;
+
/* Transpose 2 double precision registers */
static gmx_inline void
gmx_mm_transpose2_op_pd(__m128d in0, __m128d in1,
for (p = 0; p < UNROLLJ; p++)
{
- clj_S[p] = _mm_load_pd(nbfp+type[aj+p]*NBFP_STRIDE);
+ clj_S[p] = _mm_load_pd(nbfp+type[aj+p]*nbfp_stride);
}
gmx_mm_transpose2_op_pd(clj_S[0], clj_S[1], c6_S, c12_S);
}
*ctabv_S = _mm_shuffle_pd(ctab_S[2], ctab_S[3], _MM_SHUFFLE2(0, 0));
}
+static gmx_inline gmx_exclfilter
+gmx_load1_exclfilter(int e)
+{
+ return _mm_set1_epi32(e);
+}
+
+static gmx_inline gmx_exclfilter
+gmx_load_exclusion_filter(const unsigned *i)
+{
+ return _mm_load_si128((__m128i *) i);
+}
+
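+/* Note that _mm_andnot_si128(m0, m1) computes (~m0) & m1, so comparing the
+ * result with zero gives TRUE exactly when the single filter bit in m1 is
+ * also set in the exclusion mask m0.
+ */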
+static gmx_inline gmx_mm_pb
+gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
+{
+ return gmx_mm_castsi128_pd(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()));
+}
+
#endif /* _nbnxn_kernel_simd_utils_x86_s128d_h_ */
* energy group pair energy storage
*/
+#define gmx_exclfilter gmx_epi32
+static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE;
+
/* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
static gmx_inline void
gmx_shuffle_4_ps_fil01_to_2_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3,
for (p = 0; p < UNROLLJ; p++)
{
/* Here we load 4 aligned floats, but we need just 2 */
- clj_S[p] = gmx_load_pr(nbfp+type[aj+p]*NBFP_STRIDE);
+ clj_S[p] = gmx_load_pr(nbfp+type[aj+p]*nbfp_stride);
}
gmx_shuffle_4_ps_fil01_to_2_ps(clj_S[0], clj_S[1], clj_S[2], clj_S[3], c6_S, c12_S);
}
*ctabv_S = gmx_shuffle_4_ps_fil2_to_1_ps(ctab_S[0], ctab_S[1], ctab_S[2], ctab_S[3]);
}
+static gmx_inline gmx_exclfilter
+gmx_load1_exclfilter(int e)
+{
+ return _mm_set1_epi32(e);
+}
+
+static gmx_inline gmx_exclfilter
+gmx_load_exclusion_filter(const unsigned *i)
+{
+ return _mm_load_si128((__m128i *) i);
+}
+
+static gmx_inline gmx_mm_pb
+gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
+{
+ return gmx_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()));
+}
+
#endif /* _nbnxn_kernel_simd_utils_x86_s128s_h_ */
* energy group pair energy storage
*/
+#define gmx_exclfilter gmx_mm_pr
+static const int filter_stride = 2;
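+/* In double precision two identical 32-bit masks are stored in each double,
+ * hence the filter stride of 2; either copy can be used for the bit check.
+ */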
+
/* Transpose 2 double precision registers */
static gmx_inline void
gmx_mm_transpose2_op_pd(__m128d in0, __m128d in1,
for (p = 0; p < UNROLLJ; p++)
{
- clj_S[p] = _mm_load_pd(nbfp+type[aj+p]*NBFP_STRIDE);
+ clj_S[p] = _mm_load_pd(nbfp+type[aj+p]*nbfp_stride);
}
gmx_mm_transpose2_op_pd(clj_S[0], clj_S[1], &c6t_S[0], &c12t_S[0]);
gmx_mm_transpose2_op_pd(clj_S[2], clj_S[3], &c6t_S[1], &c12t_S[1]);
*ctabv_S = gmx_2_m128d_to_m256d(_mm_shuffle_pd(ctab_S[4], ctab_S[5], _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(ctab_S[6], ctab_S[7], _MM_SHUFFLE2(0, 0)));
}
+static gmx_inline gmx_exclfilter
+gmx_load1_exclfilter(int e)
+{
+ return _mm256_castsi256_pd(_mm256_set1_epi32(e));
+}
+
+static gmx_inline gmx_exclfilter
+gmx_load_exclusion_filter(const unsigned *i)
+{
+ return gmx_load_pr((real *) (i));
+}
+
+static gmx_inline gmx_mm_pb
+gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
+{
+ /* With <= 16 bits used the cast and conversion should not be
+ * required, since only mantissa bits are set and that would give
+ * a non-zero float, but with the Intel compiler this does not
+ * work correctly. Because AVX does not have int->double
+ * conversion, we convert via float. */
+ return _mm256_cmp_pd(_mm256_castps_pd(_mm256_cvtepi32_ps(_mm256_castpd_si256(_mm256_and_pd(m0, m1)))), _mm256_setzero_pd(), 0x0c);
+}
+
#endif /* _nbnxn_kernel_simd_utils_x86_s256d_h_ */
* energy group pair energy storage
*/
+#define gmx_exclfilter gmx_mm_pr
+static const int filter_stride = 1;
/* The 4xn kernel operates on 4-wide i-force registers */
#define gmx_mm_pr4 __m128
#define gmx_add_pr4 _mm_add_ps
+#ifdef GMX_NBNXN_SIMD_2XNN
/* Half-width operations are required for the 2xnn kernels */
/* Half-width SIMD real type */
*c = _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 0x1);
}
+#endif /* GMX_NBNXN_SIMD_2XNN */
+
/* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
static gmx_inline void
gmx_shuffle_4_ps_fil01_to_2_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3,
for (p = 0; p < UNROLLJ; p++)
{
/* Here we load 4 aligned floats, but we need just 2 */
- clj_S[p] = _mm_load_ps(nbfp+type[aj+p]*NBFP_STRIDE);
+ clj_S[p] = _mm_load_ps(nbfp+type[aj+p]*nbfp_stride);
}
gmx_shuffle_4_ps_fil01_to_2_ps(clj_S[0], clj_S[1], clj_S[2], clj_S[3],
&c6t_S[0], &c12t_S[0]);
for (p = 0; p < UNROLLJ; p++)
{
/* Here we load 4 aligned floats, but we need just 2 */
- clj_S0[p] = _mm_load_ps(nbfp0+type[aj+p]*NBFP_STRIDE);
+ clj_S0[p] = _mm_load_ps(nbfp0+type[aj+p]*nbfp_stride);
}
for (p = 0; p < UNROLLJ; p++)
{
/* Here we load 4 aligned floats, but we need just 2 */
- clj_S1[p] = _mm_load_ps(nbfp1+type[aj+p]*NBFP_STRIDE);
+ clj_S1[p] = _mm_load_ps(nbfp1+type[aj+p]*nbfp_stride);
}
gmx_shuffle_4_ps_fil01_to_2_ps(clj_S0[0], clj_S0[1], clj_S0[2], clj_S0[3],
&c6t_S[0], &c12t_S[0]);
*ctabv_S = gmx_2_mm_to_m256(ctabvt_S[0], ctabvt_S[1]);
}
+static gmx_inline gmx_exclfilter
+gmx_load1_exclfilter(int e)
+{
+ return _mm256_castsi256_ps(_mm256_set1_epi32(e));
+}
+
+static gmx_inline gmx_exclfilter
+gmx_load_exclusion_filter(const unsigned *i)
+{
+ return gmx_load_pr((real *) (i));
+}
+
+static gmx_inline gmx_mm_pb
+gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
+{
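+ /* With <= 16 bits used the cast and conversion should not be required,
+ * since only mantissa bits are set and that would give a non-zero float,
+ * but with the Intel compiler this does not work correctly.
+ */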
+ return _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(m0, m1))), _mm256_setzero_ps(), 0x0c);
+}
+
#endif /* _nbnxn_kernel_simd_utils_x86_s256s_h_ */