#include "nbnxn_consts.h"
#include "nbnxn_internal.h"
#include "nbnxn_search.h"
-#include "nbnxn_atomdata.h"
#include "gmx_omp_nthreads.h"
/* Default nbnxn allocation routine, allocates NBNXN_MEM_ALIGN byte aligned */
}
}
#ifdef GMX_NBNXN_SIMD
/* Initializes the SIMD masking constants used by the simple (cluster-pair)
 * non-bonded kernels: the diagonal j-minus-i offsets for the 4xN and
 * 2x(N+N) kernel layouts and the per-atom exclusion bit filters.
 * All arrays are allocated NBNXN_MEM_ALIGN-byte aligned and stored in nbat;
 * ownership stays with nbat (freed with the atomdata structure).
 */
static void
nbnxn_atomdata_init_simple_exclusion_masks(nbnxn_atomdata_t *nbat)
{
    const int simd_width = GMX_SIMD_WIDTH_HERE;
    int       j;
    int       simd_excl_size;
    int       simd_4xn_diag_size;

    /* Set the diagonal cluster pair exclusion mask setup data.
     * In the kernel we check 0 < j - i to generate the masks.
     * Here we store j - i for generating the mask for the first i,
     * we subtract 0.5 to avoid rounding issues.
     * In the kernel we can subtract 1 to generate the subsequent mask.
     */
    simd_4xn_diag_size = max(NBNXN_CPU_CLUSTER_I_SIZE, simd_width);
    snew_aligned(nbat->simd_4xn_diagonal_j_minus_i, simd_4xn_diag_size, NBNXN_MEM_ALIGN);
    for (j = 0; j < simd_4xn_diag_size; j++)
    {
        nbat->simd_4xn_diagonal_j_minus_i[j] = j - 0.5;
    }

    snew_aligned(nbat->simd_2xnn_diagonal_j_minus_i, simd_width, NBNXN_MEM_ALIGN);
    for (j = 0; j < simd_width/2; j++)
    {
        /* The j-cluster size is half the SIMD width */
        nbat->simd_2xnn_diagonal_j_minus_i[j]              = j - 0.5;
        /* The next half of the SIMD width is for i + 1 */
        nbat->simd_2xnn_diagonal_j_minus_i[simd_width/2+j] = j - 1 - 0.5;
    }

    /* We use up to 32 bits for exclusion masking.
     * The same masks are used for the 4xN and 2x(N+N) kernels.
     * The masks are read either into epi32 SIMD registers or into
     * real SIMD registers (together with a cast).
     * In single precision this means the real and epi32 SIMD registers
     * are of equal size.
     * In double precision the epi32 registers can be smaller than
     * the real registers, so depending on the architecture, we might
     * need to use two, identical, 32-bit masks per real.
     */
    simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*simd_width;
    snew_aligned(nbat->simd_exclusion_filter1, simd_excl_size, NBNXN_MEM_ALIGN);
    snew_aligned(nbat->simd_exclusion_filter2, simd_excl_size*2, NBNXN_MEM_ALIGN);

    /* NOTE(review): (1U << j) assumes simd_excl_size <= 32; a shift count
     * of 32 or more is undefined behavior in C. This holds for the SIMD
     * widths these kernels support (<= 8 with 4-atom i-clusters) — confirm
     * if wider SIMD is ever enabled here.
     */
    for (j = 0; j < simd_excl_size; j++)
    {
        /* Set the consecutive bits for masking pair exclusions */
        nbat->simd_exclusion_filter1[j]       = (1U << j);
        nbat->simd_exclusion_filter2[j*2 + 0] = (1U << j);
        nbat->simd_exclusion_filter2[j*2 + 1] = (1U << j);
    }

#if (defined GMX_CPU_ACCELERATION_IBM_QPX)
    {
        /* These locals are used only in the QPX branch; declaring them
         * here avoids unused-variable warnings on all other builds.
         */
        const real simdFalse      = -1, simdTrue = 1;
        const int  qpx_simd_width = GMX_SIMD_WIDTH_HERE;
        real      *simd_interaction_array;
        int        i;

        /* The QPX kernels shouldn't do the bit masking that is done on
         * x86, because the SIMD units lack bit-wise operations. Instead,
         * we generate a vector of all 2^4 possible ways an i atom
         * interacts with its 4 j atoms. Each array entry contains
         * simd_width signed ints that are read in a single SIMD
         * load. These ints must contain values that will be interpreted
         * as true and false when loaded in the SIMD floating-point
         * registers, ie. any positive or any negative value,
         * respectively. Each array entry encodes how this i atom will
         * interact with the 4 j atoms. Matching code exists in
         * set_ci_top_excls() to generate indices into this array. Those
         * indices are used in the kernels. */
        simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
        snew_aligned(simd_interaction_array, simd_excl_size * qpx_simd_width, NBNXN_MEM_ALIGN);
        for (j = 0; j < simd_excl_size; j++)
        {
            int index = j * qpx_simd_width;
            for (i = 0; i < qpx_simd_width; i++)
            {
                /* Bit i of j says whether i-atom lane i interacts */
                simd_interaction_array[index + i] = (j & (1 << i)) ? simdTrue : simdFalse;
            }
        }
        nbat->simd_interaction_array = simd_interaction_array;
    }
#endif
}
#endif
+
/* Initializes an nbnxn_atomdata_t data structure */
void nbnxn_atomdata_init(FILE *fp,
nbnxn_atomdata_t *nbat,
#ifdef GMX_NBNXN_SIMD
if (simple)
{
- /* Set the diagonal cluster pair interaction mask setup data.
- * In the kernel we check 0 < j - i to generate the masks.
- * Here we store j - i for generating the mask for the first i (i=0);
- * we substract 0.5 to avoid rounding issues.
- * In the kernel we can subtract 1 to generate the mask for the next i.
- */
- const int simd_width = GMX_SIMD_WIDTH_HERE;
- int simd_4xn_diag_ind_size, simd_interaction_size, j;
-
- simd_4xn_diag_ind_size = max(NBNXN_CPU_CLUSTER_I_SIZE, simd_width);
- snew_aligned(nbat->simd_4xn_diagonal_j_minus_i,
- simd_4xn_diag_ind_size, NBNXN_MEM_ALIGN);
- for (j = 0; j < simd_4xn_diag_ind_size; j++)
- {
- nbat->simd_4xn_diagonal_j_minus_i[j] = j - 0.5;
- }
-
- snew_aligned(nbat->simd_2xnn_diagonal_j_minus_i,
- simd_width, NBNXN_MEM_ALIGN);
- for (j = 0; j < simd_width/2; j++)
- {
- /* The j-cluster size is half the SIMD width */
- nbat->simd_2xnn_diagonal_j_minus_i[j] = j - 0.5;
- /* The next half of the SIMD width is for i + 1 */
- nbat->simd_2xnn_diagonal_j_minus_i[simd_width/2+j] = j - 1 - 0.5;
- }
-
- /* We use up to 32 bits for exclusion masking.
- * The same masks are used for the 4xN and 2x(N+N) kernels.
- * The masks are read either into epi32 SIMD registers or into
- * real SIMD registers (together with a cast).
- * In single precision this means the real and epi32 SIMD registers
- * are of equal size.
- * In double precision the epi32 registers can be smaller than
- * the real registers, so depending on the architecture, we might
- * need to use two, identical, 32-bit masks per real.
- */
- simd_interaction_size = NBNXN_CPU_CLUSTER_I_SIZE*simd_width;
- snew_aligned(nbat->simd_exclusion_filter1, simd_interaction_size, NBNXN_MEM_ALIGN);
- snew_aligned(nbat->simd_exclusion_filter2, simd_interaction_size*2, NBNXN_MEM_ALIGN);
-
- for (j = 0; j < simd_interaction_size; j++)
- {
- /* Set the consecutive bits for filters pair exclusions masks */
- nbat->simd_exclusion_filter1[j] = (1U << j);
- nbat->simd_exclusion_filter2[j*2 + 0] = (1U << j);
- nbat->simd_exclusion_filter2[j*2 + 1] = (1U << j);
- }
+ nbnxn_atomdata_init_simple_exclusion_masks(nbat);
}
#endif