Merge release-4-6 into master

[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_atomdata.c
diff --git a/src/gromacs/mdlib/nbnxn_atomdata.c b/src/gromacs/mdlib/nbnxn_atomdata.c

index c01ffb2b4eedb800fdba45d4e423e596fd6fe085..10aa13a1a10a0f08276923f5964de027d9e1ac27 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_atomdata.c
+++ b/src/gromacs/mdlib/nbnxn_atomdata.c
@@ -664,7 +664,7 @@ void nbnxn_atomdata_init(FILE *fp,
           * In the kernel we can subtract 1 to generate the subsequent mask.
           */
          const int simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
-        int       simd_4xn_diag_size, j;
+        int       simd_4xn_diag_size, real_excl, simd_excl_size, j, s;
  
          simd_4xn_diag_size = max(NBNXN_CPU_CLUSTER_I_SIZE, simd_width);
          snew_aligned(nbat->simd_4xn_diag, simd_4xn_diag_size, NBNXN_MEM_ALIGN);
@@ -681,6 +681,26 @@ void nbnxn_atomdata_init(FILE *fp,
              /* The next half of the SIMD width is for i + 1 */
              nbat->simd_2xnn_diag[simd_width/2+j] = j - 1 - 0.5;
          }
+
+        /* We always use 32-bit integer exclusion masks. When we use
+         * double precision, we fit two integers in a double SIMD register.
+         */
+        real_excl = sizeof(real)/sizeof(*nbat->simd_excl_mask);
+        /* Set bits for use with both 4xN and 2x(N+N) kernels */
+        simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*simd_width*real_excl;
+        snew_aligned(nbat->simd_excl_mask, simd_excl_size*real_excl, NBNXN_MEM_ALIGN);
+        for (j = 0; j < simd_excl_size; j++)
+        {
+            /* Set the consecutive bits for masking pair exclusions.
+             * For double a single-bit mask would be enough.
+             * But using two bits avoids endianness issues.
+             */
+            for (s = 0; s < real_excl; s++)
+            {
+                /* Set the consecutive bits for masking pair exclusions */
+                nbat->simd_excl_mask[j*real_excl + s] = (1U << j);
+            }
+        }
      }
  #endif
  
@@ -1066,11 +1086,8 @@ nbnxn_atomdata_reduce_reals_simd(real * gmx_restrict dest,
  /* The SIMD width here is actually independent of that in the kernels,
   * but we use the same width for simplicity (usually optimal anyhow).
   */
-#if GMX_NBNXN_SIMD_BITWIDTH == 128
-#define GMX_MM128_HERE
-#endif
-#if GMX_NBNXN_SIMD_BITWIDTH == 256
-#define GMX_MM256_HERE
+#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
+#define GMX_USE_HALF_WIDTH_SIMD_HERE
  #endif
  #include "gmx_simd_macros.h"
  
@@ -1103,9 +1120,6 @@ nbnxn_atomdata_reduce_reals_simd(real * gmx_restrict dest,
              gmx_store_pr(dest+i, dest_SSE);
          }
      }
-
-#undef GMX_MM128_HERE
-#undef GMX_MM256_HERE
  #endif
  }