prepared the nbnxn code for non-x86 SIMD

author Berk Hess <hess@kth.se>

Tue, 8 Jan 2013 10:46:40 +0000 (11:46 +0100)

committer Gerrit Code Review <gerrit@gerrit.gromacs.org>

Sat, 12 Jan 2013 21:32:41 +0000 (22:32 +0100)
author Berk Hess <hess@kth.se>
Tue, 8 Jan 2013 10:46:40 +0000 (11:46 +0100)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Sat, 12 Jan 2013 21:32:41 +0000 (22:32 +0100)
diff --git a/include/gmx_simd_macros.h b/include/gmx_simd_macros.h

index 2cd5ed91cb987e38ea40b554b22f042ace3c7c7c..074b1bad06f592700d45ea8d0591dc316d448c20 100644 (file)
--- a/include/gmx_simd_macros.h
+++ b/include/gmx_simd_macros.h
@@ -45,12 +45,13 @@
   * with different settings from the same source file.
   */
  
-/* NOTE: floor and blend are NOT available with SSE2 only acceleration */
+/* NOTE: floor and blendv are NOT available with SSE2 only acceleration */
  
  #undef GMX_SIMD_WIDTH_HERE
  
  #undef gmx_epi32
  
+/* float/double SIMD register type */
  #undef gmx_mm_pr
  
  #undef gmx_load_pr
@@ -58,6 +59,7 @@
  #undef gmx_set1_pr
  #undef gmx_setzero_pr
  #undef gmx_store_pr
+/* Only used for debugging */
  #undef gmx_storeu_pr
  
  #undef gmx_add_pr
@@ -69,13 +71,18 @@
  #undef gmx_or_pr
  #undef gmx_andnot_pr
  
+/* Only used to speed up the nbnxn tabulated PME kernels */
  #undef gmx_floor_pr
+/* Only used with x86 when blendv is faster than comparison */
  #undef gmx_blendv_pr
  
  #undef gmx_movemask_pr
  
+/* Integer casts are only used for nbnxn x86 exclusion masks */
  #undef gmx_mm_castsi128_pr
+#undef gmx_mm_castsi256_pr
  
+/* Conversions only used for nbnxn x86 exclusion masks and PME table lookup */
  #undef gmx_cvttpr_epi32
  #undef gmx_cvtepi32_pr
  
@@ -83,10 +90,25 @@
  #undef gmx_calc_rsq_pr
  #undef gmx_sum4_pr
  
+/* Only required for nbnxn analytical PME kernels */
  #undef gmx_pmecorrF_pr
  #undef gmx_pmecorrV_pr
  
  
+/* Half SIMD-width types and operations only for nbnxn 2xnn search+kernels */
+#undef gmx_mm_hpr
+
+#undef gmx_load_hpr
+#undef gmx_load1_hpr
+#undef gmx_store_hpr
+#undef gmx_add_hpr
+#undef gmx_sub_hpr
+
+#undef gmx_sum4_hpr
+
+#undef gmx_2hpr_to_pr
+
+
  /* By defining GMX_MM128_HERE or GMX_MM256_HERE before including this file
   * the same intrinsics, with defines, can be compiled for either 128 or 256
   * bit wide SSE or AVX instructions.
@@ -104,6 +126,9 @@
  #error "You should not define both GMX_MM128_HERE and GMX_MM256_HERE"
  #endif
  
+
+#ifdef GMX_X86_SSE2
+
  #ifdef GMX_MM128_HERE
  
  #define gmx_epi32  __m128i
@@ -304,3 +329,5 @@
  #endif
  
  #endif /* GMX_MM256_HERE */
+
+#endif /* GMX_X86_SSE2 */
diff --git a/include/types/nb_verlet.h b/include/types/nb_verlet.h

index 8a33375c03f04564c62fabe666a5a2ddcf63f8a3..64bfe00d3e9ac3943da730e62462dd7b5c3a7e93 100644 (file)
--- a/include/types/nb_verlet.h
+++ b/include/types/nb_verlet.h
@@ -51,15 +51,16 @@ extern "C" {
  #define GMX_NBNXN_SIMD
  
  #ifdef GMX_X86_AVX_256
-/* Comment out this define to use AVX-128 kernels with AVX-256 acceleration */
+/* Note that setting this to 128 will also work with AVX-256, but slower */
  #define GMX_NBNXN_SIMD_BITWIDTH  256
  #else
  #define GMX_NBNXN_SIMD_BITWIDTH  128
  #endif
  
  /* The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
- * Currently the 2xNN SIMD kernels only make sense and are only implemented
- * with AVX-256 in single precision using a 4x4 cluster setup instead of 4x8.
+ * Currently the 2xNN SIMD kernels only make sense with:
+ *  8-way SIMD: 4x4 setup, works with AVX-256 in single precision
+ * 16-way SIMD: 4x8 setup, not used, but most of the kernel code is there
   */
  #define GMX_NBNXN_SIMD_4XN
  #if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
@@ -81,17 +82,22 @@ typedef enum
      nbnxnkNR
  } nbnxn_kernel_type;
  
-/* Note that _mm_... intrinsics can be converted to either SSE or AVX
- * depending on compiler flags.
- * For gcc we check for __AVX__
- * At least a check for icc should be added (if there is a macro)
- */
+/* Define the nbnxn kernel names for all different types defined above */
  static const char *nbnxn_kernel_name[nbnxnkNR] =
-  { "not set", "plain C",
-#if !(defined GMX_X86_SSE2)
+{
+    "not set",
+    "plain C",
+#ifndef GMX_NBNXN_SIMD
      "not available", "not available",
  #else
+#ifdef GMX_X86_SSE2
  #if GMX_NBNXN_SIMD_BITWIDTH == 128
+/* x86 SIMD intrinsics can be converted to either SSE or AVX depending
+ * on compiler flags. As we use nearly identical intrinsics, using an AVX
+ * compiler flag without an AVX macro effectively results in AVX kernels.
+ * For gcc we check for __AVX__
+ * At least a check for icc should be added (if there is a macro)
+ */
  #if !(defined GMX_X86_AVX_128_FMA || defined __AVX__)
  #ifndef GMX_X86_SSE4_1
      "SSE2", "SSE2",
@@ -101,11 +107,17 @@ static const char *nbnxn_kernel_name[nbnxnkNR] =
  #else
      "AVX-128", "AVX-128",
  #endif
-#else
-    "AVX-256",  "AVX-256",
+#endif
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+    "AVX-256", "AVX-256",
+#endif
+#else /* not GMX_X86_SSE2 */
+    "SIMD", "SIMD",
  #endif
  #endif
-    "CUDA", "plain C" };
+    "CUDA",
+    "plain C"
+};
  
  enum { ewaldexclTable, ewaldexclAnalytical };
  
diff --git a/include/types/nbnxn_pairlist.h b/include/types/nbnxn_pairlist.h

index 4d337cf1a3f4a49260a2b6becafa5b6724b62df5..ae19d7b7d8f5cfdd20f3beaf2ef366fa04d09c7a 100644 (file)
--- a/include/types/nbnxn_pairlist.h
+++ b/include/types/nbnxn_pairlist.h
@@ -232,6 +232,8 @@ typedef struct {
      int  xstride;    /* stride for a coordinate in x (usually 3 or 4)      */
      int  fstride;    /* stride for a coordinate in f (usually 3 or 4)      */
      real *x;         /* x and possibly q, size natoms*xstride              */
+    real *simd_4xn_diag;  /* indices to set the SIMD 4xN diagonal masks    */
+    real *simd_2xnn_diag; /* indices to set the SIMD 2x(N+N)diagonal masks */
      int  nout;       /* The number of force arrays                         */
      nbnxn_atomdata_output_t *out;  /* Output data structures               */
      int  nalloc;     /* Allocation size of all arrays (for x/f *x/fstride) */
diff --git a/src/mdlib/nbnxn_atomdata.c b/src/mdlib/nbnxn_atomdata.c

index a11f764914917d3c6019c117815c1567a7f14f95..0f32db1c7c723065c8a95a9533de1713c5db9b9e 100644 (file)
--- a/src/mdlib/nbnxn_atomdata.c
+++ b/src/mdlib/nbnxn_atomdata.c
@@ -51,12 +51,10 @@
  #include "nbnxn_atomdata.h"
  #include "gmx_omp_nthreads.h"
  
-/* Default nbnxn allocation routine, allocates 32 byte aligned,
- * which works for plain C and aligned SSE and AVX loads/stores.
- */
+/* Default nbnxn allocation routine, allocates NBNXN_MEM_ALIGN byte aligned */
  void nbnxn_alloc_aligned(void **ptr,size_t nbytes)
  {
-    *ptr = save_malloc_aligned("ptr",__FILE__,__LINE__,nbytes,1,32);
+    *ptr = save_malloc_aligned("ptr",__FILE__,__LINE__,nbytes,1,NBNXN_MEM_ALIGN);
  }
  
  /* Free function for memory allocated with nbnxn_alloc_aligned */
@@ -656,6 +654,38 @@ void nbnxn_atomdata_init(FILE *fp,
      nbat->xstride = (nbat->XFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
      nbat->fstride = (nbat->FFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
      nbat->x       = NULL;
+
+#ifdef GMX_NBNXN_SIMD
+    if (simple)
+    {
+        /* Set the diagonal cluster pair exclusion mask setup data.
+         * In the kernel we check 0 < j - i to generate the masks.
+         * Here we store j - i for generating the mask for the first i,
+         * we substract 0.5 to avoid rounding issues.
+         * In the kernel we can subtract 1 to generate the subsequent mask.
+         */
+        const int simd_width=GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
+        int simd_4xn_diag_size,j;
+
+        simd_4xn_diag_size = max(NBNXN_CPU_CLUSTER_I_SIZE,simd_width);
+        snew_aligned(nbat->simd_4xn_diag,simd_4xn_diag_size,NBNXN_MEM_ALIGN);
+        for(j=0; j<simd_4xn_diag_size; j++)
+        {
+            nbat->simd_4xn_diag[j] = j - 0.5;
+        }
+
+        snew_aligned(nbat->simd_2xnn_diag,simd_width,NBNXN_MEM_ALIGN);
+        for(j=0; j<simd_width/2; j++)
+        {
+            /* The j-cluster size is half the SIMD width */
+            nbat->simd_2xnn_diag[j]              = j - 0.5;
+            /* The next half of the SIMD width is for i + 1 */
+            nbat->simd_2xnn_diag[simd_width/2+j] = j - 1 - 0.5;
+        }
+    }
+#endif
+
+    /* Initialize the output data structures */
      nbat->nout    = nout;
      snew(nbat->out,nbat->nout);
      nbat->nalloc  = 0;
@@ -1027,21 +1057,22 @@ nbnxn_atomdata_reduce_reals(real * gmx_restrict dest,
  }
  
  static void
-nbnxn_atomdata_reduce_reals_x86_simd(real * gmx_restrict dest,
-                                     gmx_bool bDestSet,
-                                     real ** gmx_restrict src,
-                                     int nsrc,
-                                     int i0, int i1)
+nbnxn_atomdata_reduce_reals_simd(real * gmx_restrict dest,
+                                 gmx_bool bDestSet,
+                                 real ** gmx_restrict src,
+                                 int nsrc,
+                                 int i0, int i1)
  {
-#ifdef NBNXN_SEARCH_SSE
-/* We can use AVX256 here, but not when AVX128 kernels are selected.
- * As this reduction is not faster with AVX256 anyway, we use 128-bit SIMD.
+#ifdef GMX_NBNXN_SIMD
+/* The SIMD width here is actually independent of that in the kernels,
+ * but we use the same width for simplicity (usually optimal anyhow).
   */
-#ifdef GMX_X86_AVX_256
-#define GMX_MM256_HERE
-#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
  #define GMX_MM128_HERE
  #endif
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
+#endif
  #include "gmx_simd_macros.h"
  
      int       i,s;
@@ -1258,8 +1289,8 @@ void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t nbs,
                  }
                  if (nfptr > 0)
                  {
-#ifdef NBNXN_SEARCH_SSE
-                    nbnxn_atomdata_reduce_reals_x86_simd
+#ifdef GMX_NBNXN_SIMD
+                    nbnxn_atomdata_reduce_reals_simd
  #else
                      nbnxn_atomdata_reduce_reals
  #endif
diff --git a/src/mdlib/nbnxn_consts.h b/src/mdlib/nbnxn_consts.h

index 8af09084b4b584da8d494c5785c1156e0771a867..31212bbcacb029791e02ea88973db55a1e63a0a3 100644 (file)
--- a/src/mdlib/nbnxn_consts.h
+++ b/src/mdlib/nbnxn_consts.h
@@ -65,8 +65,11 @@ extern "C" {
  /* With GPU kernels the cluster size is 8 atoms */
  #define NBNXN_GPU_CLUSTER_SIZE         8
  
-/* With GPU kernels we group cluster pairs in 4 to optimize memory usage */
-#define NBNXN_GPU_JGROUP_SIZE  4
+/* With GPU kernels we group cluster pairs in 4 to optimize memory usage.
+ * To change this, also change nbnxn_cj4_t in include/types/nbnxn_pairlist.h.
+ */
+#define NBNXN_GPU_JGROUP_SIZE       4
+#define NBNXN_GPU_JGROUP_SIZE_2LOG  2
  
  /* To avoid NaN when excluded atoms are at zero distance, we add a small
   * number to r^2. NBNXN_AVOID_SING_R2_INC^-3 should fit in real.
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh

index 96c03019a00af64e046139d5a5a122a4b4fb57d2..df19e73adf465d664244c5dd40c723714cac8fd3 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -230,9 +230,9 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
  #if !defined PRUNE_NBL && __CUDA_ARCH__ < 300 && CUDA_VERSION >= 4010
  #pragma unroll 4
  #endif
-            for (jm = 0; jm < 4; jm++)
+            for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
              {
-                if (imask & (255U << (jm * NCL_PER_SUPERCL)))
+                if (imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL)))
                  {
                      mask_ji = (1U << (jm * NCL_PER_SUPERCL));
  
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh

index 93baef62600bc7f7f5354ace99c6db9b0d2665e1..f661abec2b828a2c88a2ea78fc2b381d6ed7857d 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh
@@ -209,9 +209,9 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
  #if CUDA_VERSION >= 4010
              #pragma unroll 4
  #endif
-            for (jm = 0; jm < 4; jm++)
+            for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
              {
-                imask_j = (imask >> (jm * 8)) & 255U;
+                imask_j = (imask >> (jm * CL_SIZE)) & supercl_interaction_mask;
                  if (imask_j)
                  {
                      nsubi = __popc(imask_j);
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

index 9d49d9c85dd706af61e41fb897ad5dca51714879..9b900f95c940435fd0e2d0869f845bbbabf4618d 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
@@ -51,6 +51,9 @@
  #define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
  #define FBUF_STRIDE                 (CL_SIZE_SQ)
  
+/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
+const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
+
  /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
   *  Original idea: OpenMM
   */
diff --git a/src/mdlib/nbnxn_internal.h b/src/mdlib/nbnxn_internal.h

index 1a7f5bfcea634b6722f5851c9198f2b9f0a692d7..a9c74897a1e6abae6db194aa3ab3264b055ba4bc 100644 (file)
--- a/src/mdlib/nbnxn_internal.h
+++ b/src/mdlib/nbnxn_internal.h
@@ -49,7 +49,17 @@ extern "C" {
  
  
  #ifdef GMX_X86_SSE2
-#define NBNXN_SEARCH_SSE
+/* Use 4-way SIMD for, always, single precision bounding box calculations */
+#define NBNXN_SEARCH_BB_SSE
+#endif
+
+
+#ifdef GMX_NBNXN_SIMD
+/* Memory alignment in bytes as required by SIMD aligned loads/stores */
+#define NBNXN_MEM_ALIGN  (GMX_NBNXN_SIMD_BITWIDTH/8)
+#else
+/* No alignment required, but set it so we can call the same routines */
+#define NBNXN_MEM_ALIGN  32
  #endif
  
  
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c b/src/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c

index 3532f6f8e7a87a27cf986524f0ecf62be390f3c4..dc2e522e6d6f96ac7ac5fef9658b0dc092f05169 100644 (file)
--- a/src/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c
+++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c
@@ -188,7 +188,7 @@ nbnxn_kernel_gpu_ref(const nbnxn_pairlist_t     *nbl,
              excl[0]           = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
              excl[1]           = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
  
-            for(jm=0; jm<4; jm++)
+            for(jm=0; jm<NBNXN_GPU_JGROUP_SIZE; jm++)
              {
                  cj               = nbl->cj4[cj4_ind].cj[jm];
  
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h

index f656e4d6dd05eba0c8023d140866291090052cc0..f43c9e5eac877ef5ec67ffca6559ef2889d38370 100644 (file)
--- a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h
+++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h
@@ -188,12 +188,15 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
      gmx_mm_pr  mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
      gmx_mm_pr  mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));
  
-    gmx_mm_pr  diag_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
-    gmx_mm_pr  diag_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
+    gmx_mm_pr diag_jmi_SSE;
+#if UNROLLI == UNROLLJ
+    gmx_mm_pr diag_SSE0,diag_SSE2;
+#else
+    gmx_mm_pr diag0_SSE0,diag0_SSE2;
+    gmx_mm_pr diag1_SSE0,diag1_SSE2;
+#endif
  
-#ifdef GMX_X86_SSE4_1
      gmx_mm_pr  zero_SSE = gmx_set1_pr(0);
-#endif
  
      gmx_mm_pr  one_SSE=gmx_set1_pr(1.0);
      gmx_mm_pr  iq_SSE0=gmx_setzero_pr();
@@ -290,6 +293,29 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
      nbfp_stride = NBFP_STRIDE;
  #endif
  
+    /* Load j-i for the first i */
+    diag_jmi_SSE = gmx_load_pr(nbat->simd_2xnn_diag);
+    /* Generate all the diagonal masks as comparison results */
+#if UNROLLI == UNROLLJ
+    diag_SSE0    = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag_SSE2    = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+#else
+#if 2*UNROLLI == UNROLLJ
+    diag0_SSE0 = gmx_cmplt_pr(diag_i_SSE,diag_j_SSE);
+    diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
+    diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
+    diag0_SSE2 = gmx_cmplt_pr(diag_i_SSE,diag_j_SSE);
+    diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
+    diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
+    diag1_SSE0 = gmx_cmplt_pr(diag_i_SSE,diag_j_SSE);
+    diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
+    diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
+    diag1_SSE2 = gmx_cmplt_pr(diag_i_SSE,diag_j_SSE);
+#endif
+#endif
+
  #ifdef CALC_COUL_TAB
  #ifdef GMX_MM256_HERE
      /* Generate aligned table index pointers */
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_inner.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_inner.h

index 1676f1f43dc38c85901a37d8744d2afe45bdfee7..602922c9457fc77596061eb896a2f7035846d95a 100644 (file)
--- a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_inner.h
+++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_inner.h
@@ -286,18 +286,18 @@
              ajz           = ajy + STRIDE;
  
  #ifdef CHECK_EXCLS
-#ifndef GMX_MM256_HERE
+#if defined GMX_X86_SSE2 && defined GMX_MM128_HERE
              {
                  /* Load integer interaction mask */
                  __m128i mask_int = _mm_set1_epi32(l_cj[cjind].excl);
  
-                /* The is no unequal sse instruction, so we need a not here */
                  int_SSE0  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask0),zeroi_SSE));
                  int_SSE1  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask1),zeroi_SSE));
                  int_SSE2  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask2),zeroi_SSE));
                  int_SSE3  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask3),zeroi_SSE));
              }
-#else
+#endif
+#if defined GMX_X86_SSE2 && defined GMX_MM256_HERE
              {
  #ifndef GMX_DOUBLE
                  /* Load integer interaction mask */
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h

index ee6e0051f1b7e42bc02118e58ecce16ac444a902..db8602c7c668ac393ed077bd1b1cb919c6e72618 100644 (file)
--- a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h
+++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h
@@ -231,47 +231,18 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
  #endif
  #endif
  
-#ifdef GMX_MM128_HERE
-#ifndef GMX_DOUBLE
-    __m128     diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
-    __m128     diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
-    __m128     diag_SSE2 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
-    __m128     diag_SSE3 = gmx_mm_castsi128_pr( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-#else
-    __m128d    diag0_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
-    __m128d    diag0_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-    __m128d    diag0_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-    __m128d    diag0_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-    __m128d    diag1_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
-    __m128d    diag1_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
-    __m128d    diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
-    __m128d    diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-#endif
-#endif
-#ifdef GMX_MM256_HERE
-#ifndef GMX_DOUBLE
-    gmx_mm_pr  diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
-    gmx_mm_pr  diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
-    gmx_mm_pr  diag0_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
-    gmx_mm_pr  diag0_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-    gmx_mm_pr  diag1_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-    gmx_mm_pr  diag1_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-    gmx_mm_pr  diag1_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-    gmx_mm_pr  diag1_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
+    gmx_mm_pr diag_jmi_SSE;
+#if UNROLLI == UNROLLJ
+    gmx_mm_pr diag_SSE0,diag_SSE1,diag_SSE2,diag_SSE3;
  #else
-    gmx_mm_pr  diag_SSE0 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
-    gmx_mm_pr  diag_SSE1 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-    gmx_mm_pr  diag_SSE2 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-    gmx_mm_pr  diag_SSE3 = _mm256_castsi256_pd( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-#endif
+    gmx_mm_pr diag0_SSE0,diag0_SSE1,diag0_SSE2,diag0_SSE3;
+    gmx_mm_pr diag1_SSE0,diag1_SSE1,diag1_SSE2,diag1_SSE3;
  #endif
  
-#ifdef GMX_MM128_HERE
+#if defined GMX_X86_SSE2 && defined GMX_MM128_HERE
      __m128i    zeroi_SSE = _mm_setzero_si128();
  #endif
-#ifdef GMX_X86_SSE4_1
      gmx_mm_pr  zero_SSE = gmx_set1_pr(0);
-#endif
  
      gmx_mm_pr  one_SSE=gmx_set1_pr(1.0);
      gmx_mm_pr  iq_SSE0=gmx_setzero_pr();
@@ -376,6 +347,43 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
      nbfp_stride = NBFP_STRIDE;
  #endif
  
+    /* Load j-i for the first i */
+    diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag);
+    /* Generate all the diagonal masks as comparison results */
+#if UNROLLI == UNROLLJ
+    diag_SSE0    = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag_SSE1    = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag_SSE2    = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag_SSE3    = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+#else
+#if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
+    diag0_SSE0   = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag0_SSE1   = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag0_SSE2   = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag0_SSE3   = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+
+#if UNROLLI == 2*UNROLLJ
+    /* Load j-i for the second half of the j-cluster */
+    diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag+UNROLLJ);
+#endif
+
+    diag1_SSE0   = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag1_SSE1   = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag1_SSE2   = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+    diag1_SSE3   = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+#endif
+#endif
+
  #ifdef CALC_COUL_TAB
  #ifdef GMX_MM256_HERE
      /* Generate aligned table index pointers */
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h

index dc7112e7a3e9d74b6a6a7f0595413234e12e9f55..4e904dc61d7687ea18c1df9d8a1f63e589709fee 100644 (file)
--- a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
+++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
@@ -37,15 +37,16 @@
  #ifndef _nbnxn_kernel_sse_utils_h_
  #define _nbnxn_kernel_sse_utils_h_
  
-/* This files contains all functions/macros for the SSE/AVX kernels
- * which have explicit dependencies on the j-size / SIMD-width, which
- * can be 2 (SSE-double), 4 (SSE-single,AVX-double) or 8 (AVX-single).
+/* This files contains all functions/macros for the SIMD kernels
+ * which have explicit dependencies on the j-cluster size and/or SIMD-width.
   * The functionality which depends on the j-cluster size is:
   *   LJ-parameter lookup
   *   force table lookup
   *   energy group pair energy storage
   */
  
+#ifdef GMX_X86_SSE2
+
  /* Transpose 2 double precision registers */
  #define GMX_MM_TRANSPOSE2_OP_PD(in0,in1,out0,out1)                      \
  {                                                                       \
@@ -566,4 +567,6 @@ static inline void add_ener_grp_halves(gmx_mm_pr e_SSE,
  }
  #endif
  
+#endif /* GMX_X86_SSE2 */
+
  #endif /* _nbnxn_kernel_sse_utils_h_ */
diff --git a/src/mdlib/nbnxn_search.c b/src/mdlib/nbnxn_search.c

index 6b39abd7818db0c19854936c306fc6de35914423..000f0806bf0f14f39601aad215c8a35f07553e4d 100644 (file)
--- a/src/mdlib/nbnxn_search.c
+++ b/src/mdlib/nbnxn_search.c
@@ -75,26 +75,29 @@
  #define BBU_Z  6
  
  
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
+/* We use SSE or AVX-128bit for bounding box calculations */
  
  #ifndef GMX_DOUBLE
+/* Single precision BBs + coordinates, we can also load coordinates using SSE */
  #define NBNXN_SEARCH_SSE_SINGLE
  #endif
  
  /* Include basic SSE2 stuff */
  #include <emmintrin.h>
  
-#if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
-#define NBNXN_8BB_SSE
+#if defined NBNXN_SEARCH_SSE_SINGLE && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
+/* Store bounding boxes with x, y and z coordinates in packs of 4 */
+#define NBNXN_PBB_SSE
  #endif
  
  /* The width of SSE/AVX128 with single precision for bounding boxes with GPU.
   * Here AVX-256 turns out to be slightly slower than AVX-128.
   */
-#define STRIDE_8BB        4
-#define STRIDE_8BB_2LOG   2
+#define STRIDE_PBB        4
+#define STRIDE_PBB_2LOG   2
  
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* NBNXN_SEARCH_BB_SSE */
  
  #ifdef GMX_NBNXN_SIMD
  
@@ -170,11 +173,11 @@
  #define NBNXN_INT_MASK_DIAG_J8_1  0x0080c0e0
  
  
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
  /* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
  #define NBNXN_BBXXXX
  /* Size of bounding box corners quadruplet */
-#define NNBSBB_XXXX      (NNBSBB_D*DIM*STRIDE_8BB)
+#define NNBSBB_XXXX      (NNBSBB_D*DIM*STRIDE_PBB)
  #endif
  
  /* We shift the i-particles backward for PBC.
@@ -521,8 +524,8 @@ static int set_grid_size_xy(const nbnxn_search_t nbs,
          grid->nc_nalloc = over_alloc_large(nc_max);
          srenew(grid->nsubc,grid->nc_nalloc);
          srenew(grid->bbcz,grid->nc_nalloc*NNBSBB_D);
-#ifdef NBNXN_8BB_SSE
-        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX;
+#ifdef NBNXN_PBB_SSE
+        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
  #else
          bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
  #endif
@@ -787,7 +790,7 @@ static void calc_bounding_box_x_x8(int na,const real *x,float *bb)
      bb[BBU_Z] = R2F_U(zh);
  }
  
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
  
  /* Packed coordinates, bb order xyz0 */
  static void calc_bounding_box_x_x4_halves(int na,const real *x,
@@ -839,15 +842,15 @@ static void calc_bounding_box_xxxx(int na,int stride,const real *x,float *bb)
          i += stride;
      }
      /* Note: possible double to float conversion here */
-    bb[0*STRIDE_8BB] = R2F_D(xl);
-    bb[1*STRIDE_8BB] = R2F_D(yl);
-    bb[2*STRIDE_8BB] = R2F_D(zl);
-    bb[3*STRIDE_8BB] = R2F_U(xh);
-    bb[4*STRIDE_8BB] = R2F_U(yh);
-    bb[5*STRIDE_8BB] = R2F_U(zh);
+    bb[0*STRIDE_PBB] = R2F_D(xl);
+    bb[1*STRIDE_PBB] = R2F_D(yl);
+    bb[2*STRIDE_PBB] = R2F_D(zl);
+    bb[3*STRIDE_PBB] = R2F_U(xh);
+    bb[4*STRIDE_PBB] = R2F_U(yh);
+    bb[5*STRIDE_PBB] = R2F_U(zh);
  }
  
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* NBNXN_SEARCH_BB_SSE */
  
  #ifdef NBNXN_SEARCH_SSE_SINGLE
  
@@ -880,17 +883,17 @@ static void calc_bounding_box_xxxx_sse(int na,const float *x,
  {
      calc_bounding_box_sse(na,x,bb_work);
  
-    bb[0*STRIDE_8BB] = bb_work[BBL_X];
-    bb[1*STRIDE_8BB] = bb_work[BBL_Y];
-    bb[2*STRIDE_8BB] = bb_work[BBL_Z];
-    bb[3*STRIDE_8BB] = bb_work[BBU_X];
-    bb[4*STRIDE_8BB] = bb_work[BBU_Y];
-    bb[5*STRIDE_8BB] = bb_work[BBU_Z];
+    bb[0*STRIDE_PBB] = bb_work[BBL_X];
+    bb[1*STRIDE_PBB] = bb_work[BBL_Y];
+    bb[2*STRIDE_PBB] = bb_work[BBL_Z];
+    bb[3*STRIDE_PBB] = bb_work[BBU_X];
+    bb[4*STRIDE_PBB] = bb_work[BBU_Y];
+    bb[5*STRIDE_PBB] = bb_work[BBU_Z];
  }
  
  #endif /* NBNXN_SEARCH_SSE_SINGLE */
  
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
  
  /* Combines pairs of consecutive bounding boxes */
  static void combine_bounding_box_pairs(nbnxn_grid_t *grid,const float *bb)
@@ -969,18 +972,18 @@ static void print_bbsizes_supersub(FILE *fp,
      for(c=0; c<grid->nc; c++)
      {
  #ifdef NBNXN_BBXXXX
-        for(s=0; s<grid->nsubc[c]; s+=STRIDE_8BB)
+        for(s=0; s<grid->nsubc[c]; s+=STRIDE_PBB)
          {
              int cs_w,i,d;
  
-            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_8BB;
-            for(i=0; i<STRIDE_8BB; i++)
+            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
+            for(i=0; i<STRIDE_PBB; i++)
              {
                  for(d=0; d<DIM; d++)
                  {
                      ba[d] +=
-                        grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_8BB+i] -
-                        grid->bb[cs_w*NNBSBB_XXXX+     d *STRIDE_8BB+i];
+                        grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
+                        grid->bb[cs_w*NNBSBB_XXXX+     d *STRIDE_PBB+i];
                  }
              }
          }
@@ -1124,7 +1127,7 @@ void fill_cell(const nbnxn_search_t nbs,
          offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
          bb_ptr = grid->bb + offset;
  
-#if defined GMX_DOUBLE && defined NBNXN_SEARCH_SSE
+#if defined GMX_DOUBLE && defined NBNXN_SEARCH_BB_SSE
          if (2*grid->na_cj == grid->na_c)
          {
              calc_bounding_box_x_x4_halves(na,nbat->x+X4_IND_A(a0),bb_ptr,
@@ -1152,8 +1155,8 @@ void fill_cell(const nbnxn_search_t nbs,
                               */
          bb_ptr =
              grid->bb +
-            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_8BB_2LOG))*NNBSBB_XXXX +
-            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_8BB-1));
+            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
+            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
  
  #ifdef NBNXN_SEARCH_SSE_SINGLE
          if (nbat->XFormat == nbatXYZQ)
@@ -1171,9 +1174,9 @@ void fill_cell(const nbnxn_search_t nbs,
          {
              fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                      sx,sy,sz,
-                    bb_ptr[0*STRIDE_8BB],bb_ptr[3*STRIDE_8BB],
-                    bb_ptr[1*STRIDE_8BB],bb_ptr[4*STRIDE_8BB],
-                    bb_ptr[2*STRIDE_8BB],bb_ptr[5*STRIDE_8BB]);
+                    bb_ptr[0*STRIDE_PBB],bb_ptr[3*STRIDE_PBB],
+                    bb_ptr[1*STRIDE_PBB],bb_ptr[4*STRIDE_PBB],
+                    bb_ptr[2*STRIDE_PBB],bb_ptr[5*STRIDE_PBB]);
          }
      }
  #endif
@@ -1627,7 +1630,7 @@ static void calc_cell_indices(const nbnxn_search_t nbs,
          }
      }
  
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
      if (grid->bSimple && nbat->XFormat == nbatX8)
      {
          combine_bounding_box_pairs(grid,grid->bb);
@@ -1901,7 +1904,7 @@ void nbnxn_grid_add_simple(nbnxn_search_t nbs,
          }
      }
  
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
      if (grid->bSimple && nbat->XFormat == nbatX8)
      {
          combine_bounding_box_pairs(grid,grid->bb_simple);
@@ -2039,7 +2042,7 @@ static float subc_bb_dist2(int si,const float *bb_i_ci,
      return d2;
  }
  
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
  
  /* SSE code for bb distance for bb format xyz0 */
  static float subc_bb_dist2_sse(int na_c,
@@ -2108,12 +2111,12 @@ static float subc_bb_dist2_sse(int na_c,
                                                   \
      shi = si*NNBSBB_D*DIM;                       \
                                                   \
-    xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_8BB);   \
-    yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_8BB);   \
-    zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_8BB);   \
-    xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_8BB);   \
-    yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_8BB);   \
-    zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_8BB);   \
+    xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_PBB);   \
+    yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_PBB);   \
+    zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_PBB);   \
+    xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_PBB);   \
+    yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_PBB);   \
+    zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_PBB);   \
                                                   \
      dx_0 = _mm_sub_ps(xi_l,xj_h);                \
      dy_0 = _mm_sub_ps(yi_l,yj_h);                \
@@ -2155,24 +2158,24 @@ static void subc_bb_dist2_sse_xxxx(const float *bb_j,
  
      zero = _mm_setzero_ps();
  
-    xj_l = _mm_set1_ps(bb_j[0*STRIDE_8BB]);
-    yj_l = _mm_set1_ps(bb_j[1*STRIDE_8BB]);
-    zj_l = _mm_set1_ps(bb_j[2*STRIDE_8BB]);
-    xj_h = _mm_set1_ps(bb_j[3*STRIDE_8BB]);
-    yj_h = _mm_set1_ps(bb_j[4*STRIDE_8BB]);
-    zj_h = _mm_set1_ps(bb_j[5*STRIDE_8BB]);
+    xj_l = _mm_set1_ps(bb_j[0*STRIDE_PBB]);
+    yj_l = _mm_set1_ps(bb_j[1*STRIDE_PBB]);
+    zj_l = _mm_set1_ps(bb_j[2*STRIDE_PBB]);
+    xj_h = _mm_set1_ps(bb_j[3*STRIDE_PBB]);
+    yj_h = _mm_set1_ps(bb_j[4*STRIDE_PBB]);
+    zj_h = _mm_set1_ps(bb_j[5*STRIDE_PBB]);
  
-    /* Here we "loop" over si (0,STRIDE_8BB) from 0 to nsi with step STRIDE_8BB.
+    /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
       * But as we know the number of iterations is 1 or 2, we unroll manually.
       */
      SUBC_BB_DIST2_SSE_XXXX_INNER(0,bb_i,d2);
-    if (STRIDE_8BB < nsi)
+    if (STRIDE_PBB < nsi)
      {
-        SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_8BB,bb_i,d2);
+        SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_PBB,bb_i,d2);
      }
  }
  
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* NBNXN_SEARCH_BB_SSE */
  
  /* Plain C function which determines if any atom pair between two cells
   * is within distance sqrt(rl2).
@@ -2225,13 +2228,13 @@ static gmx_bool subc_in_range_sse8(int na_c,
  
      rc2_SSE   = _mm_set1_ps(rl2);
  
-    na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_8BB;
-    ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_8BB);
-    iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_8BB);
-    iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_8BB);
-    ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_8BB);
-    iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_8BB);
-    iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_8BB);
+    na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB;
+    ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_PBB);
+    iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_PBB);
+    iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_PBB);
+    ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_PBB);
+    iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_PBB);
+    iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_PBB);
  
      /* We loop from the outer to the inner particles to maximize
       * the chance that we find a pair in range quickly and return.
@@ -2317,13 +2320,13 @@ static gmx_bool subc_in_range_sse8(int na_c,
  /* Returns the j sub-cell for index cj_ind */
  static int nbl_cj(const nbnxn_pairlist_t *nbl,int cj_ind)
  {
-    return nbl->cj4[cj_ind>>2].cj[cj_ind & 3];
+    return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].cj[cj_ind & (NBNXN_GPU_JGROUP_SIZE - 1)];
  }
  
  /* Returns the i-interaction mask of the j sub-cell for index cj_ind */
  static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl,int cj_ind)
  {
-    return nbl->cj4[cj_ind>>2].imei[0].imask;
+    return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].imei[0].imask;
  }
  
  /* Ensures there is enough space for extra extra exclusion masks */
@@ -2370,7 +2373,7 @@ static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
      /* We can store 4 j-subcell - i-supercell pairs in one struct.
       * since we round down, we need one extra entry.
       */
-    ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + 4-1) >> 2);
+    ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
  
      if (ncj4_max > nbl->cj4_nalloc)
      {
@@ -2460,16 +2463,16 @@ static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
  
      snew(nbl->work,1);
  #ifdef NBNXN_BBXXXX
-    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,32);
+    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX,NBNXN_MEM_ALIGN);
  #else
-    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,32);
+    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,NBNXN_MEM_ALIGN);
  #endif
-    snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,32);
+    snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,NBNXN_MEM_ALIGN);
  #ifdef GMX_NBNXN_SIMD
-    snew_aligned(nbl->work->x_ci_simd_4xn,1,32);
-    snew_aligned(nbl->work->x_ci_simd_2xnn,1,32);
+    snew_aligned(nbl->work->x_ci_simd_4xn,1,NBNXN_MEM_ALIGN);
+    snew_aligned(nbl->work->x_ci_simd_2xnn,1,NBNXN_MEM_ALIGN);
  #endif
-    snew_aligned(nbl->work->d2,GPU_NSUBCELL,32);
+    snew_aligned(nbl->work->d2,GPU_NSUBCELL,NBNXN_MEM_ALIGN);
  }
  
  void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
@@ -2585,7 +2588,7 @@ static void print_nblist_statistics_supersub(FILE *fp,const nbnxn_pairlist_t *nb
      fprintf(fp,"nbl average j super cell list length %.1f\n",
              0.25*nbl->ncj4/(double)nbl->nsci);
      fprintf(fp,"nbl average i sub cell list length %.1f\n",
-            nbl->nci_tot/(0.25*nbl->ncj4));
+            nbl->nci_tot/((double)nbl->ncj4));
  
      for(si=0; si<=GPU_NSUBCELL; si++)
      {
@@ -2595,7 +2598,7 @@ static void print_nblist_statistics_supersub(FILE *fp,const nbnxn_pairlist_t *nb
      {
          for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
          {
-            for(j=0; j<4; j++)
+            for(j=0; j<NBNXN_GPU_JGROUP_SIZE; j++)
              {
                  b = 0;
                  for(si=0; si<GPU_NSUBCELL; si++)
@@ -2712,8 +2715,8 @@ static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
          w = (ej>>2);
          for(ei=ej; ei<nbl->na_ci; ei++)
          {
-            excl[w]->pair[(ej&(4-1))*nbl->na_ci+ei] &=
-                ~(1U << (sj_offset*GPU_NSUBCELL+si));
+            excl[w]->pair[(ej & (NBNXN_GPU_JGROUP_SIZE-1))*nbl->na_ci + ei] &=
+                ~(1U << (sj_offset*GPU_NSUBCELL + si));
          }
      }
  }
@@ -2924,7 +2927,7 @@ static void make_cluster_list_supersub(const nbnxn_search_t nbs,
  
      for(cjo=0; cjo<gridj->nsubc[scj]; cjo++)
      {
-        cj4_ind   = (nbl->work->cj_ind >> 2);
+        cj4_ind   = (nbl->work->cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG);
          cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
          cj4       = &nbl->cj4[cj4_ind];
  
@@ -2947,7 +2950,7 @@ static void make_cluster_list_supersub(const nbnxn_search_t nbs,
  
  #ifdef NBNXN_BBXXXX
          /* Determine all ci1 bb distances in one call with SSE */
-        subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_8BB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_8BB-1)),
+        subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
                                 ci1,bb_ci,d2l);
          *ndistc += na_c*2;
  #endif
@@ -3003,7 +3006,7 @@ static void make_cluster_list_supersub(const nbnxn_search_t nbs,
          {
              /* Avoid using function pointers here, as it's slower */
              if (
-#ifdef NBNXN_8BB_SSE
+#ifdef NBNXN_PBB_SSE
                  !subc_in_range_sse8
  #else
                  !subc_in_range_x
@@ -3041,7 +3044,8 @@ static void make_cluster_list_supersub(const nbnxn_search_t nbs,
              nbl->nci_tot += npair;
  
              /* Increase the closing index in i super-cell list */
-            nbl->sci[nbl->nsci].cj4_ind_end = ((nbl->work->cj_ind+4-1)>>2);
+            nbl->sci[nbl->nsci].cj4_ind_end =
+                ((nbl->work->cj_ind+NBNXN_GPU_JGROUP_SIZE-1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
          }
      }
  }
@@ -3099,7 +3103,7 @@ static void set_ci_top_excls(const nbnxn_search_t nbs,
              ndirect++;
          }
      }
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
      else
      {
          while (cj_ind_first + ndirect <= cj_ind_last &&
@@ -3313,22 +3317,25 @@ static void set_sci_top_excls(const nbnxn_search_t nbs,
                          inner_e = ge - se*na_c;
  
  /* Macro for getting the index of atom a within a cluster */
-#define AMODI(a)  ((a) & (NBNXN_CPU_CLUSTER_I_SIZE - 1))
+#define AMODCJ4(a)  ((a) & (NBNXN_GPU_JGROUP_SIZE - 1))
  /* Macro for converting an atom number to a cluster number */
-#define A2CI(a)   ((a) >> NBNXN_CPU_CLUSTER_I_SIZE_2LOG)
+#define A2CJ4(a)    ((a) >> NBNXN_GPU_JGROUP_SIZE_2LOG)
+/* Macro for getting the index of an i-atom within a warp */
+#define AMODWI(a)   ((a) & (NBNXN_GPU_CLUSTER_SIZE/2 - 1))
  
-                        if (nbl_imask0(nbl,found) & (1U << (AMODI(found)*GPU_NSUBCELL + si)))
+                        if (nbl_imask0(nbl,found) & (1U << (AMODCJ4(found)*GPU_NSUBCELL + si)))
                          {
                              w       = (inner_e >> 2);
  
-                            get_nbl_exclusions_1(nbl,A2CI(found),w,&nbl_excl);
+                            get_nbl_exclusions_1(nbl,A2CJ4(found),w,&nbl_excl);
  
-                            nbl_excl->pair[AMODI(inner_e)*nbl->na_ci+inner_i] &=
-                                ~(1U << (AMODI(found)*GPU_NSUBCELL + si));
+                            nbl_excl->pair[AMODWI(inner_e)*nbl->na_ci+inner_i] &=
+                                ~(1U << (AMODCJ4(found)*GPU_NSUBCELL + si));
                          }
  
-#undef AMODI
-#undef A2CI
+#undef AMODCJ4
+#undef A2CJ4
+#undef AMODWI
                      }
                  }
              }
@@ -3571,7 +3578,7 @@ static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
          /* We can only have complete blocks of 4 j-entries in a list,
           * so round the count up before closing.
           */
-        nbl->ncj4         = ((nbl->work->cj_ind + 4-1) >> 2);
+        nbl->ncj4         = ((nbl->work->cj_ind + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
          nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
  
          nbl->nsci++;
@@ -3631,17 +3638,17 @@ static void set_icell_bb_supersub(const float *bb,int ci,
      int ia,m,i;
  
  #ifdef NBNXN_BBXXXX
-    ia = ci*(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX;
-    for(m=0; m<(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
+    ia = ci*(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX;
+    for(m=0; m<(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
      {
-        for(i=0; i<STRIDE_8BB; i++)
+        for(i=0; i<STRIDE_PBB; i++)
          {
-            bb_ci[m+0*STRIDE_8BB+i] = bb[ia+m+0*STRIDE_8BB+i] + shx;
-            bb_ci[m+1*STRIDE_8BB+i] = bb[ia+m+1*STRIDE_8BB+i] + shy;
-            bb_ci[m+2*STRIDE_8BB+i] = bb[ia+m+2*STRIDE_8BB+i] + shz;
-            bb_ci[m+3*STRIDE_8BB+i] = bb[ia+m+3*STRIDE_8BB+i] + shx;
-            bb_ci[m+4*STRIDE_8BB+i] = bb[ia+m+4*STRIDE_8BB+i] + shy;
-            bb_ci[m+5*STRIDE_8BB+i] = bb[ia+m+5*STRIDE_8BB+i] + shz;
+            bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx;
+            bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy;
+            bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz;
+            bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx;
+            bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy;
+            bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
          }
      }
  #else
@@ -3698,7 +3705,7 @@ static void icell_set_x_supersub(int ci,
      }
  }
  
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
  /* Copies PBC shifted super-cell packed atom coordinates to working array */
  static void icell_set_x_supersub_sse8(int ci,
                                        real shx,real shy,real shz,
@@ -3713,15 +3720,15 @@ static void icell_set_x_supersub_sse8(int ci,
  
      for(si=0; si<GPU_NSUBCELL; si++)
      {
-        for(i=0; i<na_c; i+=STRIDE_8BB)
+        for(i=0; i<na_c; i+=STRIDE_PBB)
          {
              io = si*na_c + i;
              ia = ci*GPU_NSUBCELL*na_c + io;
-            for(j=0; j<STRIDE_8BB; j++)
+            for(j=0; j<STRIDE_PBB; j++)
              {
-                x_ci[io*DIM + j + XX*STRIDE_8BB] = x[(ia+j)*stride+XX] + shx;
-                x_ci[io*DIM + j + YY*STRIDE_8BB] = x[(ia+j)*stride+YY] + shy;
-                x_ci[io*DIM + j + ZZ*STRIDE_8BB] = x[(ia+j)*stride+ZZ] + shz;
+                x_ci[io*DIM + j + XX*STRIDE_PBB] = x[(ia+j)*stride+XX] + shx;
+                x_ci[io*DIM + j + YY*STRIDE_PBB] = x[(ia+j)*stride+YY] + shy;
+                x_ci[io*DIM + j + ZZ*STRIDE_PBB] = x[(ia+j)*stride+ZZ] + shz;
              }
          }
      }
@@ -3909,7 +3916,7 @@ static void print_nblist_sci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
  
          for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
          {
-            for(j=0; j<4; j++)
+            for(j=0; j<NBNXN_GPU_JGROUP_SIZE; j++)
              {
                  fprintf(fp,"  sj %5d  imask %x\n",
                          nbl->cj4[j4].cj[j],
@@ -4844,7 +4851,7 @@ void nbnxn_make_pairlist(const nbnxn_search_t nbs,
      }
      else
      {
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
          nbs->icell_set_x = icell_set_x_supersub_sse8;
  #else
          nbs->icell_set_x = icell_set_x_supersub;
author	Berk Hess <hess@kth.se>
	Tue, 8 Jan 2013 10:46:40 +0000 (11:46 +0100)
committer	Gerrit Code Review <gerrit@gerrit.gromacs.org>
	Sat, 12 Jan 2013 21:32:41 +0000 (22:32 +0100)
include/gmx_simd_macros.h		patch \| blob \| history
include/types/nb_verlet.h		patch \| blob \| history
include/types/nbnxn_pairlist.h		patch \| blob \| history
src/mdlib/nbnxn_atomdata.c		patch \| blob \| history
src/mdlib/nbnxn_consts.h		patch \| blob \| history
src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh		patch \| blob \| history
src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh		patch \| blob \| history
src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh		patch \| blob \| history
src/mdlib/nbnxn_internal.h		patch \| blob \| history
src/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c		patch \| blob \| history
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h		patch \| blob \| history
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_inner.h		patch \| blob \| history
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h		patch \| blob \| history
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h		patch \| blob \| history
src/mdlib/nbnxn_search.c		patch \| blob \| history