Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_search.c
index c0f08bd5614733664b035f7cce225a61265f2cac..5b16fc06a7326e90b870e86a9aee0ee89260229b 100644 (file)
 #define BBU_Z  6
 
 
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
+/* We use SSE or AVX-128 for bounding box calculations */
 
 #ifndef GMX_DOUBLE
+/* With single-precision BBs and coordinates, we can also load coordinates using SSE */
 #define NBNXN_SEARCH_SSE_SINGLE
 #endif
 
 /* Include basic SSE2 stuff */
 #include <emmintrin.h>
 
-#if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
-#define NBNXN_8BB_SSE
+#if defined NBNXN_SEARCH_SSE_SINGLE && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
+/* Store bounding boxes with x, y and z coordinates in packs of 4 */
+#define NBNXN_PBB_SSE
 #endif
 
 /* The width of SSE/AVX128 with single precision for bounding boxes with GPU.
  * Here AVX-256 turns out to be slightly slower than AVX-128.
  */
-#define STRIDE_8BB        4
-#define STRIDE_8BB_2LOG   2
+#define STRIDE_PBB        4
+#define STRIDE_PBB_2LOG   2
 
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* NBNXN_SEARCH_BB_SSE */
 
 #ifdef GMX_NBNXN_SIMD
 
 #define NBNXN_INT_MASK_DIAG_J8_1  0x0080c0e0
 
 
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
 /* Store bounding box corners as quadruplets: xxxxyyyyzzzz */
 #define NBNXN_BBXXXX
 /* Size of bounding box corners quadruplet */
-#define NNBSBB_XXXX      (NNBSBB_D*DIM*STRIDE_8BB)
+#define NNBSBB_XXXX      (NNBSBB_D*DIM*STRIDE_PBB)
 #endif
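
The xxxxyyyyzzzz layout stores the lower and upper corners of STRIDE_PBB
bounding boxes component-interleaved. A minimal addressing sketch, assuming
STRIDE_PBB == 4, DIM == 3 and NNBSBB_D == 2 (one lower plus one upper corner
per box, so 24 floats per pack); pbb_corner is a hypothetical helper, not
part of this file:

    static float pbb_corner(const float *bb, int box, int corner, int d)
    {
        int pack = box >> STRIDE_PBB_2LOG;  /* which pack of 4 boxes  */
        int lane = box & (STRIDE_PBB - 1);  /* this box's slot in it  */

        /* lower corners sit at d*STRIDE_PBB, upper at (DIM+d)*STRIDE_PBB */
        return bb[pack*NNBSBB_XXXX + (corner*DIM + d)*STRIDE_PBB + lane];
    }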
 
 /* We shift the i-particles backward for PBC.
@@ -418,6 +421,7 @@ static real grid_atom_density(int n,rvec corner0,rvec corner1)
 
 static int set_grid_size_xy(const nbnxn_search_t nbs,
                             nbnxn_grid_t *grid,
+                            int dd_zone,
                             int n,rvec corner0,rvec corner1,
                             real atom_density,
                             int XFormat)
@@ -464,6 +468,23 @@ static int set_grid_size_xy(const nbnxn_search_t nbs,
         grid->ncy = 1;
     }
 
+    grid->sx = size[XX]/grid->ncx;
+    grid->sy = size[YY]/grid->ncy;
+    grid->inv_sx = 1/grid->sx;
+    grid->inv_sy = 1/grid->sy;
+
+    if (dd_zone > 0)
+    {
+        /* This is a non-home zone; add an extra row of cells
+         * for particles communicated for bonded interactions.
+         * These can be beyond the cut-off. It doesn't matter where
+         * they end up on the grid, but for performance it's better
+         * if they don't end up in cells that can be within cut-off range.
+         */
+        grid->ncx++;
+        grid->ncy++;
+    }
+
     /* We need one additional cell entry for particles moved by DD */
     if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
     {
@@ -497,8 +518,8 @@ static int set_grid_size_xy(const nbnxn_search_t nbs,
         grid->nc_nalloc = over_alloc_large(nc_max);
         srenew(grid->nsubc,grid->nc_nalloc);
         srenew(grid->bbcz,grid->nc_nalloc*NNBSBB_D);
-#ifdef NBNXN_8BB_SSE
-        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX;
+#ifdef NBNXN_PBB_SSE
+        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
 #else
         bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
 #endif
@@ -526,23 +547,39 @@ static int set_grid_size_xy(const nbnxn_search_t nbs,
 
     copy_rvec(corner0,grid->c0);
     copy_rvec(corner1,grid->c1);
-    grid->sx = size[XX]/grid->ncx;
-    grid->sy = size[YY]/grid->ncy;
-    grid->inv_sx = 1/grid->sx;
-    grid->inv_sy = 1/grid->sy;
 
     return nc_max;
 }
 
-#define SORT_GRID_OVERSIZE 2
+/* We need to sort particles in grid columns on z-coordinate.
+ * As particles are very often distributed homogeneously, we use a sorting
+ * algorithm similar to pigeonhole sort. We multiply the z-coordinate
+ * by a factor, cast to an int and try to store in that hole. If the hole
+ * is full, we move this or another particle. A second pass is needed to make
+ * the elements contiguous. SORT_GRID_OVERSIZE is the ratio of holes to particles.
+ * 4 is the optimal value for a homogeneous particle distribution and allows
+ * for an O(#particles) sort up to distributions where all particles are
+ * concentrated in 1/4 of the space. No NlogN fallback is implemented,
+ * as it can be expensive to detect inhomogeneous particle distributions.
+ * SGSF is the maximum ratio of holes used: in the worst case all particles
+ * end up in the last hole and we need #particles extra holes at the end.
+ */
+#define SORT_GRID_OVERSIZE 4
 #define SGSF (SORT_GRID_OVERSIZE + 1)
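
As a concrete sizing check (numbers illustrative): a grid column that can
hold n particles is given SORT_GRID_OVERSIZE*n holes, plus up to n overflow
slots in case everything probes past the last hole, which is exactly the
SGSF factor applied when the sort work array is allocated further down:

    sort_nalloc = n*SGSF;   /* == n*(SORT_GRID_OVERSIZE + 1) == 5*n holes */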
 
+/* Sort particle indices a on coordinates x along dim.
+ * Backwards tells if we want decreasing instead of increasing coordinates.
+ * h0 is the minimum of the coordinate range.
+ * invh is the inverse hole spacing.
+ * nsort, the theoretical hole limit, is only used for debugging.
+ * sort is the sorting work array.
+ */
 static void sort_atoms(int dim,gmx_bool Backwards,
                        int *a,int n,rvec *x,
                        real h0,real invh,int nsort,int *sort)
 {
     int i,c;
-    int zi,zim;
+    int zi,zim,zi_min,zi_max;
     int cp,tmp;
 
     if (n <= 1)
@@ -551,13 +588,10 @@ static void sort_atoms(int dim,gmx_bool Backwards,
         return;
     }
 
-    /* For small oversize factors clearing the whole area is fastest.
-     * For large oversize we should clear the used elements after use.
-     */
-    for(i=0; i<nsort; i++)
-    {
-        sort[i] = -1;
-    }
+    /* Determine the index range used, so we can limit it for the second pass */
+    zi_min = INT_MAX;
+    zi_max = -1;
+
     /* Sort the particles using a simple index sort */
     for(i=0; i<n; i++)
     {
@@ -582,6 +616,8 @@ static void sort_atoms(int dim,gmx_bool Backwards,
         if (sort[zi] < 0)
         {
             sort[zi] = a[i];
+            zi_min = min(zi_min,zi);
+            zi_max = max(zi_max,zi);
         }
         else
         {
@@ -611,8 +647,10 @@ static void sort_atoms(int dim,gmx_bool Backwards,
                     zim++;
                 }
                 sort[zim] = cp;
+                zi_max = max(zi_max,zim);
             }
             sort[zi] = a[i];
+            zi_max = max(zi_max,zi);
         }
     }
 
@@ -624,16 +662,18 @@ static void sort_atoms(int dim,gmx_bool Backwards,
             if (sort[zi] >= 0)
             {
                 a[c++] = sort[zi];
+                sort[zi] = -1;
             }
         }
     }
     else
     {
-        for(zi=nsort-1; zi>=0; zi--)
+        for(zi=zi_max; zi>=zi_min; zi--)
         {
             if (sort[zi] >= 0)
             {
                 a[c++] = sort[zi];
+                sort[zi] = -1;
             }
         }
     }
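
Tying the pieces of this hunk together, a simplified sketch of the whole
pigeonhole pass, assuming sort[] arrives pre-filled with -1 (which
calc_cell_indices() now guarantees) and is large enough (the SGSF factor
above); the real sort_atoms() additionally keeps particles that collide in
one hole ordered by coordinate and supports the Backwards direction:

    static void sort_atoms_sketch(int *a, int n, const real *z,
                                  real h0, real invh, int *sort)
    {
        int i, zi, zi_max = -1, c = 0;

        for (i = 0; i < n; i++)
        {
            zi = (int)((z[a[i]] - h0)*invh); /* hole from z-coordinate */
            while (sort[zi] >= 0)
            {
                zi++;                        /* hole taken: probe on   */
            }
            sort[zi] = a[i];
            if (zi > zi_max)
            {
                zi_max = zi;
            }
        }
        /* Second pass: compact occupied holes back into a[] and reset
         * them to -1, leaving the work array clean for the next call.
         */
        for (zi = 0; zi <= zi_max; zi++)
        {
            if (sort[zi] >= 0)
            {
                a[c++]   = sort[zi];
                sort[zi] = -1;
            }
        }
    }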
@@ -744,7 +784,7 @@ static void calc_bounding_box_x_x8(int na,const real *x,float *bb)
     bb[BBU_Z] = R2F_U(zh);
 }
 
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
 
 /* Packed coordinates, bb order xyz0 */
 static void calc_bounding_box_x_x4_halves(int na,const real *x,
@@ -796,15 +836,15 @@ static void calc_bounding_box_xxxx(int na,int stride,const real *x,float *bb)
         i += stride;
     }
     /* Note: possible double to float conversion here */
-    bb[0*STRIDE_8BB] = R2F_D(xl);
-    bb[1*STRIDE_8BB] = R2F_D(yl);
-    bb[2*STRIDE_8BB] = R2F_D(zl);
-    bb[3*STRIDE_8BB] = R2F_U(xh);
-    bb[4*STRIDE_8BB] = R2F_U(yh);
-    bb[5*STRIDE_8BB] = R2F_U(zh);
+    bb[0*STRIDE_PBB] = R2F_D(xl);
+    bb[1*STRIDE_PBB] = R2F_D(yl);
+    bb[2*STRIDE_PBB] = R2F_D(zl);
+    bb[3*STRIDE_PBB] = R2F_U(xh);
+    bb[4*STRIDE_PBB] = R2F_U(yh);
+    bb[5*STRIDE_PBB] = R2F_U(zh);
 }
 
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* NBNXN_SEARCH_BB_SSE */
 
 #ifdef NBNXN_SEARCH_SSE_SINGLE
 
@@ -837,17 +877,17 @@ static void calc_bounding_box_xxxx_sse(int na,const float *x,
 {
     calc_bounding_box_sse(na,x,bb_work);
 
-    bb[0*STRIDE_8BB] = bb_work[BBL_X];
-    bb[1*STRIDE_8BB] = bb_work[BBL_Y];
-    bb[2*STRIDE_8BB] = bb_work[BBL_Z];
-    bb[3*STRIDE_8BB] = bb_work[BBU_X];
-    bb[4*STRIDE_8BB] = bb_work[BBU_Y];
-    bb[5*STRIDE_8BB] = bb_work[BBU_Z];
+    bb[0*STRIDE_PBB] = bb_work[BBL_X];
+    bb[1*STRIDE_PBB] = bb_work[BBL_Y];
+    bb[2*STRIDE_PBB] = bb_work[BBL_Z];
+    bb[3*STRIDE_PBB] = bb_work[BBU_X];
+    bb[4*STRIDE_PBB] = bb_work[BBU_Y];
+    bb[5*STRIDE_PBB] = bb_work[BBU_Z];
 }
 
 #endif /* NBNXN_SEARCH_SSE_SINGLE */
 
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
 
 /* Combines pairs of consecutive bounding boxes */
 static void combine_bounding_box_pairs(nbnxn_grid_t *grid,const float *bb)
@@ -926,18 +966,18 @@ static void print_bbsizes_supersub(FILE *fp,
     for(c=0; c<grid->nc; c++)
     {
 #ifdef NBNXN_BBXXXX
-        for(s=0; s<grid->nsubc[c]; s+=STRIDE_8BB)
+        for(s=0; s<grid->nsubc[c]; s+=STRIDE_PBB)
         {
             int cs_w,i,d;
 
-            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_8BB;
-            for(i=0; i<STRIDE_8BB; i++)
+            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
+            for(i=0; i<STRIDE_PBB; i++)
             {
                 for(d=0; d<DIM; d++)
                 {
                     ba[d] +=
-                        grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_8BB+i] -
-                        grid->bb[cs_w*NNBSBB_XXXX+     d *STRIDE_8BB+i];
+                        grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
+                        grid->bb[cs_w*NNBSBB_XXXX+     d *STRIDE_PBB+i];
                 }
             }
         }
@@ -1081,7 +1121,7 @@ void fill_cell(const nbnxn_search_t nbs,
         offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
         bb_ptr = grid->bb + offset;
 
-#if defined GMX_DOUBLE && defined NBNXN_SEARCH_SSE
+#if defined GMX_DOUBLE && defined NBNXN_SEARCH_BB_SSE
         if (2*grid->na_cj == grid->na_c)
         {
             calc_bounding_box_x_x4_halves(na,nbat->x+X4_IND_A(a0),bb_ptr,
@@ -1109,8 +1149,8 @@ void fill_cell(const nbnxn_search_t nbs,
                              */
         bb_ptr =
             grid->bb +
-            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_8BB_2LOG))*NNBSBB_XXXX +
-            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_8BB-1));
+            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
+            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
 
 #ifdef NBNXN_SEARCH_SSE_SINGLE
         if (nbat->XFormat == nbatXYZQ)
@@ -1128,9 +1168,9 @@ void fill_cell(const nbnxn_search_t nbs,
         {
             fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                     sx,sy,sz,
-                    bb_ptr[0*STRIDE_8BB],bb_ptr[3*STRIDE_8BB],
-                    bb_ptr[1*STRIDE_8BB],bb_ptr[4*STRIDE_8BB],
-                    bb_ptr[2*STRIDE_8BB],bb_ptr[5*STRIDE_8BB]);
+                    bb_ptr[0*STRIDE_PBB],bb_ptr[3*STRIDE_PBB],
+                    bb_ptr[1*STRIDE_PBB],bb_ptr[4*STRIDE_PBB],
+                    bb_ptr[2*STRIDE_PBB],bb_ptr[5*STRIDE_PBB]);
         }
     }
 #endif
@@ -1353,7 +1393,8 @@ static void sort_columns_supersub(const nbnxn_search_t nbs,
 /* Determine in which grid column atoms should go */
 static void calc_column_indices(nbnxn_grid_t *grid,
                                 int a0,int a1,
-                                rvec *x,const int *move,
+                                rvec *x,
+                                int dd_zone,const int *move,
                                 int thread,int nthread,
                                 int *cell,
                                 int *cxy_na)
@@ -1369,50 +1410,78 @@ static void calc_column_indices(nbnxn_grid_t *grid,
 
     n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
     n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
-    for(i=n0; i<n1; i++)
+    if (dd_zone == 0)
     {
-        if (move == NULL || move[i] >= 0)
+        /* Home zone */
+        for(i=n0; i<n1; i++)
         {
-            /* We need to be careful with rounding,
-             * particles might be a few bits outside the local box.
-             * The int cast takes care of the lower bound,
-             * we need to explicitly take care of the upper bound.
-             */
-            cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
-            if (cx == grid->ncx)
-            {
-                cx = grid->ncx - 1;
-            }
-            cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
-            if (cy == grid->ncy)
+            if (move == NULL || move[i] >= 0)
             {
-                cy = grid->ncy - 1;
-            }
-            /* For the moment cell contains only the, grid local,
-             * x and y indices, not z.
-             */
-            cell[i] = cx*grid->ncy + cy;
+                /* We need to be careful with rounding:
+                 * particles might be a few bits outside the local zone.
+                 * The int cast takes care of the lower bound;
+                 * we explicitly take care of the upper bound below.
+                 */
+                cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
+                cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
 
 #ifdef DEBUG_NBNXN_GRIDDING
-            if (cell[i] < 0 || cell[i] >= grid->ncx*grid->ncy)
+                if (cx < 0 || cx >= grid->ncx ||
+                    cy < 0 || cy >= grid->ncy)
+                {
+                    gmx_fatal(FARGS,
+                              "grid cell cx %d cy %d out of range (max %d %d)\n"
+                              "atom %f %f %f, grid->c0 %f %f",
+                              cx,cy,grid->ncx,grid->ncy,
+                              x[i][XX],x[i][YY],x[i][ZZ],grid->c0[XX],grid->c0[YY]);
+                }
+#endif
+                /* Take care of potential rounding issues */
+                cx = min(cx,grid->ncx - 1);
+                cy = min(cy,grid->ncy - 1);
+
+                /* For the moment cell will contain only the grid-local
+                 * x and y indices, not z.
+                 */
+                cell[i] = cx*grid->ncy + cy;
+            }
+            else
             {
-                gmx_fatal(FARGS,
-                          "grid cell cx %d cy %d out of range (max %d %d)\n"
-                          "atom %f %f %f, grid->c0 %f %f",
-                          cx,cy,grid->ncx,grid->ncy,
-                          x[i][XX],x[i][YY],x[i][ZZ],grid->c0[XX],grid->c0[YY]);
+                /* Put this moved particle after the end of the grid,
+                 * so we can process it later without using conditionals.
+                 */
+                cell[i] = grid->ncx*grid->ncy;
             }
-#endif
+
+            cxy_na[cell[i]]++;
         }
-        else
+    }
+    else
+    {
+        /* Non-home zone */
+        for(i=n0; i<n1; i++)
         {
-            /* Put this moved particle after the end of the grid,
-             * so we can process it later without using conditionals.
+            cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
+            cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
+
+            /* For non-home zones there could be particles outside
+             * the non-bonded cut-off range, which have been communicated
+             * for bonded interactions only. For the result it doesn't
+             * matter where these end up on the grid. For performance
+             * we put them in an extra row at the border.
              */
-            cell[i] = grid->ncx*grid->ncy;
-        }
+            cx = max(cx,0);
+            cx = min(cx,grid->ncx - 1);
+            cy = max(cy,0);
+            cy = min(cy,grid->ncy - 1);
 
-        cxy_na[cell[i]]++;
+            /* For the moment cell will contain only the grid-local
+             * x and y indices, not z.
+             */
+            cell[i] = cx*grid->ncy + cy;
+
+            cxy_na[cell[i]]++;
+        }
     }
 }
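
Why the upper-bound clamp matters, with illustrative numbers: take
grid->c0[XX] == 0, grid->inv_sx == 1.0 and grid->ncx == 10; an atom sitting
exactly on (or a few bits beyond) the upper edge x == 10.0 gives

    cx = (int)((10.0 - 0.0)*1.0);   /* cx == 10 == ncx, one past the end */
    cx = min(cx, 10 - 1);           /* clamped into the last column, 9   */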
 
@@ -1436,7 +1505,7 @@ static void calc_cell_indices(const nbnxn_search_t nbs,
 #pragma omp parallel for num_threads(nthread) schedule(static)
     for(thread=0; thread<nthread; thread++)
     {
-        calc_column_indices(grid,a0,a1,x,move,thread,nthread,
+        calc_column_indices(grid,a0,a1,x,dd_zone,move,thread,nthread,
                             nbs->cell,nbs->work[thread].cxy_na);
     }
 
@@ -1503,6 +1572,11 @@ static void calc_cell_indices(const nbnxn_search_t nbs,
                 over_alloc_large(ncz_max*grid->na_sc*SGSF);
             srenew(nbs->work[thread].sort_work,
                    nbs->work[thread].sort_work_nalloc);
+            /* When not in use, all elements should be -1 */
+            for(i=0; i<nbs->work[thread].sort_work_nalloc; i++)
+            {
+                nbs->work[thread].sort_work[i] = -1;
+            }
         }
     }
 
@@ -1516,12 +1590,18 @@ static void calc_cell_indices(const nbnxn_search_t nbs,
         nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
     }
 
-    /* Set the cell indices for the moved particles */
-    n0 = grid->nc*grid->na_sc;
-    n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
-    for(i=n0; i<n1; i++)
+    if (dd_zone == 0)
     {
-        nbs->cell[nbs->a[i]] = i;
+        /* Set the cell indices for the moved particles */
+        n0 = grid->nc*grid->na_sc;
+        n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
+        for(i=n0; i<n1; i++)
+        {
+            nbs->cell[nbs->a[i]] = i;
+        }
     }
 
     /* Sort the super-cell columns along z into the sub-cells. */
@@ -1544,7 +1624,7 @@ static void calc_cell_indices(const nbnxn_search_t nbs,
         }
     }
 
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
     if (grid->bSimple && nbat->XFormat == nbatX8)
     {
         combine_bounding_box_pairs(grid,grid->bb);
@@ -1666,7 +1746,8 @@ void nbnxn_put_on_grid(nbnxn_search_t nbs,
         nbs->natoms_nonlocal = max(nbs->natoms_nonlocal,a1);
     }
 
-    nc_max_grid = set_grid_size_xy(nbs,grid,n-nmoved,corner0,corner1,
+    nc_max_grid = set_grid_size_xy(nbs,grid,
+                                   dd_zone,n-nmoved,corner0,corner1,
                                    nbs->grid[0].atom_density,
                                    nbat->XFormat);
 
@@ -1817,7 +1898,7 @@ void nbnxn_grid_add_simple(nbnxn_search_t nbs,
         }
     }
 
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
     if (grid->bSimple && nbat->XFormat == nbatX8)
     {
         combine_bounding_box_pairs(grid,grid->bb_simple);
@@ -1955,7 +2036,7 @@ static float subc_bb_dist2(int si,const float *bb_i_ci,
     return d2;
 }
 
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
 
 /* SSE code for bb distance for bb format xyz0 */
 static float subc_bb_dist2_sse(int na_c,
@@ -2024,12 +2105,12 @@ static float subc_bb_dist2_sse(int na_c,
                                                  \
     shi = si*NNBSBB_D*DIM;                       \
                                                  \
-    xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_8BB);   \
-    yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_8BB);   \
-    zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_8BB);   \
-    xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_8BB);   \
-    yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_8BB);   \
-    zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_8BB);   \
+    xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_PBB);   \
+    yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_PBB);   \
+    zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_PBB);   \
+    xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_PBB);   \
+    yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_PBB);   \
+    zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_PBB);   \
                                                  \
     dx_0 = _mm_sub_ps(xi_l,xj_h);                \
     dy_0 = _mm_sub_ps(yi_l,yj_h);                \
@@ -2071,24 +2152,24 @@ static void subc_bb_dist2_sse_xxxx(const float *bb_j,
 
     zero = _mm_setzero_ps();
 
-    xj_l = _mm_set1_ps(bb_j[0*STRIDE_8BB]);
-    yj_l = _mm_set1_ps(bb_j[1*STRIDE_8BB]);
-    zj_l = _mm_set1_ps(bb_j[2*STRIDE_8BB]);
-    xj_h = _mm_set1_ps(bb_j[3*STRIDE_8BB]);
-    yj_h = _mm_set1_ps(bb_j[4*STRIDE_8BB]);
-    zj_h = _mm_set1_ps(bb_j[5*STRIDE_8BB]);
+    xj_l = _mm_set1_ps(bb_j[0*STRIDE_PBB]);
+    yj_l = _mm_set1_ps(bb_j[1*STRIDE_PBB]);
+    zj_l = _mm_set1_ps(bb_j[2*STRIDE_PBB]);
+    xj_h = _mm_set1_ps(bb_j[3*STRIDE_PBB]);
+    yj_h = _mm_set1_ps(bb_j[4*STRIDE_PBB]);
+    zj_h = _mm_set1_ps(bb_j[5*STRIDE_PBB]);
 
-    /* Here we "loop" over si (0,STRIDE_8BB) from 0 to nsi with step STRIDE_8BB.
+    /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
      * But as we know the number of iterations is 1 or 2, we unroll manually.
      */
     SUBC_BB_DIST2_SSE_XXXX_INNER(0,bb_i,d2);
-    if (STRIDE_8BB < nsi)
+    if (STRIDE_PBB < nsi)
     {
-        SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_8BB,bb_i,d2);
+        SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_PBB,bb_i,d2);
     }
 }
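
For reference, the scalar equivalent of what the macro computes for each box
pair (variable names illustrative: l/h denote the lower/upper corners): the
squared distance between two axis-aligned boxes, where any dimension in
which the boxes overlap contributes zero:

    dx = max(0, max(bb_i_xl - bb_j_xh, bb_j_xl - bb_i_xh));
    dy = max(0, max(bb_i_yl - bb_j_yh, bb_j_yl - bb_i_yh));
    dz = max(0, max(bb_i_zl - bb_j_zh, bb_j_zl - bb_i_zh));
    d2 = dx*dx + dy*dy + dz*dz;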
 
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* NBNXN_SEARCH_BB_SSE */
 
 /* Plain C function which determines if any atom pair between two cells
  * is within distance sqrt(rl2).
@@ -2141,13 +2222,13 @@ static gmx_bool subc_in_range_sse8(int na_c,
 
     rc2_SSE   = _mm_set1_ps(rl2);
 
-    na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_8BB;
-    ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_8BB);
-    iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_8BB);
-    iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_8BB);
-    ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_8BB);
-    iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_8BB);
-    iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_8BB);
+    na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB;
+    ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_PBB);
+    iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_PBB);
+    iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_PBB);
+    ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_PBB);
+    iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_PBB);
+    iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_PBB);
 
     /* We loop from the outer to the inner particles to maximize
      * the chance that we find a pair in range quickly and return.
@@ -2233,13 +2314,13 @@ static gmx_bool subc_in_range_sse8(int na_c,
 /* Returns the j sub-cell for index cj_ind */
 static int nbl_cj(const nbnxn_pairlist_t *nbl,int cj_ind)
 {
-    return nbl->cj4[cj_ind>>2].cj[cj_ind & 3];
+    return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].cj[cj_ind & (NBNXN_GPU_JGROUP_SIZE - 1)];
 }
 
 /* Returns the i-interaction mask of the j sub-cell for index cj_ind */
 static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl,int cj_ind)
 {
-    return nbl->cj4[cj_ind>>2].imei[0].imask;
+    return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].imei[0].imask;
 }
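
Both accessors decompose the linear j-cluster index into a cj4-group number
and a slot within the group; assuming NBNXN_GPU_JGROUP_SIZE == 4 (so
NBNXN_GPU_JGROUP_SIZE_2LOG == 2), for example:

    int cj_ind = 11;
    int group  = cj_ind >> 2;        /* 11/4 == 2: the third cj4 struct  */
    int slot   = cj_ind & (4 - 1);   /* 11%4 == 3: last cj in that group */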
 
 /* Ensures there is enough space for extra exclusion masks */
@@ -2286,7 +2367,7 @@ static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
     /* We can store 4 j-subcell - i-supercell pairs in one struct.
      * Since we round down, we need one extra entry.
      */
-    ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + 4-1) >> 2);
+    ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
 
     if (ncj4_max > nbl->cj4_nalloc)
     {
@@ -2376,16 +2457,16 @@ static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
 
     snew(nbl->work,1);
 #ifdef NBNXN_BBXXXX
-    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,32);
+    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX,NBNXN_MEM_ALIGN);
 #else
-    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,32);
+    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,NBNXN_MEM_ALIGN);
 #endif
-    snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,32);
+    snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,NBNXN_MEM_ALIGN);
 #ifdef GMX_NBNXN_SIMD
-    snew_aligned(nbl->work->x_ci_simd_4xn,1,32);
-    snew_aligned(nbl->work->x_ci_simd_2xnn,1,32);
+    snew_aligned(nbl->work->x_ci_simd_4xn,1,NBNXN_MEM_ALIGN);
+    snew_aligned(nbl->work->x_ci_simd_2xnn,1,NBNXN_MEM_ALIGN);
 #endif
-    snew_aligned(nbl->work->d2,GPU_NSUBCELL,32);
+    snew_aligned(nbl->work->d2,GPU_NSUBCELL,NBNXN_MEM_ALIGN);
 }
 
 void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
@@ -2501,7 +2582,7 @@ static void print_nblist_statistics_supersub(FILE *fp,const nbnxn_pairlist_t *nb
     fprintf(fp,"nbl average j super cell list length %.1f\n",
             0.25*nbl->ncj4/(double)nbl->nsci);
     fprintf(fp,"nbl average i sub cell list length %.1f\n",
-            nbl->nci_tot/(0.25*nbl->ncj4));
+            nbl->nci_tot/((double)nbl->ncj4));
 
     for(si=0; si<=GPU_NSUBCELL; si++)
     {
@@ -2511,7 +2592,7 @@ static void print_nblist_statistics_supersub(FILE *fp,const nbnxn_pairlist_t *nb
     {
         for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
         {
-            for(j=0; j<4; j++)
+            for(j=0; j<NBNXN_GPU_JGROUP_SIZE; j++)
             {
                 b = 0;
                 for(si=0; si<GPU_NSUBCELL; si++)
@@ -2628,8 +2709,8 @@ static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
         w = (ej>>2);
         for(ei=ej; ei<nbl->na_ci; ei++)
         {
-            excl[w]->pair[(ej&(4-1))*nbl->na_ci+ei] &=
-                ~(1U << (sj_offset*GPU_NSUBCELL+si));
+            excl[w]->pair[(ej & (NBNXN_GPU_JGROUP_SIZE-1))*nbl->na_ci + ei] &=
+                ~(1U << (sj_offset*GPU_NSUBCELL + si));
         }
     }
 }
@@ -2840,7 +2921,7 @@ static void make_cluster_list_supersub(const nbnxn_search_t nbs,
 
     for(cjo=0; cjo<gridj->nsubc[scj]; cjo++)
     {
-        cj4_ind   = (nbl->work->cj_ind >> 2);
+        cj4_ind   = (nbl->work->cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG);
         cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
         cj4       = &nbl->cj4[cj4_ind];
 
@@ -2863,7 +2944,7 @@ static void make_cluster_list_supersub(const nbnxn_search_t nbs,
 
 #ifdef NBNXN_BBXXXX
         /* Determine all ci1 bb distances in one call with SSE */
-        subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_8BB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_8BB-1)),
+        subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
                                ci1,bb_ci,d2l);
         *ndistc += na_c*2;
 #endif
@@ -2919,7 +3000,7 @@ static void make_cluster_list_supersub(const nbnxn_search_t nbs,
         {
             /* Avoid using function pointers here, as it's slower */
             if (
-#ifdef NBNXN_8BB_SSE
+#ifdef NBNXN_PBB_SSE
                 !subc_in_range_sse8
 #else
                 !subc_in_range_x
@@ -2957,7 +3038,8 @@ static void make_cluster_list_supersub(const nbnxn_search_t nbs,
             nbl->nci_tot += npair;
 
             /* Increase the closing index in i super-cell list */
-            nbl->sci[nbl->nsci].cj4_ind_end = ((nbl->work->cj_ind+4-1)>>2);
+            nbl->sci[nbl->nsci].cj4_ind_end =
+                ((nbl->work->cj_ind+NBNXN_GPU_JGROUP_SIZE-1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
         }
     }
 }
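
The closing index rounds up to whole cj4 groups: a ceiling division by a
power of two written as add-and-shift. With 4-wide groups and, say, 9
j-entries:

    n_cj4 = (9 + 4 - 1) >> 2;   /* == 12 >> 2 == 3 groups, i.e. ceil(9/4) */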
@@ -3015,7 +3097,7 @@ static void set_ci_top_excls(const nbnxn_search_t nbs,
             ndirect++;
         }
     }
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
     else
     {
         while (cj_ind_first + ndirect <= cj_ind_last &&
@@ -3229,22 +3311,25 @@ static void set_sci_top_excls(const nbnxn_search_t nbs,
                         inner_e = ge - se*na_c;
 
 /* Macro for getting the index of atom a within a cluster */
-#define AMODI(a)  ((a) & (NBNXN_CPU_CLUSTER_I_SIZE - 1))
+#define AMODCJ4(a)  ((a) & (NBNXN_GPU_JGROUP_SIZE - 1))
 /* Macro for converting an atom number to a cluster number */
-#define A2CI(a)   ((a) >> NBNXN_CPU_CLUSTER_I_SIZE_2LOG)
+#define A2CJ4(a)    ((a) >> NBNXN_GPU_JGROUP_SIZE_2LOG)
+/* Macro for getting the index of an i-atom within a warp */
+#define AMODWI(a)   ((a) & (NBNXN_GPU_CLUSTER_SIZE/2 - 1))
 
-                        if (nbl_imask0(nbl,found) & (1U << (AMODI(found)*GPU_NSUBCELL + si)))
+                        if (nbl_imask0(nbl,found) & (1U << (AMODCJ4(found)*GPU_NSUBCELL + si)))
                         {
                             w       = (inner_e >> 2);
 
-                            get_nbl_exclusions_1(nbl,A2CI(found),w,&nbl_excl);
+                            get_nbl_exclusions_1(nbl,A2CJ4(found),w,&nbl_excl);
 
-                            nbl_excl->pair[AMODI(inner_e)*nbl->na_ci+inner_i] &=
-                                ~(1U << (AMODI(found)*GPU_NSUBCELL + si));
+                            nbl_excl->pair[AMODWI(inner_e)*nbl->na_ci+inner_i] &=
+                                ~(1U << (AMODCJ4(found)*GPU_NSUBCELL + si));
                         }
 
-#undef AMODI
-#undef A2CI
+#undef AMODCJ4
+#undef A2CJ4
+#undef AMODWI
                     }
                 }
             }
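
A worked example of the three renamed macros, assuming
NBNXN_GPU_JGROUP_SIZE == 4 and NBNXN_GPU_CLUSTER_SIZE == 8 (so an exclusion
warp covers half a cluster); the values are purely illustrative:

    int found   = 13;                  /* list index of the excluded cj   */
    int cj4     = found >> 2;          /* A2CJ4:   entry lives in cj4 3   */
    int slot    = found & (4 - 1);     /* AMODCJ4: slot 1 within that cj4 */
    int inner_e = 6;                   /* j-atom index within its cluster */
    int row     = inner_e & (8/2 - 1); /* AMODWI:  exclusion row 2        */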
@@ -3356,13 +3441,17 @@ static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
     {
         sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start,jlen,nbl->work);
 
-        if (nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0))
+        /* The counts below are used for non-bonded pair/flop counts
+         * and should therefore match the available kernel setups.
+         */
+        if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
         {
-            nbl->work->ncj_hlj += jlen;
+            nbl->work->ncj_noq += jlen;
         }
-        else if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
+        else if ((nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0)) ||
+                 !(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_LJ(0)))
         {
-            nbl->work->ncj_noq += jlen;
+            nbl->work->ncj_hlj += jlen;
         }
 
         nbl->nci++;
@@ -3483,7 +3572,7 @@ static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
         /* We can only have complete blocks of 4 j-entries in a list,
          * so round the count up before closing.
          */
-        nbl->ncj4         = ((nbl->work->cj_ind + 4-1) >> 2);
+        nbl->ncj4         = ((nbl->work->cj_ind + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
         nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
 
         nbl->nsci++;
@@ -3543,17 +3632,17 @@ static void set_icell_bb_supersub(const float *bb,int ci,
     int ia,m,i;
 
 #ifdef NBNXN_BBXXXX
-    ia = ci*(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX;
-    for(m=0; m<(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
+    ia = ci*(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX;
+    for(m=0; m<(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
     {
-        for(i=0; i<STRIDE_8BB; i++)
+        for(i=0; i<STRIDE_PBB; i++)
         {
-            bb_ci[m+0*STRIDE_8BB+i] = bb[ia+m+0*STRIDE_8BB+i] + shx;
-            bb_ci[m+1*STRIDE_8BB+i] = bb[ia+m+1*STRIDE_8BB+i] + shy;
-            bb_ci[m+2*STRIDE_8BB+i] = bb[ia+m+2*STRIDE_8BB+i] + shz;
-            bb_ci[m+3*STRIDE_8BB+i] = bb[ia+m+3*STRIDE_8BB+i] + shx;
-            bb_ci[m+4*STRIDE_8BB+i] = bb[ia+m+4*STRIDE_8BB+i] + shy;
-            bb_ci[m+5*STRIDE_8BB+i] = bb[ia+m+5*STRIDE_8BB+i] + shz;
+            bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx;
+            bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy;
+            bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz;
+            bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx;
+            bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy;
+            bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
         }
     }
 #else
@@ -3610,7 +3699,7 @@ static void icell_set_x_supersub(int ci,
     }
 }
 
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
 /* Copies PBC shifted super-cell packed atom coordinates to working array */
 static void icell_set_x_supersub_sse8(int ci,
                                       real shx,real shy,real shz,
@@ -3625,15 +3714,15 @@ static void icell_set_x_supersub_sse8(int ci,
 
     for(si=0; si<GPU_NSUBCELL; si++)
     {
-        for(i=0; i<na_c; i+=STRIDE_8BB)
+        for(i=0; i<na_c; i+=STRIDE_PBB)
         {
             io = si*na_c + i;
             ia = ci*GPU_NSUBCELL*na_c + io;
-            for(j=0; j<STRIDE_8BB; j++)
+            for(j=0; j<STRIDE_PBB; j++)
             {
-                x_ci[io*DIM + j + XX*STRIDE_8BB] = x[(ia+j)*stride+XX] + shx;
-                x_ci[io*DIM + j + YY*STRIDE_8BB] = x[(ia+j)*stride+YY] + shy;
-                x_ci[io*DIM + j + ZZ*STRIDE_8BB] = x[(ia+j)*stride+ZZ] + shz;
+                x_ci[io*DIM + j + XX*STRIDE_PBB] = x[(ia+j)*stride+XX] + shx;
+                x_ci[io*DIM + j + YY*STRIDE_PBB] = x[(ia+j)*stride+YY] + shy;
+                x_ci[io*DIM + j + ZZ*STRIDE_PBB] = x[(ia+j)*stride+ZZ] + shz;
             }
         }
     }
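
The packed layout written by this loop stores each group of STRIDE_PBB atoms
as xxxx yyyy zzzz. A sketch of reading component d of one atom back out,
assuming STRIDE_PBB == 4 and DIM == 3 (packed_coord is a hypothetical
helper, not part of this file):

    static float packed_coord(const float *x_ci, int atom, int d)
    {
        int pack = atom & ~(4 - 1);   /* first atom of this 4-atom pack */
        int lane = atom &  (4 - 1);   /* this atom's slot in the pack   */

        return x_ci[pack*3 + d*4 + lane];  /* io*DIM + d*STRIDE_PBB + j */
    }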
@@ -3821,7 +3910,7 @@ static void print_nblist_sci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
 
         for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
         {
-            for(j=0; j<4; j++)
+            for(j=0; j<NBNXN_GPU_JGROUP_SIZE; j++)
             {
                 fprintf(fp,"  sj %5d  imask %x\n",
                         nbl->cj4[j4].cj[j],
@@ -4756,7 +4845,7 @@ void nbnxn_make_pairlist(const nbnxn_search_t nbs,
     }
     else
     {
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
         nbs->icell_set_x = icell_set_x_supersub_sse8;
 #else
         nbs->icell_set_x = icell_set_x_supersub;