#define BBU_Z 6
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
+/* We use SSE or AVX-128 for bounding box calculations */
#ifndef GMX_DOUBLE
+/* With single precision BBs + coordinates, we can also load coordinates with SSE */
#define NBNXN_SEARCH_SSE_SINGLE
#endif
/* Include basic SSE2 stuff */
#include <emmintrin.h>
-#if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
-#define NBNXN_8BB_SSE
+#if defined NBNXN_SEARCH_SSE_SINGLE && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
+/* Store bounding boxes with x, y and z coordinates in packs of 4 */
+#define NBNXN_PBB_SSE
#endif
/* The width of SSE/AVX-128 in single precision for bounding boxes
 * used with the GPU (super-cell) layout.
 * Here AVX-256 turns out to be slightly slower than AVX-128.
 */
-#define STRIDE_8BB 4
-#define STRIDE_8BB_2LOG 2
+#define STRIDE_PBB 4
+#define STRIDE_PBB_2LOG 2
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* NBNXN_SEARCH_BB_SSE */
#ifdef GMX_NBNXN_SIMD
#define NBNXN_INT_MASK_DIAG_J8_1 0x0080c0e0
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
/* Store bounding box corners as quadruplets: xxxxyyyyzzzz */
#define NBNXN_BBXXXX
/* Size of a quadruplet of bounding box corners */
-#define NNBSBB_XXXX (NNBSBB_D*DIM*STRIDE_8BB)
+#define NNBSBB_XXXX (NNBSBB_D*DIM*STRIDE_PBB)
#endif
/* We shift the i-particles backward for PBC.
static int set_grid_size_xy(const nbnxn_search_t nbs,
nbnxn_grid_t *grid,
+ int dd_zone,
int n,rvec corner0,rvec corner1,
real atom_density,
int XFormat)
grid->ncy = 1;
}
+ grid->sx = size[XX]/grid->ncx;
+ grid->sy = size[YY]/grid->ncy;
+ grid->inv_sx = 1/grid->sx;
+ grid->inv_sy = 1/grid->sy;
+
+ if (dd_zone > 0)
+ {
+ /* This is a non-home zone; add an extra row of cells
+ * for particles communicated for bonded interactions.
+ * These can be beyond the cut-off. It doesn't matter where
+ * they end up on the grid, but for performance it's better
+ * if they don't end up in cells that can be within cut-off range.
+ */
+ grid->ncx++;
+ grid->ncy++;
+ }
+
/* We need one additional cell entry for particles moved by DD */
if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
{
grid->nc_nalloc = over_alloc_large(nc_max);
srenew(grid->nsubc,grid->nc_nalloc);
srenew(grid->bbcz,grid->nc_nalloc*NNBSBB_D);
-#ifdef NBNXN_8BB_SSE
- bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX;
+#ifdef NBNXN_PBB_SSE
+ bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
#else
bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
#endif
copy_rvec(corner0,grid->c0);
copy_rvec(corner1,grid->c1);
- grid->sx = size[XX]/grid->ncx;
- grid->sy = size[YY]/grid->ncy;
- grid->inv_sx = 1/grid->sx;
- grid->inv_sy = 1/grid->sy;
return nc_max;
}
-#define SORT_GRID_OVERSIZE 2
+/* We need to sort particles in grid columns on z-coordinate.
+ * As particles are very often distributed homogeneously, we use a sorting
+ * algorithm similar to pigeonhole sort. We multiply the z-coordinate
+ * by a factor, cast to an int and try to store in that hole. If the hole
+ * is full, we move this or another particle. A second pass is needed to make
+ * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
+ * 4 is the optimal value for homogeneous particle distributions and allows
+ * for an O(#particles) sort up to distributions where all particles
+ * are concentrated in 1/4 of the space. No NlogN fallback is implemented,
+ * as it can be expensive to detect inhomogeneous particle distributions.
+ * SGSF is the maximum ratio of holes used: in the worst case all particles
+ * end up in the last hole and we need #particles extra holes at the end.
+ */
+#define SORT_GRID_OVERSIZE 4
#define SGSF (SORT_GRID_OVERSIZE + 1)
+/* Sort particle index a on coordinates x along dim.
+ * Backwards tells if we want decreasing instead of increasing coordinates.
+ * h0 is the minimum of the coordinate range.
+ * invh is the inverse hole spacing.
+ * nsort, the theoretical hole limit, is only used for debugging.
+ * sort is the sorting work array.
+ */
static void sort_atoms(int dim,gmx_bool Backwards,
int *a,int n,rvec *x,
real h0,real invh,int nsort,int *sort)
{
int i,c;
- int zi,zim;
+ int zi,zim,zi_min,zi_max;
int cp,tmp;
if (n <= 1)
return;
}
- /* For small oversize factors clearing the whole area is fastest.
- * For large oversize we should clear the used elements after use.
- */
- for(i=0; i<nsort; i++)
- {
- sort[i] = -1;
- }
+ /* Determine the index range used, so we can limit it for the second pass */
+ zi_min = INT_MAX;
+ zi_max = -1;
+
/* Sort the particles using a simple index sort */
for(i=0; i<n; i++)
{
if (sort[zi] < 0)
{
sort[zi] = a[i];
+ zi_min = min(zi_min,zi);
+ zi_max = max(zi_max,zi);
}
else
{
zim++;
}
sort[zim] = cp;
+ zi_max = max(zi_max,zim);
}
sort[zi] = a[i];
+ zi_max = max(zi_max,zi);
}
}
if (sort[zi] >= 0)
{
a[c++] = sort[zi];
+ sort[zi] = -1;
}
}
}
else
{
- for(zi=nsort-1; zi>=0; zi--)
+ for(zi=zi_max; zi>=zi_min; zi--)
{
if (sort[zi] >= 0)
{
a[c++] = sort[zi];
+ sort[zi] = -1;
}
}
}
bb[BBU_Z] = R2F_U(zh);
}
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4_halves(int na,const real *x,
i += stride;
}
/* Note: possible double to float conversion here */
- bb[0*STRIDE_8BB] = R2F_D(xl);
- bb[1*STRIDE_8BB] = R2F_D(yl);
- bb[2*STRIDE_8BB] = R2F_D(zl);
- bb[3*STRIDE_8BB] = R2F_U(xh);
- bb[4*STRIDE_8BB] = R2F_U(yh);
- bb[5*STRIDE_8BB] = R2F_U(zh);
+ bb[0*STRIDE_PBB] = R2F_D(xl);
+ bb[1*STRIDE_PBB] = R2F_D(yl);
+ bb[2*STRIDE_PBB] = R2F_D(zl);
+ bb[3*STRIDE_PBB] = R2F_U(xh);
+ bb[4*STRIDE_PBB] = R2F_U(yh);
+ bb[5*STRIDE_PBB] = R2F_U(zh);
}
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* NBNXN_SEARCH_BB_SSE */
#ifdef NBNXN_SEARCH_SSE_SINGLE
{
calc_bounding_box_sse(na,x,bb_work);
- bb[0*STRIDE_8BB] = bb_work[BBL_X];
- bb[1*STRIDE_8BB] = bb_work[BBL_Y];
- bb[2*STRIDE_8BB] = bb_work[BBL_Z];
- bb[3*STRIDE_8BB] = bb_work[BBU_X];
- bb[4*STRIDE_8BB] = bb_work[BBU_Y];
- bb[5*STRIDE_8BB] = bb_work[BBU_Z];
+ bb[0*STRIDE_PBB] = bb_work[BBL_X];
+ bb[1*STRIDE_PBB] = bb_work[BBL_Y];
+ bb[2*STRIDE_PBB] = bb_work[BBL_Z];
+ bb[3*STRIDE_PBB] = bb_work[BBU_X];
+ bb[4*STRIDE_PBB] = bb_work[BBU_Y];
+ bb[5*STRIDE_PBB] = bb_work[BBU_Z];
}
#endif /* NBNXN_SEARCH_SSE_SINGLE */
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
/* Combines pairs of consecutive bounding boxes */
static void combine_bounding_box_pairs(nbnxn_grid_t *grid,const float *bb)
for(c=0; c<grid->nc; c++)
{
#ifdef NBNXN_BBXXXX
- for(s=0; s<grid->nsubc[c]; s+=STRIDE_8BB)
+ for(s=0; s<grid->nsubc[c]; s+=STRIDE_PBB)
{
int cs_w,i,d;
- cs_w = (c*GPU_NSUBCELL + s)/STRIDE_8BB;
- for(i=0; i<STRIDE_8BB; i++)
+ cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
+ for(i=0; i<STRIDE_PBB; i++)
{
for(d=0; d<DIM; d++)
{
ba[d] +=
- grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_8BB+i] -
- grid->bb[cs_w*NNBSBB_XXXX+ d *STRIDE_8BB+i];
+ grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
+ grid->bb[cs_w*NNBSBB_XXXX+ d *STRIDE_PBB+i];
}
}
}
offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
bb_ptr = grid->bb + offset;
-#if defined GMX_DOUBLE && defined NBNXN_SEARCH_SSE
+#if defined GMX_DOUBLE && defined NBNXN_SEARCH_BB_SSE
if (2*grid->na_cj == grid->na_c)
{
calc_bounding_box_x_x4_halves(na,nbat->x+X4_IND_A(a0),bb_ptr,
*/
bb_ptr =
grid->bb +
- ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_8BB_2LOG))*NNBSBB_XXXX +
- (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_8BB-1));
+ ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
+ (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
#ifdef NBNXN_SEARCH_SSE_SINGLE
if (nbat->XFormat == nbatXYZQ)
{
fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
sx,sy,sz,
- bb_ptr[0*STRIDE_8BB],bb_ptr[3*STRIDE_8BB],
- bb_ptr[1*STRIDE_8BB],bb_ptr[4*STRIDE_8BB],
- bb_ptr[2*STRIDE_8BB],bb_ptr[5*STRIDE_8BB]);
+ bb_ptr[0*STRIDE_PBB],bb_ptr[3*STRIDE_PBB],
+ bb_ptr[1*STRIDE_PBB],bb_ptr[4*STRIDE_PBB],
+ bb_ptr[2*STRIDE_PBB],bb_ptr[5*STRIDE_PBB]);
}
}
#endif
/* Determine in which grid column atoms should go */
static void calc_column_indices(nbnxn_grid_t *grid,
int a0,int a1,
- rvec *x,const int *move,
+ rvec *x,
+ int dd_zone,const int *move,
int thread,int nthread,
int *cell,
int *cxy_na)
n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
- for(i=n0; i<n1; i++)
+ if (dd_zone == 0)
{
- if (move == NULL || move[i] >= 0)
+ /* Home zone */
+ for(i=n0; i<n1; i++)
{
- /* We need to be careful with rounding,
- * particles might be a few bits outside the local box.
- * The int cast takes care of the lower bound,
- * we need to explicitly take care of the upper bound.
- */
- cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
- if (cx == grid->ncx)
- {
- cx = grid->ncx - 1;
- }
- cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
- if (cy == grid->ncy)
+ if (move == NULL || move[i] >= 0)
{
- cy = grid->ncy - 1;
- }
- /* For the moment cell contains only the, grid local,
- * x and y indices, not z.
- */
- cell[i] = cx*grid->ncy + cy;
+ /* We need to be careful with rounding,
+ * particles might be a few bits outside the local zone.
+ * The int cast takes care of the lower bound,
+ * we will explicitly take care of the upper bound.
+ */
+ cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
+ cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
#ifdef DEBUG_NBNXN_GRIDDING
- if (cell[i] < 0 || cell[i] >= grid->ncx*grid->ncy)
+ if (cx < 0 || cx >= grid->ncx ||
+ cy < 0 || cy >= grid->ncy)
+ {
+ gmx_fatal(FARGS,
+ "grid cell cx %d cy %d out of range (max %d %d)\n"
+ "atom %f %f %f, grid->c0 %f %f",
+ cx,cy,grid->ncx,grid->ncy,
+ x[i][XX],x[i][YY],x[i][ZZ],grid->c0[XX],grid->c0[YY]);
+ }
+#endif
+ /* Take care of potential rounding issues */
+ cx = min(cx,grid->ncx - 1);
+ cy = min(cy,grid->ncy - 1);
+
+ /* For the moment, cell will contain only the grid-local
+ * x and y indices, not z.
+ */
+ cell[i] = cx*grid->ncy + cy;
+ }
+ else
{
- gmx_fatal(FARGS,
- "grid cell cx %d cy %d out of range (max %d %d)\n"
- "atom %f %f %f, grid->c0 %f %f",
- cx,cy,grid->ncx,grid->ncy,
- x[i][XX],x[i][YY],x[i][ZZ],grid->c0[XX],grid->c0[YY]);
+ /* Put this moved particle after the end of the grid,
+ * so we can process it later without using conditionals.
+ */
+ cell[i] = grid->ncx*grid->ncy;
}
-#endif
+
+ cxy_na[cell[i]]++;
}
- else
+ }
+ else
+ {
+ /* Non-home zone */
+ for(i=n0; i<n1; i++)
{
- /* Put this moved particle after the end of the grid,
- * so we can process it later without using conditionals.
+ cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
+ cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
+
+ /* For non-home zones there could be particles outside
+ * the non-bonded cut-off range, which have been communicated
+ * for bonded interactions only. For the result it doesn't
+ * matter where these end up on the grid. For performance
+ * we put them in an extra row at the border.
*/
- cell[i] = grid->ncx*grid->ncy;
- }
+ cx = max(cx,0);
+ cx = min(cx,grid->ncx - 1);
+ cy = max(cy,0);
+ cy = min(cy,grid->ncy - 1);
- cxy_na[cell[i]]++;
+ /* For the moment, cell will contain only the grid-local
+ * x and y indices, not z.
+ */
+ cell[i] = cx*grid->ncy + cy;
+
+ cxy_na[cell[i]]++;
+ }
}
}
#pragma omp parallel for num_threads(nthread) schedule(static)
for(thread=0; thread<nthread; thread++)
{
- calc_column_indices(grid,a0,a1,x,move,thread,nthread,
+ calc_column_indices(grid,a0,a1,x,dd_zone,move,thread,nthread,
nbs->cell,nbs->work[thread].cxy_na);
}
over_alloc_large(ncz_max*grid->na_sc*SGSF);
srenew(nbs->work[thread].sort_work,
nbs->work[thread].sort_work_nalloc);
+ /* When not in use, all elements should be -1 */
+ for(i=0; i<nbs->work[thread].sort_work_nalloc; i++)
+ {
+ nbs->work[thread].sort_work[i] = -1;
+ }
}
}
nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
}
- /* Set the cell indices for the moved particles */
- n0 = grid->nc*grid->na_sc;
- n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
- for(i=n0; i<n1; i++)
+ if (dd_zone == 0)
{
- nbs->cell[nbs->a[i]] = i;
+ /* Set the cell indices for the moved particles */
+ n0 = grid->nc*grid->na_sc;
+ n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
+ for(i=n0; i<n1; i++)
+ {
+ nbs->cell[nbs->a[i]] = i;
+ }
}
/* Sort the super-cell columns along z into the sub-cells. */
}
}
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
if (grid->bSimple && nbat->XFormat == nbatX8)
{
combine_bounding_box_pairs(grid,grid->bb);
nbs->natoms_nonlocal = max(nbs->natoms_nonlocal,a1);
}
- nc_max_grid = set_grid_size_xy(nbs,grid,n-nmoved,corner0,corner1,
+ nc_max_grid = set_grid_size_xy(nbs,grid,
+ dd_zone,n-nmoved,corner0,corner1,
nbs->grid[0].atom_density,
nbat->XFormat);
}
}
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
if (grid->bSimple && nbat->XFormat == nbatX8)
{
combine_bounding_box_pairs(grid,grid->bb_simple);
return d2;
}
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
/* SSE code for bb distance for bb format xyz0 */
static float subc_bb_dist2_sse(int na_c,
\
shi = si*NNBSBB_D*DIM; \
\
- xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_8BB); \
- yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_8BB); \
- zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_8BB); \
- xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_8BB); \
- yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_8BB); \
- zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_8BB); \
+ xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_PBB); \
+ yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_PBB); \
+ zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_PBB); \
+ xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_PBB); \
+ yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_PBB); \
+ zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_PBB); \
\
dx_0 = _mm_sub_ps(xi_l,xj_h); \
dy_0 = _mm_sub_ps(yi_l,yj_h); \
zero = _mm_setzero_ps();
- xj_l = _mm_set1_ps(bb_j[0*STRIDE_8BB]);
- yj_l = _mm_set1_ps(bb_j[1*STRIDE_8BB]);
- zj_l = _mm_set1_ps(bb_j[2*STRIDE_8BB]);
- xj_h = _mm_set1_ps(bb_j[3*STRIDE_8BB]);
- yj_h = _mm_set1_ps(bb_j[4*STRIDE_8BB]);
- zj_h = _mm_set1_ps(bb_j[5*STRIDE_8BB]);
+ xj_l = _mm_set1_ps(bb_j[0*STRIDE_PBB]);
+ yj_l = _mm_set1_ps(bb_j[1*STRIDE_PBB]);
+ zj_l = _mm_set1_ps(bb_j[2*STRIDE_PBB]);
+ xj_h = _mm_set1_ps(bb_j[3*STRIDE_PBB]);
+ yj_h = _mm_set1_ps(bb_j[4*STRIDE_PBB]);
+ zj_h = _mm_set1_ps(bb_j[5*STRIDE_PBB]);
- /* Here we "loop" over si (0,STRIDE_8BB) from 0 to nsi with step STRIDE_8BB.
+ /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
* But as we know the number of iterations is 1 or 2, we unroll manually.
*/
SUBC_BB_DIST2_SSE_XXXX_INNER(0,bb_i,d2);
- if (STRIDE_8BB < nsi)
+ if (STRIDE_PBB < nsi)
{
- SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_8BB,bb_i,d2);
+ SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_PBB,bb_i,d2);
}
}
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* NBNXN_SEARCH_BB_SSE */
/* Plain C function which determines if any atom pair between two cells
* is within distance sqrt(rl2).
rc2_SSE = _mm_set1_ps(rl2);
- na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_8BB;
- ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_8BB);
- iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_8BB);
- iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_8BB);
- ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_8BB);
- iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_8BB);
- iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_8BB);
+ na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB;
+ ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_PBB);
+ iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_PBB);
+ iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_PBB);
+ ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_PBB);
+ iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_PBB);
+ iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_PBB);
/* We loop from the outer to the inner particles to maximize
* the chance that we find a pair in range quickly and return.
/* Returns the j sub-cell for index cj_ind */
static int nbl_cj(const nbnxn_pairlist_t *nbl,int cj_ind)
{
- return nbl->cj4[cj_ind>>2].cj[cj_ind & 3];
+ return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].cj[cj_ind & (NBNXN_GPU_JGROUP_SIZE - 1)];
}
/* Returns the i-interaction mask of the j sub-cell for index cj_ind */
static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl,int cj_ind)
{
- return nbl->cj4[cj_ind>>2].imei[0].imask;
+ return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].imei[0].imask;
}
/* Ensures there is enough space for "extra" extra exclusion masks */
/* We can store 4 j-subcell - i-supercell pairs in one struct.
 * Since the shift rounds down, we add NBNXN_GPU_JGROUP_SIZE - 1
 * to round the number of structs up.
 */
- ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + 4-1) >> 2);
+ ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
if (ncj4_max > nbl->cj4_nalloc)
{
snew(nbl->work,1);
#ifdef NBNXN_BBXXXX
- snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,32);
+ snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX,NBNXN_MEM_ALIGN);
#else
- snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,32);
+ snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,NBNXN_MEM_ALIGN);
#endif
- snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,32);
+ snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,NBNXN_MEM_ALIGN);
#ifdef GMX_NBNXN_SIMD
- snew_aligned(nbl->work->x_ci_simd_4xn,1,32);
- snew_aligned(nbl->work->x_ci_simd_2xnn,1,32);
+ snew_aligned(nbl->work->x_ci_simd_4xn,1,NBNXN_MEM_ALIGN);
+ snew_aligned(nbl->work->x_ci_simd_2xnn,1,NBNXN_MEM_ALIGN);
#endif
- snew_aligned(nbl->work->d2,GPU_NSUBCELL,32);
+ snew_aligned(nbl->work->d2,GPU_NSUBCELL,NBNXN_MEM_ALIGN);
}
void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
fprintf(fp,"nbl average j super cell list length %.1f\n",
0.25*nbl->ncj4/(double)nbl->nsci);
fprintf(fp,"nbl average i sub cell list length %.1f\n",
- nbl->nci_tot/(0.25*nbl->ncj4));
+ nbl->nci_tot/((double)nbl->ncj4));
for(si=0; si<=GPU_NSUBCELL; si++)
{
{
for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
{
- for(j=0; j<4; j++)
+ for(j=0; j<NBNXN_GPU_JGROUP_SIZE; j++)
{
b = 0;
for(si=0; si<GPU_NSUBCELL; si++)
w = (ej>>2);
for(ei=ej; ei<nbl->na_ci; ei++)
{
- excl[w]->pair[(ej&(4-1))*nbl->na_ci+ei] &=
- ~(1U << (sj_offset*GPU_NSUBCELL+si));
+ excl[w]->pair[(ej & (NBNXN_GPU_JGROUP_SIZE-1))*nbl->na_ci + ei] &=
+ ~(1U << (sj_offset*GPU_NSUBCELL + si));
}
}
}
for(cjo=0; cjo<gridj->nsubc[scj]; cjo++)
{
- cj4_ind = (nbl->work->cj_ind >> 2);
+ cj4_ind = (nbl->work->cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG);
cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
cj4 = &nbl->cj4[cj4_ind];
#ifdef NBNXN_BBXXXX
/* Determine all ci1 bb distances in one call with SSE */
- subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_8BB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_8BB-1)),
+ subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
ci1,bb_ci,d2l);
*ndistc += na_c*2;
#endif
{
/* Avoid using function pointers here, as it's slower */
if (
-#ifdef NBNXN_8BB_SSE
+#ifdef NBNXN_PBB_SSE
!subc_in_range_sse8
#else
!subc_in_range_x
nbl->nci_tot += npair;
/* Increase the closing index in i super-cell list */
- nbl->sci[nbl->nsci].cj4_ind_end = ((nbl->work->cj_ind+4-1)>>2);
+ nbl->sci[nbl->nsci].cj4_ind_end =
+ ((nbl->work->cj_ind+NBNXN_GPU_JGROUP_SIZE-1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
}
}
}
ndirect++;
}
}
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
else
{
while (cj_ind_first + ndirect <= cj_ind_last &&
inner_e = ge - se*na_c;
-/* Macro for getting the index of atom a within a cluster */
-#define AMODI(a) ((a) & (NBNXN_CPU_CLUSTER_I_SIZE - 1))
+/* Macro for getting the index of a j-cluster within a cj4 group */
+#define AMODCJ4(a) ((a) & (NBNXN_GPU_JGROUP_SIZE - 1))
-/* Macro for converting an atom number to a cluster number */
-#define A2CI(a) ((a) >> NBNXN_CPU_CLUSTER_I_SIZE_2LOG)
+/* Macro for converting a j-cluster index to a cj4 group index */
+#define A2CJ4(a) ((a) >> NBNXN_GPU_JGROUP_SIZE_2LOG)
+/* Macro for getting the index of a j-atom within a warp */
+#define AMODWI(a) ((a) & (NBNXN_GPU_CLUSTER_SIZE/2 - 1))
- if (nbl_imask0(nbl,found) & (1U << (AMODI(found)*GPU_NSUBCELL + si)))
+ if (nbl_imask0(nbl,found) & (1U << (AMODCJ4(found)*GPU_NSUBCELL + si)))
{
w = (inner_e >> 2);
- get_nbl_exclusions_1(nbl,A2CI(found),w,&nbl_excl);
+ get_nbl_exclusions_1(nbl,A2CJ4(found),w,&nbl_excl);
- nbl_excl->pair[AMODI(inner_e)*nbl->na_ci+inner_i] &=
- ~(1U << (AMODI(found)*GPU_NSUBCELL + si));
+ nbl_excl->pair[AMODWI(inner_e)*nbl->na_ci+inner_i] &=
+ ~(1U << (AMODCJ4(found)*GPU_NSUBCELL + si));
}
-#undef AMODI
-#undef A2CI
+#undef AMODCJ4
+#undef A2CJ4
+#undef AMODWI
}
}
}
{
sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start,jlen,nbl->work);
- if (nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0))
+ /* The counts below are used for non-bonded pair/flop counts
+ * and should therefore match the available kernel setups.
+ */
+ if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
{
- nbl->work->ncj_hlj += jlen;
+ nbl->work->ncj_noq += jlen;
}
- else if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
+ else if ((nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0)) ||
+ !(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_LJ(0)))
{
- nbl->work->ncj_noq += jlen;
+ nbl->work->ncj_hlj += jlen;
}
nbl->nci++;
/* We can only have complete blocks of NBNXN_GPU_JGROUP_SIZE j-entries
 * in a list, so round the count up before closing.
 */
- nbl->ncj4 = ((nbl->work->cj_ind + 4-1) >> 2);
+ nbl->ncj4 = ((nbl->work->cj_ind + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
nbl->nsci++;
int ia,m,i;
#ifdef NBNXN_BBXXXX
- ia = ci*(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX;
- for(m=0; m<(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
+ ia = ci*(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX;
+ for(m=0; m<(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
{
- for(i=0; i<STRIDE_8BB; i++)
+ for(i=0; i<STRIDE_PBB; i++)
{
- bb_ci[m+0*STRIDE_8BB+i] = bb[ia+m+0*STRIDE_8BB+i] + shx;
- bb_ci[m+1*STRIDE_8BB+i] = bb[ia+m+1*STRIDE_8BB+i] + shy;
- bb_ci[m+2*STRIDE_8BB+i] = bb[ia+m+2*STRIDE_8BB+i] + shz;
- bb_ci[m+3*STRIDE_8BB+i] = bb[ia+m+3*STRIDE_8BB+i] + shx;
- bb_ci[m+4*STRIDE_8BB+i] = bb[ia+m+4*STRIDE_8BB+i] + shy;
- bb_ci[m+5*STRIDE_8BB+i] = bb[ia+m+5*STRIDE_8BB+i] + shz;
+ bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx;
+ bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy;
+ bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz;
+ bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx;
+ bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy;
+ bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
}
}
#else
}
}
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
/* Copies PBC shifted super-cell packed atom coordinates to working array */
static void icell_set_x_supersub_sse8(int ci,
real shx,real shy,real shz,
for(si=0; si<GPU_NSUBCELL; si++)
{
- for(i=0; i<na_c; i+=STRIDE_8BB)
+ for(i=0; i<na_c; i+=STRIDE_PBB)
{
io = si*na_c + i;
ia = ci*GPU_NSUBCELL*na_c + io;
- for(j=0; j<STRIDE_8BB; j++)
+ for(j=0; j<STRIDE_PBB; j++)
{
- x_ci[io*DIM + j + XX*STRIDE_8BB] = x[(ia+j)*stride+XX] + shx;
- x_ci[io*DIM + j + YY*STRIDE_8BB] = x[(ia+j)*stride+YY] + shy;
- x_ci[io*DIM + j + ZZ*STRIDE_8BB] = x[(ia+j)*stride+ZZ] + shz;
+ x_ci[io*DIM + j + XX*STRIDE_PBB] = x[(ia+j)*stride+XX] + shx;
+ x_ci[io*DIM + j + YY*STRIDE_PBB] = x[(ia+j)*stride+YY] + shy;
+ x_ci[io*DIM + j + ZZ*STRIDE_PBB] = x[(ia+j)*stride+ZZ] + shz;
}
}
}
for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
{
- for(j=0; j<4; j++)
+ for(j=0; j<NBNXN_GPU_JGROUP_SIZE; j++)
{
fprintf(fp," sj %5d imask %x\n",
nbl->cj4[j4].cj[j],
}
else
{
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
nbs->icell_set_x = icell_set_x_supersub_sse8;
#else
nbs->icell_set_x = icell_set_x_supersub;