The nbnxn search code now uses a well organized bounding box data
structure instead of a complicated indexed plain float array
for the cluster bounding boxes.
Change-Id: Ia5adf2d33d495cff3178ca950e8c16aefcfef1fb
+/* Pair search box lower and upper corner in x,y,z.
+ * Store this in 4 instead of 3 reals, which is useful with SSE.
+ * To avoid complicating the code we also use 4 without SSE.
+ */
+#define NNBSBB_C 4
+/* Pair search box lower and upper bound in z only. */
+#define NNBSBB_D 2
+/* Pair search box lower and upper corner x,y,z indices, entry 3 is unused */
+#define BB_X 0
+#define BB_Y 1
+#define BB_Z 2
+
+/* Bounding box for a nbnxn atom cluster */
+typedef struct {
+ float lower[NNBSBB_C];
+ float upper[NNBSBB_C];
+} nbnxn_bb_t;
+
+
/* A pair-search grid struct for one domain decomposition zone */
typedef struct {
rvec c0; /* The lower corner of the (local) grid */
/* A pair-search grid struct for one domain decomposition zone */
typedef struct {
rvec c0; /* The lower corner of the (local) grid */
int *cxy_ind; /* Grid (super)cell index, offset from cell0 */
int cxy_nalloc; /* Allocation size for cxy_na and cxy_ind */
int *cxy_ind; /* Grid (super)cell index, offset from cell0 */
int cxy_nalloc; /* Allocation size for cxy_na and cxy_ind */
- int *nsubc; /* The number of sub cells for each super cell */
- float *bbcz; /* Bounding boxes in z for the super cells */
- float *bb; /* 3D bounding boxes for the sub cells */
- float *bbj; /* 3D j-b.boxes for SSE-double or AVX-single */
- int *flags; /* Flag for the super cells */
- int nc_nalloc; /* Allocation size for the pointers above */
+ int *nsubc; /* The number of sub cells for each super cell */
+ float *bbcz; /* Bounding boxes in z for the super cells */
+ nbnxn_bb_t *bb; /* 3D bounding boxes for the sub cells */
+ nbnxn_bb_t *bbj; /* 3D j-b.boxes for SSE-double or AVX-single */
+ float *pbb; /* 3D b. boxes in xxxx format per super cell */
+ int *flags; /* Flag for the super cells */
+ int nc_nalloc; /* Allocation size for the pointers above */
- float *bbcz_simple; /* bbcz for simple grid converted from super */
- float *bb_simple; /* bb for simple grid converted from super */
- int *flags_simple; /* flags for simple grid converted from super */
- int nc_nalloc_simple; /* Allocation size for the pointers above */
+ float *bbcz_simple; /* bbcz for simple grid converted from super */
+ nbnxn_bb_t *bb_simple; /* bb for simple grid converted from super */
+ int *flags_simple; /* flags for simple grid converted from super */
+ int nc_nalloc_simple; /* Allocation size for the pointers above */
int nsubc_tot; /* Total number of subcell, used for printing */
} nbnxn_grid_t;
int nsubc_tot; /* Total number of subcell, used for printing */
} nbnxn_grid_t;
/* Working data for the actual i-supercell during pair search */
typedef struct nbnxn_list_work {
/* Working data for the actual i-supercell during pair search */
typedef struct nbnxn_list_work {
- gmx_cache_protect_t cp0; /* Protect cache between threads */
+ gmx_cache_protect_t cp0; /* Protect cache between threads */
- float *bb_ci; /* The bounding boxes, pbc shifted, for each cluster */
- real *x_ci; /* The coordinates, pbc shifted, for each atom */
+ nbnxn_bb_t *bb_ci; /* The bounding boxes, pbc shifted, for each cluster */
+ float *pbb_ci; /* As bb_ci, but in xxxx packed format */
+ real *x_ci; /* The coordinates, pbc shifted, for each atom */
#ifdef GMX_NBNXN_SIMD
nbnxn_x_ci_simd_4xn_t *x_ci_simd_4xn;
nbnxn_x_ci_simd_2xnn_t *x_ci_simd_2xnn;
#ifdef GMX_NBNXN_SIMD
nbnxn_x_ci_simd_4xn_t *x_ci_simd_4xn;
nbnxn_x_ci_simd_2xnn_t *x_ci_simd_2xnn;
-/* Pair search box lower and upper corner in x,y,z.
- * Store this in 4 iso 3 reals, which is useful with SSE.
- * To avoid complicating the code we also use 4 without SSE.
- */
-#define NNBSBB_C 4
-#define NNBSBB_B (2*NNBSBB_C)
-/* Pair search box lower and upper bound in z only. */
-#define NNBSBB_D 2
-/* Pair search box lower and upper corner x,y,z indices */
-#define BBL_X 0
-#define BBL_Y 1
-#define BBL_Z 2
-#define BBU_X 4
-#define BBU_Y 5
-#define BBU_Z 6
-
-
#ifdef NBNXN_SEARCH_BB_SSE
/* We use SSE or AVX-128bit for bounding box calculations */
#ifdef NBNXN_SEARCH_BB_SSE
/* We use SSE or AVX-128bit for bounding box calculations */
if (nc_max > grid->nc_nalloc)
{
if (nc_max > grid->nc_nalloc)
{
grid->nc_nalloc = over_alloc_large(nc_max);
srenew(grid->nsubc, grid->nc_nalloc);
srenew(grid->bbcz, grid->nc_nalloc*NNBSBB_D);
grid->nc_nalloc = over_alloc_large(nc_max);
srenew(grid->nsubc, grid->nc_nalloc);
srenew(grid->bbcz, grid->nc_nalloc*NNBSBB_D);
-#ifdef NBNXN_PBB_SSE
- bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
-#else
- bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
-#endif
sfree_aligned(grid->bb);
/* This snew also zeros the contents, this avoid possible
* floating exceptions in SSE with the unused bb elements.
*/
sfree_aligned(grid->bb);
/* This snew also zeros the contents, this avoid possible
* floating exceptions in SSE with the unused bb elements.
*/
- snew_aligned(grid->bb, bb_nalloc, 16);
+ if (grid->bSimple)
+ {
+ snew_aligned(grid->bb, grid->nc_nalloc, 16);
+ }
+ else
+ {
+#ifdef NBNXN_BBXXXX
+ int pbb_nalloc;
+
+ pbb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
+ snew_aligned(grid->pbb, pbb_nalloc, 16);
+#else
+ snew_aligned(grid->bb, grid->nc_nalloc*GPU_NSUBCELL, 16);
+#endif
+ }
else
{
sfree_aligned(grid->bbj);
else
{
sfree_aligned(grid->bbj);
- snew_aligned(grid->bbj, bb_nalloc*grid->na_c/grid->na_cj, 16);
+ snew_aligned(grid->bbj, grid->nc_nalloc*grid->na_c/grid->na_cj, 16);
#endif
/* Coordinate order x,y,z, bb order xyz0 */
#endif
/* Coordinate order x,y,z, bb order xyz0 */
-static void calc_bounding_box(int na, int stride, const real *x, float *bb)
+static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
{
int i, j;
real xl, xh, yl, yh, zl, zh;
{
int i, j;
real xl, xh, yl, yh, zl, zh;
i += stride;
}
/* Note: possible double to float conversion here */
i += stride;
}
/* Note: possible double to float conversion here */
- bb[BBL_X] = R2F_D(xl);
- bb[BBL_Y] = R2F_D(yl);
- bb[BBL_Z] = R2F_D(zl);
- bb[BBU_X] = R2F_U(xh);
- bb[BBU_Y] = R2F_U(yh);
- bb[BBU_Z] = R2F_U(zh);
+ bb->lower[BB_X] = R2F_D(xl);
+ bb->lower[BB_Y] = R2F_D(yl);
+ bb->lower[BB_Z] = R2F_D(zl);
+ bb->upper[BB_X] = R2F_U(xh);
+ bb->upper[BB_Y] = R2F_U(yh);
+ bb->upper[BB_Z] = R2F_U(zh);
}
/* Packed coordinates, bb order xyz0 */
}
/* Packed coordinates, bb order xyz0 */
-static void calc_bounding_box_x_x4(int na, const real *x, float *bb)
+static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
{
int j;
real xl, xh, yl, yh, zl, zh;
{
int j;
real xl, xh, yl, yh, zl, zh;
zh = max(zh, x[j+ZZ*PACK_X4]);
}
/* Note: possible double to float conversion here */
zh = max(zh, x[j+ZZ*PACK_X4]);
}
/* Note: possible double to float conversion here */
- bb[BBL_X] = R2F_D(xl);
- bb[BBL_Y] = R2F_D(yl);
- bb[BBL_Z] = R2F_D(zl);
- bb[BBU_X] = R2F_U(xh);
- bb[BBU_Y] = R2F_U(yh);
- bb[BBU_Z] = R2F_U(zh);
+ bb->lower[BB_X] = R2F_D(xl);
+ bb->lower[BB_Y] = R2F_D(yl);
+ bb->lower[BB_Z] = R2F_D(zl);
+ bb->upper[BB_X] = R2F_U(xh);
+ bb->upper[BB_Y] = R2F_U(yh);
+ bb->upper[BB_Z] = R2F_U(zh);
}
/* Packed coordinates, bb order xyz0 */
}
/* Packed coordinates, bb order xyz0 */
-static void calc_bounding_box_x_x8(int na, const real *x, float *bb)
+static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
{
int j;
real xl, xh, yl, yh, zl, zh;
{
int j;
real xl, xh, yl, yh, zl, zh;
zh = max(zh, x[j+ZZ*PACK_X8]);
}
/* Note: possible double to float conversion here */
zh = max(zh, x[j+ZZ*PACK_X8]);
}
/* Note: possible double to float conversion here */
- bb[BBL_X] = R2F_D(xl);
- bb[BBL_Y] = R2F_D(yl);
- bb[BBL_Z] = R2F_D(zl);
- bb[BBU_X] = R2F_U(xh);
- bb[BBU_Y] = R2F_U(yh);
- bb[BBU_Z] = R2F_U(zh);
+ bb->lower[BB_X] = R2F_D(xl);
+ bb->lower[BB_Y] = R2F_D(yl);
+ bb->lower[BB_Z] = R2F_D(zl);
+ bb->upper[BB_X] = R2F_U(xh);
+ bb->upper[BB_Y] = R2F_U(yh);
+ bb->upper[BB_Z] = R2F_U(zh);
}
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4_halves(int na, const real *x,
}
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4_halves(int na, const real *x,
+ nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
-#ifndef NBNXN_SEARCH_BB_SSE
- int i;
-#endif
-
calc_bounding_box_x_x4(min(na, 2), x, bbj);
if (na > 2)
{
calc_bounding_box_x_x4(min(na, 2), x, bbj);
if (na > 2)
{
- calc_bounding_box_x_x4(min(na-2, 2), x+(PACK_X4>>1), bbj+NNBSBB_B);
+ calc_bounding_box_x_x4(min(na-2, 2), x+(PACK_X4>>1), bbj+1);
* so we don't need to treat special cases in the rest of the code.
*/
#ifdef NBNXN_SEARCH_BB_SSE
* so we don't need to treat special cases in the rest of the code.
*/
#ifdef NBNXN_SEARCH_BB_SSE
- _mm_store_ps(bbj+NNBSBB_B, _mm_load_ps(bbj));
- _mm_store_ps(bbj+NNBSBB_B+NNBSBB_C, _mm_load_ps(bbj+NNBSBB_C));
+ _mm_store_ps(&bbj[1].lower[0], _mm_load_ps(&bbj[0].lower[0]));
+ _mm_store_ps(&bbj[1].upper[0], _mm_load_ps(&bbj[0].upper[0]));
- for (i = 0; i < NNBSBB_B; i++)
- {
- bbj[NNBSBB_B + i] = bbj[i];
- }
#endif
}
#ifdef NBNXN_SEARCH_BB_SSE
#endif
}
#ifdef NBNXN_SEARCH_BB_SSE
- _mm_store_ps(bb, _mm_min_ps(_mm_load_ps(bbj),
- _mm_load_ps(bbj+NNBSBB_B)));
- _mm_store_ps(bb+NNBSBB_C, _mm_max_ps(_mm_load_ps(bbj+NNBSBB_C),
- _mm_load_ps(bbj+NNBSBB_B+NNBSBB_C)));
+ _mm_store_ps(&bb->lower[0], _mm_min_ps(_mm_load_ps(&bbj[0].lower[0]),
+ _mm_load_ps(&bbj[1].lower[0])));
+ _mm_store_ps(&bb->upper[0], _mm_max_ps(_mm_load_ps(&bbj[0].upper[0]),
+ _mm_load_ps(&bbj[1].upper[0])));
- for (i = 0; i < NNBSBB_C; i++)
- bb[ i] = min(bbj[ i], bbj[NNBSBB_B + i]);
- bb[NNBSBB_C + i] = max(bbj[NNBSBB_C + i], bbj[NNBSBB_B + NNBSBB_C + i]);
+ int i;
+
+ for (i = 0; i < NNBSBB_C; i++)
+ {
+ bb->lower[i] = min(bbj[0].lower[i], bbj[1].lower[i]);
+ bb->upper[i] = max(bbj[0].upper[i], bbj[1].upper[i]);
+ }
#ifdef NBNXN_SEARCH_SSE_SINGLE
/* Coordinate order xyz?, bb order xyz0 */
#ifdef NBNXN_SEARCH_SSE_SINGLE
/* Coordinate order xyz?, bb order xyz0 */
-static void calc_bounding_box_sse(int na, const float *x, float *bb)
+static void calc_bounding_box_sse(int na, const float *x, nbnxn_bb_t *bb)
{
__m128 bb_0_SSE, bb_1_SSE;
__m128 x_SSE;
{
__m128 bb_0_SSE, bb_1_SSE;
__m128 x_SSE;
bb_1_SSE = _mm_max_ps(bb_1_SSE, x_SSE);
}
bb_1_SSE = _mm_max_ps(bb_1_SSE, x_SSE);
}
- _mm_store_ps(bb, bb_0_SSE);
- _mm_store_ps(bb+4, bb_1_SSE);
+ _mm_store_ps(&bb->lower[0], bb_0_SSE);
+ _mm_store_ps(&bb->upper[0], bb_1_SSE);
}
/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx_sse(int na, const float *x,
}
/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx_sse(int na, const float *x,
+ nbnxn_bb_t *bb_work_aligned,
- calc_bounding_box_sse(na, x, bb_work);
-
- bb[0*STRIDE_PBB] = bb_work[BBL_X];
- bb[1*STRIDE_PBB] = bb_work[BBL_Y];
- bb[2*STRIDE_PBB] = bb_work[BBL_Z];
- bb[3*STRIDE_PBB] = bb_work[BBU_X];
- bb[4*STRIDE_PBB] = bb_work[BBU_Y];
- bb[5*STRIDE_PBB] = bb_work[BBU_Z];
+ calc_bounding_box_sse(na, x, bb_work_aligned);
+
+ bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
+ bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
+ bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
+ bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
+ bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
+ bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
}
#endif /* NBNXN_SEARCH_SSE_SINGLE */
/* Combines pairs of consecutive bounding boxes */
}
#endif /* NBNXN_SEARCH_SSE_SINGLE */
/* Combines pairs of consecutive bounding boxes */
-static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const float *bb)
+static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb)
{
int i, j, sc2, nc2, c2;
{
int i, j, sc2, nc2, c2;
#ifdef NBNXN_SEARCH_BB_SSE
__m128 min_SSE, max_SSE;
#ifdef NBNXN_SEARCH_BB_SSE
__m128 min_SSE, max_SSE;
- min_SSE = _mm_min_ps(_mm_load_ps(bb+(c2*4+0)*NNBSBB_C),
- _mm_load_ps(bb+(c2*4+2)*NNBSBB_C));
- max_SSE = _mm_max_ps(_mm_load_ps(bb+(c2*4+1)*NNBSBB_C),
- _mm_load_ps(bb+(c2*4+3)*NNBSBB_C));
- _mm_store_ps(grid->bbj+(c2*2+0)*NNBSBB_C, min_SSE);
- _mm_store_ps(grid->bbj+(c2*2+1)*NNBSBB_C, max_SSE);
+ min_SSE = _mm_min_ps(_mm_load_ps(&bb[c2*2+0].lower[0]),
+ _mm_load_ps(&bb[c2*2+1].lower[0]));
+ max_SSE = _mm_max_ps(_mm_load_ps(&bb[c2*2+0].upper[0]),
+ _mm_load_ps(&bb[c2*2+1].upper[0]));
+ _mm_store_ps(&grid->bbj[c2].lower[0], min_SSE);
+ _mm_store_ps(&grid->bbj[c2].upper[0], max_SSE);
#else
for (j = 0; j < NNBSBB_C; j++)
{
#else
for (j = 0; j < NNBSBB_C; j++)
{
- grid->bbj[(c2*2+0)*NNBSBB_C+j] = min(bb[(c2*4+0)*NNBSBB_C+j],
- bb[(c2*4+2)*NNBSBB_C+j]);
- grid->bbj[(c2*2+1)*NNBSBB_C+j] = max(bb[(c2*4+1)*NNBSBB_C+j],
- bb[(c2*4+3)*NNBSBB_C+j]);
+            grid->bbj[c2].lower[j] = min(bb[c2*2+0].lower[j],
+                                         bb[c2*2+1].lower[j]);
+            grid->bbj[c2].upper[j] = max(bb[c2*2+0].upper[j],
+                                         bb[c2*2+1].upper[j]);
/* Copy the last bb for odd bb count in this column */
for (j = 0; j < NNBSBB_C; j++)
{
/* Copy the last bb for odd bb count in this column */
for (j = 0; j < NNBSBB_C; j++)
{
- grid->bbj[(c2*2+0)*NNBSBB_C+j] = bb[(c2*4+0)*NNBSBB_C+j];
- grid->bbj[(c2*2+1)*NNBSBB_C+j] = bb[(c2*4+1)*NNBSBB_C+j];
+ grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
+ grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
{
for (d = 0; d < DIM; d++)
{
{
for (d = 0; d < DIM; d++)
{
- ba[d] += grid->bb[c*NNBSBB_B+NNBSBB_C+d] - grid->bb[c*NNBSBB_B+d];
+ ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];
}
}
dsvmul(1.0/grid->nc, ba, ba);
}
}
dsvmul(1.0/grid->nc, ba, ba);
for (d = 0; d < DIM; d++)
{
ba[d] +=
for (d = 0; d < DIM; d++)
{
ba[d] +=
- grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
- grid->bb[cs_w*NNBSBB_XXXX+ d *STRIDE_PBB+i];
+ grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
+ grid->pbb[cs_w*NNBSBB_XXXX+ d *STRIDE_PBB+i];
cs = c*GPU_NSUBCELL + s;
for (d = 0; d < DIM; d++)
{
cs = c*GPU_NSUBCELL + s;
for (d = 0; d < DIM; d++)
{
- ba[d] +=
- grid->bb[cs*NNBSBB_B+NNBSBB_C+d] -
- grid->bb[cs*NNBSBB_B +d];
+ ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];
const int *atinfo,
rvec *x,
int sx, int sy, int sz,
const int *atinfo,
rvec *x,
int sx, int sy, int sz,
+ nbnxn_bb_t *bb_work_aligned)
- int na, a;
- size_t offset;
- float *bb_ptr;
+ int na, a;
+ size_t offset;
+ nbnxn_bb_t *bb_ptr;
+#ifdef NBNXN_BBXXXX
+ float *pbb_ptr;
+#endif
if (nbat->XFormat == nbatX4)
{
/* Store the bounding boxes as xyz.xyz. */
if (nbat->XFormat == nbatX4)
{
/* Store the bounding boxes as xyz.xyz. */
- offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
+ offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
bb_ptr = grid->bb + offset;
#if defined GMX_NBNXN_SIMD && GMX_SIMD_WIDTH_HERE == 2
bb_ptr = grid->bb + offset;
#if defined GMX_NBNXN_SIMD && GMX_SIMD_WIDTH_HERE == 2
else if (nbat->XFormat == nbatX8)
{
/* Store the bounding boxes as xyz.xyz. */
else if (nbat->XFormat == nbatX8)
{
/* Store the bounding boxes as xyz.xyz. */
- offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
+ offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
bb_ptr = grid->bb + offset;
calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(a0), bb_ptr);
bb_ptr = grid->bb + offset;
calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(a0), bb_ptr);
/* Store the bounding boxes in a format convenient
* for SSE calculations: xxxxyyyyzzzz...
*/
/* Store the bounding boxes in a format convenient
* for SSE calculations: xxxxyyyyzzzz...
*/
+ pbb_ptr =
+ grid->pbb +
((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
(((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
(((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
if (nbat->XFormat == nbatXYZQ)
{
calc_bounding_box_xxxx_sse(na, nbat->x+a0*nbat->xstride,
if (nbat->XFormat == nbatXYZQ)
{
calc_bounding_box_xxxx_sse(na, nbat->x+a0*nbat->xstride,
+ bb_work_aligned, pbb_ptr);
}
else
#endif
{
calc_bounding_box_xxxx(na, nbat->xstride, nbat->x+a0*nbat->xstride,
}
else
#endif
{
calc_bounding_box_xxxx(na, nbat->xstride, nbat->x+a0*nbat->xstride,
}
if (gmx_debug_at)
{
fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
sx, sy, sz,
}
if (gmx_debug_at)
{
fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
sx, sy, sz,
- bb_ptr[0*STRIDE_PBB], bb_ptr[3*STRIDE_PBB],
- bb_ptr[1*STRIDE_PBB], bb_ptr[4*STRIDE_PBB],
- bb_ptr[2*STRIDE_PBB], bb_ptr[5*STRIDE_PBB]);
+ pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
+ pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
+ pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
}
}
#endif
else
{
/* Store the bounding boxes as xyz.xyz. */
}
}
#endif
else
{
/* Store the bounding boxes as xyz.xyz. */
- bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
+ bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log);
calc_bounding_box(na, nbat->xstride, nbat->x+a0*nbat->xstride,
bb_ptr);
calc_bounding_box(na, nbat->xstride, nbat->x+a0*nbat->xstride,
bb_ptr);
bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
sx, sy, sz,
bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
sx, sy, sz,
- (grid->bb+bbo*NNBSBB_B)[BBL_X],
- (grid->bb+bbo*NNBSBB_B)[BBU_X],
- (grid->bb+bbo*NNBSBB_B)[BBL_Y],
- (grid->bb+bbo*NNBSBB_B)[BBU_Y],
- (grid->bb+bbo*NNBSBB_B)[BBL_Z],
- (grid->bb+bbo*NNBSBB_B)[BBU_Z]);
+                    grid->bb[bbo].lower[BB_X],
+                    grid->bb[bbo].upper[BB_X],
+                    grid->bb[bbo].lower[BB_Y],
+                    grid->bb[bbo].upper[BB_Y],
+                    grid->bb[bbo].lower[BB_Z],
+                    grid->bb[bbo].upper[BB_Z]);
- grid->bbcz[c*NNBSBB_D ] = grid->bb[cfilled*NNBSBB_B+2];
- grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled*NNBSBB_B+6];
+ grid->bbcz[c*NNBSBB_D ] = grid->bb[cfilled].lower[BB_Z];
+ grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];
}
/* Set the unused atom indices to -1 */
}
/* Set the unused atom indices to -1 */
int subdiv_y, sub_y, na_y, ash_y;
int subdiv_x, sub_x, na_x, ash_x;
int subdiv_y, sub_y, na_y, ash_y;
int subdiv_x, sub_x, na_x, ash_x;
- /* cppcheck-suppress unassignedVariable */
- float bb_work_array[NNBSBB_B+3], *bb_work_align;
+ nbnxn_bb_t bb_work_array[2], *bb_work_aligned;
- bb_work_align = (float *)(((size_t)(bb_work_array+3)) & (~((size_t)15)));
+ bb_work_aligned = (nbnxn_bb_t *)(((size_t)(bb_work_array+1)) & (~((size_t)15)));
grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),
grid->na_c*sub_z,
grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),
grid->na_c*sub_z,
nbnxn_atomdata_t *nbat)
{
nbnxn_grid_t *grid;
nbnxn_atomdata_t *nbat)
{
nbnxn_grid_t *grid;
+ float *bbcz;
+ nbnxn_bb_t *bb;
int ncd, sc;
grid = &nbs->grid[0];
int ncd, sc;
grid = &nbs->grid[0];
{
grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
srenew(grid->bbcz_simple, grid->nc_nalloc_simple*NNBSBB_D);
{
grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
srenew(grid->bbcz_simple, grid->nc_nalloc_simple*NNBSBB_D);
- srenew(grid->bb_simple, grid->nc_nalloc_simple*NNBSBB_B);
+ srenew(grid->bb_simple, grid->nc_nalloc_simple);
srenew(grid->flags_simple, grid->nc_nalloc_simple);
if (nbat->XFormat)
{
srenew(grid->flags_simple, grid->nc_nalloc_simple);
if (nbat->XFormat)
{
case nbatX4:
/* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
calc_bounding_box_x_x4(na, nbat->x+tx*STRIDE_P4,
case nbatX4:
/* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
calc_bounding_box_x_x4(na, nbat->x+tx*STRIDE_P4,
break;
case nbatX8:
/* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
break;
case nbatX8:
/* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
break;
default:
calc_bounding_box(na, nbat->xstride,
nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
break;
default:
calc_bounding_box(na, nbat->xstride,
nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
- bbcz[tx*NNBSBB_D+0] = bb[tx*NNBSBB_B +ZZ];
- bbcz[tx*NNBSBB_D+1] = bb[tx*NNBSBB_B+NNBSBB_C+ZZ];
+ bbcz[tx*NNBSBB_D+0] = bb[tx].lower[BB_Z];
+ bbcz[tx*NNBSBB_D+1] = bb[tx].upper[BB_Z];
/* No interaction optimization yet here */
grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
/* No interaction optimization yet here */
grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
/* Reference code calculating the distance^2 between two bounding boxes */
static float box_dist2(float bx0, float bx1, float by0,
float by1, float bz0, float bz1,
/* Reference code calculating the distance^2 between two bounding boxes */
static float box_dist2(float bx0, float bx1, float by0,
float by1, float bz0, float bz1,
{
float d2;
float dl, dh, dm, dm0;
d2 = 0;
{
float d2;
float dl, dh, dm, dm0;
d2 = 0;
- dl = bx0 - bb[BBU_X];
- dh = bb[BBL_X] - bx1;
+ dl = bx0 - bb->upper[BB_X];
+ dh = bb->lower[BB_X] - bx1;
dm = max(dl, dh);
dm0 = max(dm, 0);
d2 += dm0*dm0;
dm = max(dl, dh);
dm0 = max(dm, 0);
d2 += dm0*dm0;
- dl = by0 - bb[BBU_Y];
- dh = bb[BBL_Y] - by1;
+ dl = by0 - bb->upper[BB_Y];
+ dh = bb->lower[BB_Y] - by1;
dm = max(dl, dh);
dm0 = max(dm, 0);
d2 += dm0*dm0;
dm = max(dl, dh);
dm0 = max(dm, 0);
d2 += dm0*dm0;
- dl = bz0 - bb[BBU_Z];
- dh = bb[BBL_Z] - bz1;
+ dl = bz0 - bb->upper[BB_Z];
+ dh = bb->lower[BB_Z] - bz1;
dm = max(dl, dh);
dm0 = max(dm, 0);
d2 += dm0*dm0;
dm = max(dl, dh);
dm0 = max(dm, 0);
d2 += dm0*dm0;
}
/* Plain C code calculating the distance^2 between two bounding boxes */
}
/* Plain C code calculating the distance^2 between two bounding boxes */
-static float subc_bb_dist2(int si, const float *bb_i_ci,
- int csj, const float *bb_j_all)
+static float subc_bb_dist2(int si, const nbnxn_bb_t *bb_i_ci,
+ int csj, const nbnxn_bb_t *bb_j_all)
- const float *bb_i, *bb_j;
- float d2;
- float dl, dh, dm, dm0;
+ const nbnxn_bb_t *bb_i, *bb_j;
+ float d2;
+ float dl, dh, dm, dm0;
- bb_i = bb_i_ci + si*NNBSBB_B;
- bb_j = bb_j_all + csj*NNBSBB_B;
+ bb_i = bb_i_ci + si;
+ bb_j = bb_j_all + csj;
- dl = bb_i[BBL_X] - bb_j[BBU_X];
- dh = bb_j[BBL_X] - bb_i[BBU_X];
+ dl = bb_i->lower[BB_X] - bb_j->upper[BB_X];
+ dh = bb_j->lower[BB_X] - bb_i->upper[BB_X];
dm = max(dl, dh);
dm0 = max(dm, 0);
d2 += dm0*dm0;
dm = max(dl, dh);
dm0 = max(dm, 0);
d2 += dm0*dm0;
- dl = bb_i[BBL_Y] - bb_j[BBU_Y];
- dh = bb_j[BBL_Y] - bb_i[BBU_Y];
+ dl = bb_i->lower[BB_Y] - bb_j->upper[BB_Y];
+ dh = bb_j->lower[BB_Y] - bb_i->upper[BB_Y];
dm = max(dl, dh);
dm0 = max(dm, 0);
d2 += dm0*dm0;
dm = max(dl, dh);
dm0 = max(dm, 0);
d2 += dm0*dm0;
- dl = bb_i[BBL_Z] - bb_j[BBU_Z];
- dh = bb_j[BBL_Z] - bb_i[BBU_Z];
+ dl = bb_i->lower[BB_Z] - bb_j->upper[BB_Z];
+ dh = bb_j->lower[BB_Z] - bb_i->upper[BB_Z];
dm = max(dl, dh);
dm0 = max(dm, 0);
d2 += dm0*dm0;
dm = max(dl, dh);
dm0 = max(dm, 0);
d2 += dm0*dm0;
#ifdef NBNXN_SEARCH_BB_SSE
/* SSE code for bb distance for bb format xyz0 */
#ifdef NBNXN_SEARCH_BB_SSE
/* SSE code for bb distance for bb format xyz0 */
-static float subc_bb_dist2_sse(int si, const float *bb_i_ci,
- int csj, const float *bb_j_all)
+static float subc_bb_dist2_sse(int si, const nbnxn_bb_t *bb_i_ci,
+ int csj, const nbnxn_bb_t *bb_j_all)
- const float *bb_i, *bb_j;
-
__m128 bb_i_SSE0, bb_i_SSE1;
__m128 bb_j_SSE0, bb_j_SSE1;
__m128 dl_SSE;
__m128 bb_i_SSE0, bb_i_SSE1;
__m128 bb_j_SSE0, bb_j_SSE1;
__m128 dl_SSE;
- bb_i = bb_i_ci + si*NNBSBB_B;
- bb_j = bb_j_all + csj*NNBSBB_B;
-
- bb_i_SSE0 = _mm_load_ps(bb_i);
- bb_i_SSE1 = _mm_load_ps(bb_i+NNBSBB_C);
- bb_j_SSE0 = _mm_load_ps(bb_j);
- bb_j_SSE1 = _mm_load_ps(bb_j+NNBSBB_C);
+ bb_i_SSE0 = _mm_load_ps(&bb_i_ci[si].lower[0]);
+ bb_i_SSE1 = _mm_load_ps(&bb_i_ci[si].upper[0]);
+ bb_j_SSE0 = _mm_load_ps(&bb_j_all[csj].lower[0]);
+ bb_j_SSE1 = _mm_load_ps(&bb_j_all[csj].upper[0]);
dl_SSE = _mm_sub_ps(bb_i_SSE0, bb_j_SSE1);
dh_SSE = _mm_sub_ps(bb_j_SSE0, bb_i_SSE1);
dl_SSE = _mm_sub_ps(bb_i_SSE0, bb_j_SSE1);
dh_SSE = _mm_sub_ps(bb_j_SSE0, bb_i_SSE1);
+ if (nbl->bSimple)
+ {
+ snew_aligned(nbl->work->bb_ci, 1, NBNXN_MEM_ALIGN);
+ }
+ else
+ {
- snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX, NBNXN_MEM_ALIGN);
+ snew_aligned(nbl->work->pbb_ci, GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX, NBNXN_MEM_ALIGN);
- snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL*NNBSBB_B, NBNXN_MEM_ALIGN);
+ snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL, NBNXN_MEM_ALIGN);
snew_aligned(nbl->work->x_ci, NBNXN_NA_SC_MAX*DIM, NBNXN_MEM_ALIGN);
#ifdef GMX_NBNXN_SIMD
snew_aligned(nbl->work->x_ci_simd_4xn, 1, NBNXN_MEM_ALIGN);
snew_aligned(nbl->work->x_ci, NBNXN_NA_SC_MAX*DIM, NBNXN_MEM_ALIGN);
#ifdef GMX_NBNXN_SIMD
snew_aligned(nbl->work->x_ci_simd_4xn, 1, NBNXN_MEM_ALIGN);
{
const nbnxn_list_work_t *work;
{
const nbnxn_list_work_t *work;
+ const nbnxn_bb_t *bb_ci;
const real *x_ci;
gmx_bool InRange;
const real *x_ci;
gmx_bool InRange;
int cj4_ind, cj_offset;
unsigned imask;
nbnxn_cj4_t *cj4;
int cj4_ind, cj_offset;
unsigned imask;
nbnxn_cj4_t *cj4;
+#ifdef NBNXN_BBXXXX
+ const float *pbb_ci;
+#else
+ const nbnxn_bb_t *bb_ci;
+#endif
const real *x_ci;
float *d2l, d2;
int w;
const real *x_ci;
float *d2l, d2;
int w;
- bb_ci = nbl->work->bb_ci;
- x_ci = nbl->work->x_ci;
+#ifdef NBNXN_BBXXXX
+ pbb_ci = nbl->work->pbb_ci;
+#else
+ bb_ci = nbl->work->bb_ci;
+#endif
+ x_ci = nbl->work->x_ci;
#ifdef NBNXN_BBXXXX
/* Determine all ci1 bb distances in one call with SSE */
#ifdef NBNXN_BBXXXX
/* Determine all ci1 bb distances in one call with SSE */
- subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
- ci1, bb_ci, d2l);
+ subc_bb_dist2_sse_xxxx(gridj->pbb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
+ ci1, pbb_ci, d2l);
*ndistc += na_c*2;
#endif
*ndistc += na_c*2;
#endif
}
/* Sets a simple list i-cell bounding box, including PBC shift */
}
/* Sets a simple list i-cell bounding box, including PBC shift */
-static void set_icell_bb_simple(const float *bb, int ci,
- real shx, real shy, real shz,
- float *bb_ci)
+static gmx_inline void set_icell_bb_simple(const nbnxn_bb_t *bb, int ci,
+ real shx, real shy, real shz,
+ nbnxn_bb_t *bb_ci)
- int ia;
-
- ia = ci*NNBSBB_B;
- bb_ci[BBL_X] = bb[ia+BBL_X] + shx;
- bb_ci[BBL_Y] = bb[ia+BBL_Y] + shy;
- bb_ci[BBL_Z] = bb[ia+BBL_Z] + shz;
- bb_ci[BBU_X] = bb[ia+BBU_X] + shx;
- bb_ci[BBU_Y] = bb[ia+BBU_Y] + shy;
- bb_ci[BBU_Z] = bb[ia+BBU_Z] + shz;
+ bb_ci->lower[BB_X] = bb[ci].lower[BB_X] + shx;
+ bb_ci->lower[BB_Y] = bb[ci].lower[BB_Y] + shy;
+ bb_ci->lower[BB_Z] = bb[ci].lower[BB_Z] + shz;
+ bb_ci->upper[BB_X] = bb[ci].upper[BB_X] + shx;
+ bb_ci->upper[BB_Y] = bb[ci].upper[BB_Y] + shy;
+ bb_ci->upper[BB_Z] = bb[ci].upper[BB_Z] + shz;
/* Sets a super-cell and sub cell bounding boxes, including PBC shift */
/* Sets a super-cell and sub cell bounding boxes, including PBC shift */
-static void set_icell_bb_supersub(const float *bb, int ci,
- real shx, real shy, real shz,
- float *bb_ci)
+static void set_icell_bbxxxx_supersub(const float *bb, int ci,
+ real shx, real shy, real shz,
+ float *bb_ci)
ia = ci*(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX;
for (m = 0; m < (GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX; m += NNBSBB_XXXX)
{
ia = ci*(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX;
for (m = 0; m < (GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX; m += NNBSBB_XXXX)
{
bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
}
}
bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
}
}
-#else
- ia = ci*GPU_NSUBCELL*NNBSBB_B;
- for (i = 0; i < GPU_NSUBCELL*NNBSBB_B; i += NNBSBB_B)
+}
+#endif
+
+/* Sets a super-cell and sub cell bounding boxes, including PBC shift */
+static void set_icell_bb_supersub(const nbnxn_bb_t *bb, int ci,
+ real shx, real shy, real shz,
+ nbnxn_bb_t *bb_ci)
+{
+ int i;
+
+ for (i = 0; i < GPU_NSUBCELL; i++)
- bb_ci[i+BBL_X] = bb[ia+i+BBL_X] + shx;
- bb_ci[i+BBL_Y] = bb[ia+i+BBL_Y] + shy;
- bb_ci[i+BBL_Z] = bb[ia+i+BBL_Z] + shz;
- bb_ci[i+BBU_X] = bb[ia+i+BBU_X] + shx;
- bb_ci[i+BBU_Y] = bb[ia+i+BBU_Y] + shy;
- bb_ci[i+BBU_Z] = bb[ia+i+BBU_Z] + shz;
+ set_icell_bb_simple(bb, ci*GPU_NSUBCELL+i,
+ shx, shy, shz,
+ &bb_ci[i]);
}
/* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
}
/* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
gmx_bool bMakeList;
real shx, shy, shz;
int conv_i, cell0_i;
gmx_bool bMakeList;
real shx, shy, shz;
int conv_i, cell0_i;
- const float *bb_i, *bbcz_i, *bbcz_j;
+ const nbnxn_bb_t *bb_i=NULL;
+#ifdef NBNXN_BBXXXX
+ const float *pbb_i=NULL;
+#endif
+ const float *bbcz_i, *bbcz_j;
const int *flags_i;
real bx0, bx1, by0, by1, bz0, bz1;
real bz1_frac;
const int *flags_i;
real bx0, bx1, by0, by1, bz0, bz1;
real bz1_frac;
+#ifdef NBNXN_BBXXXX
+ if (gridi->bSimple)
+ {
+ bb_i = gridi->bb;
+ }
+ else
+ {
+ pbb_i = gridi->pbb;
+ }
+#else
+ /* We use the normal bounding box format for both grid types */
+ bb_i = gridi->bb;
+#endif
bbcz_i = gridi->bbcz;
flags_i = gridi->flags;
}
bbcz_i = gridi->bbcz;
flags_i = gridi->flags;
}
- bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX];
+ bx1 = bb_i[ci].upper[BB_X];
- by0 = bb_i[ci*NNBSBB_B +YY] + shy;
- by1 = bb_i[ci*NNBSBB_B+NNBSBB_C+YY] + shy;
+ by0 = bb_i[ci].lower[BB_Y] + shy;
+ by1 = bb_i[ci].upper[BB_Y] + shy;
- bx0 = bb_i[ci*NNBSBB_B +XX] + shx;
- bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX] + shx;
+ bx0 = bb_i[ci].lower[BB_X] + shx;
+ bx1 = bb_i[ci].upper[BB_X] + shx;
+#ifdef NBNXN_BBXXXX
+ set_icell_bbxxxx_supersub(pbb_i, ci, shx, shy, shz,
+ nbl->work->pbb_ci);
+#else
set_icell_bb_supersub(bb_i, ci, shx, shy, shz,
nbl->work->bb_ci);
set_icell_bb_supersub(bb_i, ci, shx, shy, shz,
nbl->work->bb_ci);
}
nbs->icell_set_x(cell0_i+ci, shx, shy, shz,
}
nbs->icell_set_x(cell0_i+ci, shx, shy, shz,
cl = -1;
for (k = c0; k < c1; k++)
{
cl = -1;
for (k = c0; k < c1; k++)
{
- if (box_dist2(bx0, bx1, by0, by1, bz0, bz1,
- bb+k*NNBSBB_B) < rl2 &&
+ if (box_dist2(bx0, bx1, by0, by1, bz0, bz1, bb+k) < rl2 &&
- if (box_dist2(bx0, bx1, by0, by1, bz0, bz1,
- bb+k*NNBSBB_B) < rl2 &&
+ if (box_dist2(bx0, bx1, by0, by1, bz0, bz1, bb+k) < rl2 &&
int *ndistc)
{
const nbnxn_x_ci_simd_2xnn_t *work;
int *ndistc)
{
const nbnxn_x_ci_simd_2xnn_t *work;
+ const nbnxn_bb_t *bb_ci;
gmx_mm_pr jx_SSE, jy_SSE, jz_SSE;
gmx_mm_pr jx_SSE, jy_SSE, jz_SSE;
int *ndistc)
{
const nbnxn_x_ci_simd_4xn_t *work;
int *ndistc)
{
const nbnxn_x_ci_simd_4xn_t *work;
+ const nbnxn_bb_t *bb_ci;
gmx_mm_pr jx_SSE, jy_SSE, jz_SSE;
gmx_mm_pr jx_SSE, jy_SSE, jz_SSE;