From ebb29284f3586ec3281e9ada00b0237c9d765f2d Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Fri, 1 Mar 2019 13:01:30 +0100 Subject: [PATCH] Convert nbnxm bounding box to a struct Change-Id: Ia432e1f242b90ec0dbe01c528125bb7fa955bd5f --- src/gromacs/nbnxm/grid.cpp | 154 +++++++++++++++------------------ src/gromacs/nbnxm/grid.h | 84 +++++++++++++----- src/gromacs/nbnxm/pairlist.cpp | 58 ++++++------- 3 files changed, 164 insertions(+), 132 deletions(-) diff --git a/src/gromacs/nbnxm/grid.cpp b/src/gromacs/nbnxm/grid.cpp index e971e0f34b..473b9e1eba 100644 --- a/src/gromacs/nbnxm/grid.cpp +++ b/src/gromacs/nbnxm/grid.cpp @@ -462,12 +462,12 @@ static void calc_bounding_box(int na, int stride, const real *x, i += stride; } /* Note: possible double to float conversion here */ - bb->lower[BB_X] = R2F_D(xl); - bb->lower[BB_Y] = R2F_D(yl); - bb->lower[BB_Z] = R2F_D(zl); - bb->upper[BB_X] = R2F_U(xh); - bb->upper[BB_Y] = R2F_U(yh); - bb->upper[BB_Z] = R2F_U(zh); + bb->lower.x = R2F_D(xl); + bb->lower.y = R2F_D(yl); + bb->lower.z = R2F_D(zl); + bb->upper.x = R2F_U(xh); + bb->upper.y = R2F_U(yh); + bb->upper.z = R2F_U(zh); } /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */ @@ -492,12 +492,12 @@ static void calc_bounding_box_x_x4(int na, const real *x, zh = std::max(zh, x[j+ZZ*c_packX4]); } /* Note: possible double to float conversion here */ - bb->lower[BB_X] = R2F_D(xl); - bb->lower[BB_Y] = R2F_D(yl); - bb->lower[BB_Z] = R2F_D(zl); - bb->upper[BB_X] = R2F_U(xh); - bb->upper[BB_Y] = R2F_U(yh); - bb->upper[BB_Z] = R2F_U(zh); + bb->lower.x = R2F_D(xl); + bb->lower.y = R2F_D(yl); + bb->lower.z = R2F_D(zl); + bb->upper.x = R2F_U(xh); + bb->upper.y = R2F_U(yh); + bb->upper.z = R2F_U(zh); } /*! \brief Computes the bounding box for na coordinates, bb order xyz0 */ @@ -522,12 +522,12 @@ static void calc_bounding_box_x_x8(int na, const real *x, zh = std::max(zh, x[j+ZZ*c_packX8]); } /* Note: possible double to float conversion here */ - bb->lower[BB_X] = R2F_D(xl); - bb->lower[BB_Y] = R2F_D(yl); - bb->lower[BB_Z] = R2F_D(zl); - bb->upper[BB_X] = R2F_U(xh); - bb->upper[BB_Y] = R2F_U(yh); - bb->upper[BB_Z] = R2F_U(zh); + bb->lower.x = R2F_D(xl); + bb->lower.y = R2F_D(yl); + bb->lower.z = R2F_D(zl); + bb->upper.x = R2F_U(xh); + bb->upper.y = R2F_U(yh); + bb->upper.z = R2F_U(zh); } /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */ @@ -550,25 +550,24 @@ gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real *x, * so we don't need to treat special cases in the rest of the code. */ #if NBNXN_SEARCH_BB_SIMD4 - store4(&bbj[1].lower[0], load4(&bbj[0].lower[0])); - store4(&bbj[1].upper[0], load4(&bbj[0].upper[0])); + store4(bbj[1].lower.ptr(), load4(bbj[0].lower.ptr())); + store4(bbj[1].upper.ptr(), load4(bbj[0].upper.ptr())); #else bbj[1] = bbj[0]; #endif } #if NBNXN_SEARCH_BB_SIMD4 - store4(&bb->lower[0], min(load4(&bbj[0].lower[0]), load4(&bbj[1].lower[0]))); - store4(&bb->upper[0], max(load4(&bbj[0].upper[0]), load4(&bbj[1].upper[0]))); + store4(bb->lower.ptr(), + min(load4(bbj[0].lower.ptr()), load4(bbj[1].lower.ptr()))); + store4(bb->upper.ptr(), + max(load4(bbj[0].upper.ptr()), load4(bbj[1].upper.ptr()))); #else { - int i; - - for (i = 0; i < NNBSBB_C; i++) - { - bb->lower[i] = std::min(bbj[0].lower[i], bbj[1].lower[i]); - bb->upper[i] = std::max(bbj[0].upper[i], bbj[1].upper[i]); - } + bb->lower = BoundingBox::Corner::min(bbj[0].lower, + bbj[1].lower); + bb->upper = BoundingBox::Corner::max(bbj[0].upper, + bbj[1].upper); } #endif } @@ -619,21 +618,21 @@ static void calc_bounding_box_simd4(int na, const float *x, // TODO: During SIMDv2 transition only some archs use namespace (remove when done) using namespace gmx; - Simd4Float bb_0_S, bb_1_S; - Simd4Float x_S; + static_assert(sizeof(BoundingBox::Corner) == GMX_SIMD4_WIDTH*sizeof(float), + "The Corner struct should hold exactly 4 floats"); - bb_0_S = load4(x); - bb_1_S = bb_0_S; + Simd4Float bb_0_S = load4(x); + Simd4Float bb_1_S = bb_0_S; for (int i = 1; i < na; i++) { - x_S = load4(x+i*NNBSBB_C); - bb_0_S = min(bb_0_S, x_S); - bb_1_S = max(bb_1_S, x_S); + Simd4Float x_S = load4(x + i*GMX_SIMD4_WIDTH); + bb_0_S = min(bb_0_S, x_S); + bb_1_S = max(bb_1_S, x_S); } - store4(&bb->lower[0], bb_0_S); - store4(&bb->upper[0], bb_1_S); + store4(bb->lower.ptr(), bb_0_S); + store4(bb->upper.ptr(), bb_1_S); } /*! \brief Computes the bounding box for na coordinates in order xyz?, bb order xxxxyyyyzzzz */ @@ -643,12 +642,12 @@ static void calc_bounding_box_xxxx_simd4(int na, const float *x, { calc_bounding_box_simd4(na, x, bb_work_aligned); - bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X]; - bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y]; - bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z]; - bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X]; - bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y]; - bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z]; + bb[0*STRIDE_PBB] = bb_work_aligned->lower.x; + bb[1*STRIDE_PBB] = bb_work_aligned->lower.y; + bb[2*STRIDE_PBB] = bb_work_aligned->lower.z; + bb[3*STRIDE_PBB] = bb_work_aligned->upper.x; + bb[4*STRIDE_PBB] = bb_work_aligned->upper.y; + bb[5*STRIDE_PBB] = bb_work_aligned->upper.z; } #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */ @@ -674,30 +673,24 @@ static void combine_bounding_box_pairs(const Grid &grid, #if NBNXN_SEARCH_BB_SIMD4 Simd4Float min_S, max_S; - min_S = min(load4(&bb[c2*2+0].lower[0]), - load4(&bb[c2*2+1].lower[0])); - max_S = max(load4(&bb[c2*2+0].upper[0]), - load4(&bb[c2*2+1].upper[0])); - store4(&bbj[c2].lower[0], min_S); - store4(&bbj[c2].upper[0], max_S); + min_S = min(load4(bb[c2*2+0].lower.ptr()), + load4(bb[c2*2+1].lower.ptr())); + max_S = max(load4(bb[c2*2+0].upper.ptr()), + load4(bb[c2*2+1].upper.ptr())); + store4(bbj[c2].lower.ptr(), min_S); + store4(bbj[c2].upper.ptr(), max_S); #else - for (int j = 0; j < NNBSBB_C; j++) - { - bbj[c2].lower[j] = std::min(bb[c2*2 + 0].lower[j], - bb[c2*2 + 1].lower[j]); - bbj[c2].upper[j] = std::max(bb[c2*2 + 0].upper[j], - bb[c2*2 + 1].upper[j]); - } + bbj[c2].lower = BoundingBox::Corner::min(bb[c2*2 + 0].lower, + bb[c2*2 + 1].lower); + bbj[c2].upper = BoundingBox::Corner::max(bb[c2*2 + 0].upper, + bb[c2*2 + 1].upper); #endif } if (((grid.numAtomsInColumn(i) + 3) >> 2) & 1) { /* The bb count in this column is odd: duplicate the last bb */ - for (int j = 0; j < NNBSBB_C; j++) - { - bbj[c2].lower[j] = bb[c2*2].lower[j]; - bbj[c2].upper[j] = bb[c2*2].upper[j]; - } + bbj[c2].lower = bb[c2*2].lower; + bbj[c2].upper = bb[c2*2].upper; } } } @@ -707,15 +700,13 @@ static void combine_bounding_box_pairs(const Grid &grid, static void print_bbsizes_simple(FILE *fp, const Grid &grid) { - dvec ba; - - clear_dvec(ba); + dvec ba = { 0 }; for (int c = 0; c < grid.numCells(); c++) { - for (int d = 0; d < DIM; d++) - { - ba[d] += grid.iBoundingBoxes()[c].upper[d] - grid.iBoundingBoxes()[c].lower[d]; - } + const BoundingBox &bb = grid.iBoundingBoxes()[c]; + ba[XX] += bb.upper.x - bb.lower.x; + ba[YY] += bb.upper.y - bb.lower.y; + ba[ZZ] += bb.upper.z - bb.lower.z; } dsvmul(1.0/grid.numCells(), ba, ba); @@ -760,11 +751,10 @@ static void print_bbsizes_supersub(FILE *fp, #else for (int s = 0; s < grid.numClustersPerCell()[c]; s++) { - int cs = c*c_gpuNumClusterPerCell + s; - for (int d = 0; d < DIM; d++) - { - ba[d] += grid.iBoundingBoxes()[cs].upper[d] - grid.iBoundingBoxes()[cs].lower[d]; - } + const BoundingBox &bb = grid.iBoundingBoxes()[c*c_gpuNumClusterPerCell + s]; + ba[XX] += bb.upper.x - bb.lower.x; + ba[YY] += bb.upper.y - bb.lower.y; + ba[ZZ] += bb.upper.z - bb.lower.z; } #endif ns += grid.numClustersPerCell()[c]; @@ -983,12 +973,12 @@ void Grid::fillCell(nbnxn_search *nbs, int bbo = atomToCluster(atomStart - cellOffset_*geometry_.numAtomsPerCell); fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n", atomToCluster(atomStart), - bb_[bbo].lower[BB_X], - bb_[bbo].lower[BB_Y], - bb_[bbo].lower[BB_Z], - bb_[bbo].upper[BB_X], - bb_[bbo].upper[BB_Y], - bb_[bbo].upper[BB_Z]); + bb_[bbo].lower.x, + bb_[bbo].lower.y, + bb_[bbo].lower.z, + bb_[bbo].upper.x, + bb_[bbo].upper.y, + bb_[bbo].upper.z); } } } @@ -1049,8 +1039,8 @@ void Grid::sortColumnsCpuGeometry(nbnxn_search *nbs, { cellFilled = cell; } - bbcz_[cell*NNBSBB_D ] = bb_[cellFilled].lower[BB_Z]; - bbcz_[cell*NNBSBB_D + 1] = bb_[cellFilled].upper[BB_Z]; + bbcz_[cell*NNBSBB_D ] = bb_[cellFilled].lower.z; + bbcz_[cell*NNBSBB_D + 1] = bb_[cellFilled].upper.z; } /* Set the unused atom indices to -1 */ diff --git a/src/gromacs/nbnxm/grid.h b/src/gromacs/nbnxm/grid.h index 3b2b324214..35a5266cc4 100644 --- a/src/gromacs/nbnxm/grid.h +++ b/src/gromacs/nbnxm/grid.h @@ -75,30 +75,69 @@ class UpdateGroupsCog; namespace Nbnxm { -#ifndef DOXYGEN +/*! \internal + * \brief Bounding box for a nbnxm atom cluster + * + * \note Should be aligned in memory to enable 4-wide SIMD operations. + */ +struct BoundingBox +{ + /*! \internal + * \brief Corner for the bounding box, padded with one element to enable 4-wide SIMD operations + */ + struct Corner + { + //! Returns a corner with the minimum coordinates along each dimension + static Corner min(const Corner &c1, + const Corner &c2) + { + Corner cMin; -// TODO: Convert macros to constexpr int and replace BB_? indexing by struct with x,y,z + cMin.x = std::min(c1.x, c2.x); + cMin.y = std::min(c1.y, c2.y); + cMin.z = std::min(c1.z, c2.z); + /* This value of the padding is irrelevant, as long as it + * is initialized. We use min to allow auto-vectorization. + */ + cMin.padding = std::min(c1.padding, c2.padding); -/* Pair search box lower and upper corner in x,y,z. - * Store this in 4 iso 3 reals, which is useful with 4-wide SIMD. - * To avoid complicating the code we also use 4 without 4-wide SIMD. - */ -#define NNBSBB_C 4 -/* Pair search box lower and upper bound in z only. */ -#define NNBSBB_D 2 -/* Pair search box lower and upper corner x,y,z indices, entry 3 is unused */ -#define BB_X 0 -#define BB_Y 1 -#define BB_Z 2 + return cMin; + } -#endif // !DOXYGEN + //! Returns a corner with the maximum coordinates along each dimension + static Corner max(const Corner &c1, + const Corner &c2) + { + Corner cMax; + cMax.x = std::max(c1.x, c2.x); + cMax.y = std::max(c1.y, c2.y); + cMax.z = std::max(c1.z, c2.z); + cMax.padding = std::max(c1.padding, c2.padding); -/* Bounding box for a nbnxn atom cluster */ -struct BoundingBox -{ - float lower[NNBSBB_C]; - float upper[NNBSBB_C]; + return cMax; + } + + //! Returns a pointer for SIMD loading of a Corner object + const float *ptr() const + { + return &x; + } + + //! Returns a pointer for SIMD storing of a Corner object + float *ptr() + { + return &x; + } + + float x; //!< x coordinate + float y; //!< y coordinate + float z; //!< z coordinate + float padding; //!< padding, unused, but should be set to avoid operations on unitialized data + }; + + Corner lower; //!< lower, along x and y and z, corner + Corner upper; //!< upper, along x and y and z, corner }; @@ -106,6 +145,9 @@ struct BoundingBox // TODO: Convert macros to constexpr int +/* Pair search box lower and upper bound in z only. */ +#define NNBSBB_D 2 + /* Bounding box calculations are (currently) always in single precision, so * we only need to check for single precision support here. * This uses less (cache-)memory and SIMD is faster, at least on x86. @@ -139,8 +181,8 @@ struct BoundingBox /* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */ # define NBNXN_BBXXXX 1 -/* Size of bounding box corners quadruplet */ -# define NNBSBB_XXXX (NNBSBB_D*DIM*STRIDE_PBB) +/* Size of a quadruplet of bounding boxes, each 2 corners, stored packed */ +# define NNBSBB_XXXX (2*DIM*STRIDE_PBB) #else /* NBNXN_SEARCH_BB_SIMD4 */ diff --git a/src/gromacs/nbnxm/pairlist.cpp b/src/gromacs/nbnxm/pairlist.cpp index 8e53d05ac0..8fc0dbf55f 100644 --- a/src/gromacs/nbnxm/pairlist.cpp +++ b/src/gromacs/nbnxm/pairlist.cpp @@ -420,20 +420,20 @@ static void get_cell_range(real b0, real b1, d2 = 0; - dl = bx0 - bb->upper[BB_X]; - dh = bb->lower[BB_X] - bx1; + dl = bx0 - bb->upper.x; + dh = bb->lower.x - bx1; dm = std::max(dl, dh); dm0 = std::max(dm, 0.0f); d2 += dm0*dm0; - dl = by0 - bb->upper[BB_Y]; - dh = bb->lower[BB_Y] - by1; + dl = by0 - bb->upper.y; + dh = bb->lower.y - by1; dm = std::max(dl, dh); dm0 = std::max(dm, 0.0f); d2 += dm0*dm0; - dl = bz0 - bb->upper[BB_Z]; - dh = bb->lower[BB_Z] - bz1; + dl = bz0 - bb->upper.z; + dh = bb->lower.z - bz1; dm = std::max(dl, dh); dm0 = std::max(dm, 0.0f); d2 += dm0*dm0; @@ -448,26 +448,26 @@ static float subc_bb_dist2(int si, int csj, gmx::ArrayRef bb_j_all) { - const nbnxn_bb_t *bb_i = bb_i_ci + si; - const nbnxn_bb_t *bb_j = bb_j_all.data() + csj; + const nbnxn_bb_t &bb_i = bb_i_ci[si]; + const nbnxn_bb_t &bb_j = bb_j_all[csj]; float d2 = 0; float dl, dh, dm, dm0; - dl = bb_i->lower[BB_X] - bb_j->upper[BB_X]; - dh = bb_j->lower[BB_X] - bb_i->upper[BB_X]; + dl = bb_i.lower.x - bb_j.upper.x; + dh = bb_j.lower.x - bb_i.upper.x; dm = std::max(dl, dh); dm0 = std::max(dm, 0.0f); d2 += dm0*dm0; - dl = bb_i->lower[BB_Y] - bb_j->upper[BB_Y]; - dh = bb_j->lower[BB_Y] - bb_i->upper[BB_Y]; + dl = bb_i.lower.y - bb_j.upper.y; + dh = bb_j.lower.y - bb_i.upper.y; dm = std::max(dl, dh); dm0 = std::max(dm, 0.0f); d2 += dm0*dm0; - dl = bb_i->lower[BB_Z] - bb_j->upper[BB_Z]; - dh = bb_j->lower[BB_Z] - bb_i->upper[BB_Z]; + dl = bb_i.lower.z - bb_j.upper.z; + dh = bb_j.lower.z - bb_i.upper.z; dm = std::max(dl, dh); dm0 = std::max(dm, 0.0f); d2 += dm0*dm0; @@ -493,10 +493,10 @@ static float subc_bb_dist2_simd4(int si, Simd4Float dm_S; Simd4Float dm0_S; - bb_i_S0 = load4(&bb_i_ci[si].lower[0]); - bb_i_S1 = load4(&bb_i_ci[si].upper[0]); - bb_j_S0 = load4(&bb_j_all[csj].lower[0]); - bb_j_S1 = load4(&bb_j_all[csj].upper[0]); + bb_i_S0 = load4(bb_i_ci[si].lower.ptr()); + bb_i_S1 = load4(bb_i_ci[si].upper.ptr()); + bb_j_S0 = load4(bb_j_all[csj].lower.ptr()); + bb_j_S1 = load4(bb_j_all[csj].upper.ptr()); dl_S = bb_i_S0 - bb_j_S1; dh_S = bb_j_S0 - bb_i_S1; @@ -2356,12 +2356,12 @@ static inline void set_icell_bb_simple(gmx::ArrayRef bb, real shx, real shy, real shz, nbnxn_bb_t *bb_ci) { - bb_ci->lower[BB_X] = bb[ci].lower[BB_X] + shx; - bb_ci->lower[BB_Y] = bb[ci].lower[BB_Y] + shy; - bb_ci->lower[BB_Z] = bb[ci].lower[BB_Z] + shz; - bb_ci->upper[BB_X] = bb[ci].upper[BB_X] + shx; - bb_ci->upper[BB_Y] = bb[ci].upper[BB_Y] + shy; - bb_ci->upper[BB_Z] = bb[ci].upper[BB_Z] + shz; + bb_ci->lower.x = bb[ci].lower.x + shx; + bb_ci->lower.y = bb[ci].lower.y + shy; + bb_ci->lower.z = bb[ci].lower.z + shz; + bb_ci->upper.x = bb[ci].upper.x + shx; + bb_ci->upper.y = bb[ci].upper.y + shy; + bb_ci->upper.z = bb[ci].upper.z + shz; } /* Sets a simple list i-cell bounding box, including PBC shift */ @@ -3407,7 +3407,7 @@ static void nbnxn_make_pairlist_part(const nbnxn_search *nbs, { if (bSimple) { - bx1 = bb_i[ci].upper[BB_X]; + bx1 = bb_i[ci].upper.x; } else { @@ -3467,8 +3467,8 @@ static void nbnxn_make_pairlist_part(const nbnxn_search *nbs, if (bSimple) { - by0 = bb_i[ci].lower[BB_Y] + shy; - by1 = bb_i[ci].upper[BB_Y] + shy; + by0 = bb_i[ci].lower.y + shy; + by1 = bb_i[ci].upper.y + shy; } else { @@ -3511,8 +3511,8 @@ static void nbnxn_make_pairlist_part(const nbnxn_search *nbs, if (bSimple) { - bx0 = bb_i[ci].lower[BB_X] + shx; - bx1 = bb_i[ci].upper[BB_X] + shx; + bx0 = bb_i[ci].lower.x + shx; + bx1 = bb_i[ci].upper.x + shx; } else { -- 2.22.0