From 472fb54cc31e44533fcf4d593d6b03ae44440517 Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Wed, 13 Mar 2019 10:47:49 +0100 Subject: [PATCH] Change nbnxm macros to constexpr Also added an inline function to clarify the code and avoid code duplication. Change-Id: If68ad4f745a9b6f1c68379af24305a47084b608b --- src/gromacs/nbnxm/grid.cpp | 50 +++++++++++++------------- src/gromacs/nbnxm/grid.h | 33 ++++++++++------- src/gromacs/nbnxm/pairlist.cpp | 61 ++++++++++++++++++-------------- src/gromacs/nbnxm/pairlistwork.h | 2 +- 4 files changed, 81 insertions(+), 65 deletions(-) diff --git a/src/gromacs/nbnxm/grid.cpp b/src/gromacs/nbnxm/grid.cpp index 9d7ae9cc84..38818258d5 100644 --- a/src/gromacs/nbnxm/grid.cpp +++ b/src/gromacs/nbnxm/grid.cpp @@ -205,7 +205,7 @@ void Grid::setDimensions(const int ddZone, else { #if NBNXN_BBXXXX - pbb_.resize(maxNumCells*c_gpuNumClusterPerCell/STRIDE_PBB*NNBSBB_XXXX); + pbb_.resize(packedBoundingBoxesIndex(maxNumCells*c_gpuNumClusterPerCell)); #else bb_.resize(maxNumCells*c_gpuNumClusterPerCell); #endif @@ -584,12 +584,12 @@ static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb) i += stride; } /* Note: possible double to float conversion here */ - bb[0*STRIDE_PBB] = R2F_D(xl); - bb[1*STRIDE_PBB] = R2F_D(yl); - bb[2*STRIDE_PBB] = R2F_D(zl); - bb[3*STRIDE_PBB] = R2F_U(xh); - bb[4*STRIDE_PBB] = R2F_U(yh); - bb[5*STRIDE_PBB] = R2F_U(zh); + bb[0*c_packedBoundingBoxesDimSize] = R2F_D(xl); + bb[1*c_packedBoundingBoxesDimSize] = R2F_D(yl); + bb[2*c_packedBoundingBoxesDimSize] = R2F_D(zl); + bb[3*c_packedBoundingBoxesDimSize] = R2F_U(xh); + bb[4*c_packedBoundingBoxesDimSize] = R2F_U(yh); + bb[5*c_packedBoundingBoxesDimSize] = R2F_U(zh); } #endif /* NBNXN_BBXXXX */ @@ -629,12 +629,12 @@ static void calc_bounding_box_xxxx_simd4(int na, const float *x, { calc_bounding_box_simd4(na, x, bb_work_aligned); - bb[0*STRIDE_PBB] = bb_work_aligned->lower.x; - bb[1*STRIDE_PBB] = bb_work_aligned->lower.y; - bb[2*STRIDE_PBB] = 
bb_work_aligned->lower.z; - bb[3*STRIDE_PBB] = bb_work_aligned->upper.x; - bb[4*STRIDE_PBB] = bb_work_aligned->upper.y; - bb[5*STRIDE_PBB] = bb_work_aligned->upper.z; + bb[0*c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.x; + bb[1*c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.y; + bb[2*c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.z; + bb[3*c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.x; + bb[4*c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.y; + bb[5*c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.z; } #endif /* NBNXN_BBXXXX */ @@ -724,16 +724,17 @@ static void print_bbsizes_supersub(FILE *fp, for (int c = 0; c < grid.numCells(); c++) { #if NBNXN_BBXXXX - for (int s = 0; s < grid.numClustersPerCell()[c]; s += STRIDE_PBB) + for (int s = 0; s < grid.numClustersPerCell()[c]; s += c_packedBoundingBoxesDimSize) { - int cs_w = (c*c_gpuNumClusterPerCell + s)/STRIDE_PBB; - for (int i = 0; i < STRIDE_PBB; i++) + int cs_w = (c*c_gpuNumClusterPerCell + s)/c_packedBoundingBoxesDimSize; + auto boundingBoxes = grid.packedBoundingBoxes().subArray(cs_w*c_packedBoundingBoxesSize, c_packedBoundingBoxesSize); + for (int i = 0; i < c_packedBoundingBoxesDimSize; i++) { for (int d = 0; d < DIM; d++) { ba[d] += - grid.packedBoundingBoxes()[cs_w*NNBSBB_XXXX + (DIM + d)*STRIDE_PBB + i] - - grid.packedBoundingBoxes()[cs_w*NNBSBB_XXXX + d *STRIDE_PBB + i]; + boundingBoxes[(DIM + d)*c_packedBoundingBoxesDimSize + i] - + boundingBoxes[(0 + d)*c_packedBoundingBoxesDimSize + i]; } } } @@ -925,10 +926,11 @@ void Grid::fillCell(const GridSetData &gridSetData, /* Store the bounding boxes in a format convenient * for SIMD4 calculations: xxxxyyyyzzzz... 
*/ - float *pbb_ptr = + const int clusterIndex = ((atomStart - cellOffset_*geometry_.numAtomsPerCell) >> geometry_.numAtomsICluster2Log); + float *pbb_ptr = pbb_.data() + - ((atomStart - cellOffset_*geometry_.numAtomsPerCell) >> (geometry_.numAtomsICluster2Log + STRIDE_PBB_2LOG))*NNBSBB_XXXX + - (((atomStart - cellOffset_*geometry_.numAtomsPerCell) >> geometry_.numAtomsICluster2Log) & (STRIDE_PBB - 1)); + packedBoundingBoxesIndex(clusterIndex) + + (clusterIndex & (c_packedBoundingBoxesDimSize - 1)); #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB if (nbat->XFormat == nbatXYZQ) @@ -946,9 +948,9 @@ void Grid::fillCell(const GridSetData &gridSetData, { fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n", atomToCluster(atomStart), - pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB], - pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB], - pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]); + pbb_ptr[0*c_packedBoundingBoxesDimSize], pbb_ptr[3*c_packedBoundingBoxesDimSize], + pbb_ptr[1*c_packedBoundingBoxesDimSize], pbb_ptr[4*c_packedBoundingBoxesDimSize], + pbb_ptr[2*c_packedBoundingBoxesDimSize], pbb_ptr[5*c_packedBoundingBoxesDimSize]); } } #endif diff --git a/src/gromacs/nbnxm/grid.h b/src/gromacs/nbnxm/grid.h index fa59bd8fc6..c16f6830e1 100644 --- a/src/gromacs/nbnxm/grid.h +++ b/src/gromacs/nbnxm/grid.h @@ -152,9 +152,9 @@ struct BoundingBox1D /*! \brief The number of bounds along one dimension of a bounding box */ static constexpr int c_numBoundingBoxBounds1D = 2; -#ifndef DOXYGEN +} // namespace Nbnxm -// TODO: Convert macros to constexpr int +#ifndef DOXYGEN /* Bounding box calculations are (currently) always in single precision, so * we only need to check for single precision support here. 
@@ -162,12 +162,8 @@ static constexpr int c_numBoundingBoxBounds1D = 2; */ #if GMX_SIMD4_HAVE_FLOAT # define NBNXN_SEARCH_BB_SIMD4 1 -/* Memory alignment in bytes as required by SIMD aligned loads/stores */ -# define NBNXN_SEARCH_BB_MEM_ALIGN (GMX_SIMD4_WIDTH*sizeof(float)) #else # define NBNXN_SEARCH_BB_SIMD4 0 -/* No alignment required, but set it so we can call the same routines */ -# define NBNXN_SEARCH_BB_MEM_ALIGN 32 #endif @@ -181,16 +177,25 @@ static constexpr int c_numBoundingBoxBounds1D = 2; # define NBNXN_SEARCH_SIMD4_FLOAT_X_BB 0 # endif -/* The packed bounding box coordinate stride is always set to 4. +/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz + * + * The packed bounding box coordinate stride is always set to 4. * With AVX we could use 8, but that turns out not to be faster. */ -# define STRIDE_PBB GMX_SIMD4_WIDTH -# define STRIDE_PBB_2LOG 2 - -/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */ # define NBNXN_BBXXXX 1 -/* Size of a quadruplet of bounding boxes, each 2 corners, stored packed */ -# define NNBSBB_XXXX (STRIDE_PBB*DIM*Nbnxm::c_numBoundingBoxBounds1D) + +//! The number of bounding boxes in a pack, also the size of a pack along one dimension +static constexpr int c_packedBoundingBoxesDimSize = GMX_SIMD4_WIDTH; + +//! Total number of corners (floats) in a pack of bounding boxes +static constexpr int c_packedBoundingBoxesSize = + c_packedBoundingBoxesDimSize*DIM*Nbnxm::c_numBoundingBoxBounds1D; + +//! Returns the starting index of the bounding box pack that contains the given cluster +static constexpr inline int packedBoundingBoxesIndex(int clusterIndex) +{ + return (clusterIndex/c_packedBoundingBoxesDimSize)*c_packedBoundingBoxesSize; +} #else /* NBNXN_SEARCH_BB_SIMD4 */ @@ -201,6 +206,8 @@ static constexpr int c_numBoundingBoxBounds1D = 2; #endif // !DOXYGEN +namespace Nbnxm +{ /*!
\internal * \brief Helper struct to pass data that is shared over all grids diff --git a/src/gromacs/nbnxm/pairlist.cpp b/src/gromacs/nbnxm/pairlist.cpp index cf0c0f2c8b..17cb35706a 100644 --- a/src/gromacs/nbnxm/pairlist.cpp +++ b/src/gromacs/nbnxm/pairlist.cpp @@ -504,16 +504,18 @@ clusterBoundingBoxDistance2_xxxx_simd4_inner(const float *bb_i, const Simd4Float yj_h, const Simd4Float zj_h) { + constexpr int stride = c_packedBoundingBoxesDimSize; + const int shi = boundingBoxStart*Nbnxm::c_numBoundingBoxBounds1D*DIM; const Simd4Float zero = setZero(); - const Simd4Float xi_l = load4(bb_i + shi + 0*STRIDE_PBB); - const Simd4Float yi_l = load4(bb_i + shi + 1*STRIDE_PBB); - const Simd4Float zi_l = load4(bb_i + shi + 2*STRIDE_PBB); - const Simd4Float xi_h = load4(bb_i + shi + 3*STRIDE_PBB); - const Simd4Float yi_h = load4(bb_i + shi + 4*STRIDE_PBB); - const Simd4Float zi_h = load4(bb_i + shi + 5*STRIDE_PBB); + const Simd4Float xi_l = load4(bb_i + shi + 0*stride); + const Simd4Float yi_l = load4(bb_i + shi + 1*stride); + const Simd4Float zi_l = load4(bb_i + shi + 2*stride); + const Simd4Float xi_h = load4(bb_i + shi + 3*stride); + const Simd4Float yi_h = load4(bb_i + shi + 4*stride); + const Simd4Float zi_h = load4(bb_i + shi + 5*stride); const Simd4Float dx_0 = xi_l - xj_h; const Simd4Float dy_0 = yi_l - yj_h; @@ -548,27 +550,29 @@ clusterBoundingBoxDistance2_xxxx_simd4(const float *bb_j, const float *bb_i, float *d2) { + constexpr int stride = c_packedBoundingBoxesDimSize; + // TODO: During SIMDv2 transition only some archs use namespace (remove when done) using namespace gmx; - const Simd4Float xj_l = Simd4Float(bb_j[0*STRIDE_PBB]); - const Simd4Float yj_l = Simd4Float(bb_j[1*STRIDE_PBB]); - const Simd4Float zj_l = Simd4Float(bb_j[2*STRIDE_PBB]); - const Simd4Float xj_h = Simd4Float(bb_j[3*STRIDE_PBB]); - const Simd4Float yj_h = Simd4Float(bb_j[4*STRIDE_PBB]); - const Simd4Float zj_h = Simd4Float(bb_j[5*STRIDE_PBB]); + const Simd4Float xj_l = 
Simd4Float(bb_j[0*stride]); + const Simd4Float yj_l = Simd4Float(bb_j[1*stride]); + const Simd4Float zj_l = Simd4Float(bb_j[2*stride]); + const Simd4Float xj_h = Simd4Float(bb_j[3*stride]); + const Simd4Float yj_h = Simd4Float(bb_j[4*stride]); + const Simd4Float zj_h = Simd4Float(bb_j[5*stride]); - /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB. + /* Here we "loop" over si (0,stride) from 0 to nsi with step stride. * But as we know the number of iterations is 1 or 2, we unroll manually. */ clusterBoundingBoxDistance2_xxxx_simd4_inner<0>(bb_i, d2, xj_l, yj_l, zj_l, xj_h, yj_h, zj_h); - if (STRIDE_PBB < nsi) + if (stride < nsi) { - clusterBoundingBoxDistance2_xxxx_simd4_inner(bb_i, d2, - xj_l, yj_l, zj_l, - xj_h, yj_h, zj_h); + clusterBoundingBoxDistance2_xxxx_simd4_inner(bb_i, d2, + xj_l, yj_l, zj_l, + xj_h, yj_h, zj_h); } } @@ -1239,7 +1243,8 @@ static void make_cluster_list_supersub(const Grid &iGrid, #if NBNXN_BBXXXX /* Determine all ci1 bb distances in one call with SIMD4 */ - clusterBoundingBoxDistance2_xxxx_simd4(jGrid.packedBoundingBoxes().data() + (cj >> STRIDE_PBB_2LOG)*NNBSBB_XXXX + (cj & (STRIDE_PBB-1)), + const int offset = packedBoundingBoxesIndex(cj) + (cj & (c_packedBoundingBoxesDimSize - 1)); + clusterBoundingBoxDistance2_xxxx_simd4(jGrid.packedBoundingBoxes().data() + offset, ci1, pbb_ci, d2l); *numDistanceChecks += c_nbnxnGpuClusterSize*2; #endif @@ -2365,17 +2370,19 @@ static void set_icell_bbxxxx_supersub(gmx::ArrayRef bb, real shx, real shy, real shz, float *bb_ci) { - int ia = ci*(c_gpuNumClusterPerCell >> STRIDE_PBB_2LOG)*NNBSBB_XXXX; - for (int m = 0; m < (c_gpuNumClusterPerCell >> STRIDE_PBB_2LOG)*NNBSBB_XXXX; m += NNBSBB_XXXX) + constexpr int cellBBStride = packedBoundingBoxesIndex(c_gpuNumClusterPerCell); + constexpr int pbbStride = c_packedBoundingBoxesDimSize; + const int ia = ci*cellBBStride; + for (int m = 0; m < cellBBStride; m += c_packedBoundingBoxesSize) { - for (int i = 0; i < STRIDE_PBB; i++) + for 
(int i = 0; i < pbbStride; i++) { - bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx; - bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy; - bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz; - bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx; - bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy; - bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz; + bb_ci[m + 0*pbbStride + i] = bb[ia + m + 0*pbbStride + i] + shx; + bb_ci[m + 1*pbbStride + i] = bb[ia + m + 1*pbbStride + i] + shy; + bb_ci[m + 2*pbbStride + i] = bb[ia + m + 2*pbbStride + i] + shz; + bb_ci[m + 3*pbbStride + i] = bb[ia + m + 3*pbbStride + i] + shx; + bb_ci[m + 4*pbbStride + i] = bb[ia + m + 4*pbbStride + i] + shy; + bb_ci[m + 5*pbbStride + i] = bb[ia + m + 5*pbbStride + i] + shz; } } } diff --git a/src/gromacs/nbnxm/pairlistwork.h b/src/gromacs/nbnxm/pairlistwork.h index 35de7acdb6..8716632010 100644 --- a/src/gromacs/nbnxm/pairlistwork.h +++ b/src/gromacs/nbnxm/pairlistwork.h @@ -101,7 +101,7 @@ struct NbnxnPairlistGpuWork ISuperClusterData() : bb(c_gpuNumClusterPerCell), #if NBNXN_SEARCH_BB_SIMD4 - bbPacked(c_gpuNumClusterPerCell/STRIDE_PBB*NNBSBB_XXXX), + bbPacked(c_gpuNumClusterPerCell/c_packedBoundingBoxesDimSize*c_packedBoundingBoxesSize), #endif x(c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize*DIM), xSimd(c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize*DIM) -- 2.22.0