else
{
#if NBNXN_BBXXXX
- pbb_.resize(maxNumCells*c_gpuNumClusterPerCell/STRIDE_PBB*NNBSBB_XXXX);
+ pbb_.resize(packedBoundingBoxesIndex(maxNumCells*c_gpuNumClusterPerCell));
#else
bb_.resize(maxNumCells*c_gpuNumClusterPerCell);
#endif
i += stride;
}
/* Note: possible double to float conversion here */
- bb[0*STRIDE_PBB] = R2F_D(xl);
- bb[1*STRIDE_PBB] = R2F_D(yl);
- bb[2*STRIDE_PBB] = R2F_D(zl);
- bb[3*STRIDE_PBB] = R2F_U(xh);
- bb[4*STRIDE_PBB] = R2F_U(yh);
- bb[5*STRIDE_PBB] = R2F_U(zh);
+ bb[0*c_packedBoundingBoxesDimSize] = R2F_D(xl);
+ bb[1*c_packedBoundingBoxesDimSize] = R2F_D(yl);
+ bb[2*c_packedBoundingBoxesDimSize] = R2F_D(zl);
+ bb[3*c_packedBoundingBoxesDimSize] = R2F_U(xh);
+ bb[4*c_packedBoundingBoxesDimSize] = R2F_U(yh);
+ bb[5*c_packedBoundingBoxesDimSize] = R2F_U(zh);
}
#endif /* NBNXN_BBXXXX */
{
calc_bounding_box_simd4(na, x, bb_work_aligned);
- bb[0*STRIDE_PBB] = bb_work_aligned->lower.x;
- bb[1*STRIDE_PBB] = bb_work_aligned->lower.y;
- bb[2*STRIDE_PBB] = bb_work_aligned->lower.z;
- bb[3*STRIDE_PBB] = bb_work_aligned->upper.x;
- bb[4*STRIDE_PBB] = bb_work_aligned->upper.y;
- bb[5*STRIDE_PBB] = bb_work_aligned->upper.z;
+ bb[0*c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.x;
+ bb[1*c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.y;
+ bb[2*c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.z;
+ bb[3*c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.x;
+ bb[4*c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.y;
+ bb[5*c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.z;
}
#endif /* NBNXN_BBXXXX */
for (int c = 0; c < grid.numCells(); c++)
{
#if NBNXN_BBXXXX
- for (int s = 0; s < grid.numClustersPerCell()[c]; s += STRIDE_PBB)
+ for (int s = 0; s < grid.numClustersPerCell()[c]; s += c_packedBoundingBoxesDimSize)
{
- int cs_w = (c*c_gpuNumClusterPerCell + s)/STRIDE_PBB;
- for (int i = 0; i < STRIDE_PBB; i++)
+ int cs_w = (c*c_gpuNumClusterPerCell + s)/c_packedBoundingBoxesDimSize;
+ auto boundingBoxes = grid.packedBoundingBoxes().subArray(cs_w*c_packedBoundingBoxesSize, c_packedBoundingBoxesSize);
+ for (int i = 0; i < c_packedBoundingBoxesDimSize; i++)
{
for (int d = 0; d < DIM; d++)
{
ba[d] +=
- grid.packedBoundingBoxes()[cs_w*NNBSBB_XXXX + (DIM + d)*STRIDE_PBB + i] -
- grid.packedBoundingBoxes()[cs_w*NNBSBB_XXXX + d *STRIDE_PBB + i];
+ boundingBoxes[(DIM + d)*c_packedBoundingBoxesDimSize + i] -
+ boundingBoxes[(0 + d)*c_packedBoundingBoxesDimSize + i];
}
}
}
/* Store the bounding boxes in a format convenient
* for SIMD4 calculations: xxxxyyyyzzzz...
*/
- float *pbb_ptr =
+ const int clusterIndex = ((atomStart - cellOffset_*geometry_.numAtomsPerCell) >> geometry_.numAtomsICluster2Log);
+ float *pbb_ptr =
pbb_.data() +
- ((atomStart - cellOffset_*geometry_.numAtomsPerCell) >> (geometry_.numAtomsICluster2Log + STRIDE_PBB_2LOG))*NNBSBB_XXXX +
- (((atomStart - cellOffset_*geometry_.numAtomsPerCell) >> geometry_.numAtomsICluster2Log) & (STRIDE_PBB - 1));
+ packedBoundingBoxesIndex(clusterIndex) +
+ (clusterIndex & (c_packedBoundingBoxesDimSize - 1));
#if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
if (nbat->XFormat == nbatXYZQ)
{
fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
atomToCluster(atomStart),
- pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
- pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
- pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
+ pbb_ptr[0*c_packedBoundingBoxesDimSize], pbb_ptr[3*c_packedBoundingBoxesDimSize],
+ pbb_ptr[1*c_packedBoundingBoxesDimSize], pbb_ptr[4*c_packedBoundingBoxesDimSize],
+ pbb_ptr[2*c_packedBoundingBoxesDimSize], pbb_ptr[5*c_packedBoundingBoxesDimSize]);
}
}
#endif
/*! \brief The number of bounds along one dimension of a bounding box */
static constexpr int c_numBoundingBoxBounds1D = 2;
-#ifndef DOXYGEN
+} // namespace Nbnxm
-// TODO: Convert macros to constexpr int
+#ifndef DOXYGEN
/* Bounding box calculations are (currently) always in single precision, so
* we only need to check for single precision support here.
*/
#if GMX_SIMD4_HAVE_FLOAT
# define NBNXN_SEARCH_BB_SIMD4 1
-/* Memory alignment in bytes as required by SIMD aligned loads/stores */
-# define NBNXN_SEARCH_BB_MEM_ALIGN (GMX_SIMD4_WIDTH*sizeof(float))
#else
# define NBNXN_SEARCH_BB_SIMD4 0
-/* No alignment required, but set it so we can call the same routines */
-# define NBNXN_SEARCH_BB_MEM_ALIGN 32
#endif
# define NBNXN_SEARCH_SIMD4_FLOAT_X_BB 0
# endif
-/* The packed bounding box coordinate stride is always set to 4.
+/* Store bounding box corners as quadruplets: xxxxyyyyzzzz
+ *
+ * The packed bounding box coordinate stride is always set to 4.
* With AVX we could use 8, but that turns out not to be faster.
*/
-# define STRIDE_PBB GMX_SIMD4_WIDTH
-# define STRIDE_PBB_2LOG 2
-
-/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
# define NBNXN_BBXXXX 1
-/* Size of a quadruplet of bounding boxes, each 2 corners, stored packed */
-# define NNBSBB_XXXX (STRIDE_PBB*DIM*Nbnxm::c_numBoundingBoxBounds1D)
+
+//! The number of bounding boxes in a pack, also the size of a pack along one dimension
+static constexpr int c_packedBoundingBoxesDimSize = GMX_SIMD4_WIDTH;
+
+//! Total number of corner coordinates (floats) in a pack of bounding boxes
+static constexpr int c_packedBoundingBoxesSize =
+ c_packedBoundingBoxesDimSize*DIM*Nbnxm::c_numBoundingBoxBounds1D;
+
+//! Returns the starting index, in floats, of the bounding box pack that contains the given cluster
+static constexpr inline int packedBoundingBoxesIndex(int clusterIndex)
+{
+ return (clusterIndex/c_packedBoundingBoxesDimSize)*c_packedBoundingBoxesSize;
+}
#else /* NBNXN_SEARCH_BB_SIMD4 */
#endif // !DOXYGEN
+namespace Nbnxm
+{
/*! \internal
* \brief Helper struct to pass data that is shared over all grids
const Simd4Float yj_h,
const Simd4Float zj_h)
{
+ constexpr int stride = c_packedBoundingBoxesDimSize;
+
const int shi = boundingBoxStart*Nbnxm::c_numBoundingBoxBounds1D*DIM;
const Simd4Float zero = setZero();
- const Simd4Float xi_l = load4(bb_i + shi + 0*STRIDE_PBB);
- const Simd4Float yi_l = load4(bb_i + shi + 1*STRIDE_PBB);
- const Simd4Float zi_l = load4(bb_i + shi + 2*STRIDE_PBB);
- const Simd4Float xi_h = load4(bb_i + shi + 3*STRIDE_PBB);
- const Simd4Float yi_h = load4(bb_i + shi + 4*STRIDE_PBB);
- const Simd4Float zi_h = load4(bb_i + shi + 5*STRIDE_PBB);
+ const Simd4Float xi_l = load4(bb_i + shi + 0*stride);
+ const Simd4Float yi_l = load4(bb_i + shi + 1*stride);
+ const Simd4Float zi_l = load4(bb_i + shi + 2*stride);
+ const Simd4Float xi_h = load4(bb_i + shi + 3*stride);
+ const Simd4Float yi_h = load4(bb_i + shi + 4*stride);
+ const Simd4Float zi_h = load4(bb_i + shi + 5*stride);
const Simd4Float dx_0 = xi_l - xj_h;
const Simd4Float dy_0 = yi_l - yj_h;
const float *bb_i,
float *d2)
{
+ constexpr int stride = c_packedBoundingBoxesDimSize;
+
// TODO: During SIMDv2 transition only some archs use namespace (remove when done)
using namespace gmx;
- const Simd4Float xj_l = Simd4Float(bb_j[0*STRIDE_PBB]);
- const Simd4Float yj_l = Simd4Float(bb_j[1*STRIDE_PBB]);
- const Simd4Float zj_l = Simd4Float(bb_j[2*STRIDE_PBB]);
- const Simd4Float xj_h = Simd4Float(bb_j[3*STRIDE_PBB]);
- const Simd4Float yj_h = Simd4Float(bb_j[4*STRIDE_PBB]);
- const Simd4Float zj_h = Simd4Float(bb_j[5*STRIDE_PBB]);
+ const Simd4Float xj_l = Simd4Float(bb_j[0*stride]);
+ const Simd4Float yj_l = Simd4Float(bb_j[1*stride]);
+ const Simd4Float zj_l = Simd4Float(bb_j[2*stride]);
+ const Simd4Float xj_h = Simd4Float(bb_j[3*stride]);
+ const Simd4Float yj_h = Simd4Float(bb_j[4*stride]);
+ const Simd4Float zj_h = Simd4Float(bb_j[5*stride]);
- /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
+ /* Here we "loop" over si (0,stride) from 0 to nsi with step stride.
* But as we know the number of iterations is 1 or 2, we unroll manually.
*/
clusterBoundingBoxDistance2_xxxx_simd4_inner<0>(bb_i, d2,
xj_l, yj_l, zj_l,
xj_h, yj_h, zj_h);
- if (STRIDE_PBB < nsi)
+ if (stride < nsi)
{
- clusterBoundingBoxDistance2_xxxx_simd4_inner<STRIDE_PBB>(bb_i, d2,
- xj_l, yj_l, zj_l,
- xj_h, yj_h, zj_h);
+ clusterBoundingBoxDistance2_xxxx_simd4_inner<stride>(bb_i, d2,
+ xj_l, yj_l, zj_l,
+ xj_h, yj_h, zj_h);
}
}
#if NBNXN_BBXXXX
/* Determine all ci1 bb distances in one call with SIMD4 */
- clusterBoundingBoxDistance2_xxxx_simd4(jGrid.packedBoundingBoxes().data() + (cj >> STRIDE_PBB_2LOG)*NNBSBB_XXXX + (cj & (STRIDE_PBB-1)),
+ const int offset = packedBoundingBoxesIndex(cj) + (cj & (c_packedBoundingBoxesDimSize - 1));
+ clusterBoundingBoxDistance2_xxxx_simd4(jGrid.packedBoundingBoxes().data() + offset,
ci1, pbb_ci, d2l);
*numDistanceChecks += c_nbnxnGpuClusterSize*2;
#endif
real shx, real shy, real shz,
float *bb_ci)
{
- int ia = ci*(c_gpuNumClusterPerCell >> STRIDE_PBB_2LOG)*NNBSBB_XXXX;
- for (int m = 0; m < (c_gpuNumClusterPerCell >> STRIDE_PBB_2LOG)*NNBSBB_XXXX; m += NNBSBB_XXXX)
+ constexpr int cellBBStride = packedBoundingBoxesIndex(c_gpuNumClusterPerCell);
+ constexpr int pbbStride = c_packedBoundingBoxesDimSize;
+ const int ia = ci*cellBBStride;
+ for (int m = 0; m < cellBBStride; m += c_packedBoundingBoxesSize)
{
- for (int i = 0; i < STRIDE_PBB; i++)
+ for (int i = 0; i < pbbStride; i++)
{
- bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx;
- bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy;
- bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz;
- bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx;
- bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy;
- bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
+ bb_ci[m + 0*pbbStride + i] = bb[ia + m + 0*pbbStride + i] + shx;
+ bb_ci[m + 1*pbbStride + i] = bb[ia + m + 1*pbbStride + i] + shy;
+ bb_ci[m + 2*pbbStride + i] = bb[ia + m + 2*pbbStride + i] + shz;
+ bb_ci[m + 3*pbbStride + i] = bb[ia + m + 3*pbbStride + i] + shx;
+ bb_ci[m + 4*pbbStride + i] = bb[ia + m + 4*pbbStride + i] + shy;
+ bb_ci[m + 5*pbbStride + i] = bb[ia + m + 5*pbbStride + i] + shz;
}
}
}
ISuperClusterData() :
bb(c_gpuNumClusterPerCell),
#if NBNXN_SEARCH_BB_SIMD4
- bbPacked(c_gpuNumClusterPerCell/STRIDE_PBB*NNBSBB_XXXX),
+ bbPacked(c_gpuNumClusterPerCell/c_packedBoundingBoxesDimSize*c_packedBoundingBoxesSize),
#endif
x(c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize*DIM),
xSimd(c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize*DIM)