Require padded atom data for PME GPU
author    Mark Abraham <mark.j.abraham@gmail.com>
Fri, 6 Mar 2020 19:43:01 +0000 (20:43 +0100)
committer Artem Zhmurov <zhmurov@gmail.com>
Fri, 13 Mar 2020 09:35:09 +0000 (10:35 +0100)
The unpadded case doesn't work and is hard-coded not to run, so we
should remove it.

The constant c_usePadding has been true ever since
379b2954fd3efbd1b2c724964932c2ef03078939 introduced it in 2016. This
piece of unnecessary complexity probably dates from the introduction
of the optimization that pads the atom data arrays so that
device-side loads don't have to check boundaries. That optimization
is unlikely to ever be reverted, and the unpadded path is unused,
untested, and poorly understood. (It is also currently broken: if
c_usePadding were set to false, the alignment getter would return
zero, which would then be used as a divisor.)
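
For context, a minimal sketch (not part of the patch; the function
name is simplified) of why padded buffers let the kernels drop the
global bounds check when staging atom data into shared memory,
mirroring pme_gpu_stage_atom_data() after this change:

    /* Sketch only: because the trailing region of gm_source is padded (and
     * zero-filled for charges/coordinates), no comparison against the real
     * atom count is needed; only the per-block local bound remains. */
    template<typename T, int atomsPerBlock, int dataCountPerAtom>
    __device__ __forceinline__ void stageAtomData(T* __restrict__ sm_destination,
                                                  const T* __restrict__ gm_source)
    {
        const int blockIndex  = blockIdx.y * gridDim.x + blockIdx.x;
        const int localIndex  = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
        const int globalIndex = blockIndex * atomsPerBlock * dataCountPerAtom + localIndex;
        if (localIndex < atomsPerBlock * dataCountPerAtom) // no check against nAtoms
        {
            sm_destination[localIndex] = gm_source[globalIndex];
        }
    }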

In some places the words alignment and padding were used
interchangeably, which was confusing and is now fixed. The code now
refers to a block size that sets the minimum divisor for the memory
allocation. "Padding size" was a misnomer, because the size of the
padding could be read as the number of extra array elements.

The resulting code is a bit simpler and easier to understand.
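
As a rough illustration of the new block-size contract (the helper
below is hypothetical; the patch inlines this arithmetic in
pme_gpu_reinit_atoms()):

    // Sketch: the block size is the minimum divisor of the atom-data allocation.
    constexpr int c_pmeAtomDataBlockSize = 64; // from pme_gpu_constants.h

    static int paddedAtomCount(int nAtoms)
    {
        return ((nAtoms + c_pmeAtomDataBlockSize - 1) / c_pmeAtomDataBlockSize)
               * c_pmeAtomDataBlockSize;
    }

    // e.g. paddedAtomCount(1000) == 1024; the 24 trailing charges are cleared
    // to zero on the device so the unguarded loads read harmless values.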

Change-Id: I51a28c1c722e3ee5a9f3e1787892d81cb9df00fb

21 files changed:
src/gromacs/ewald/CMakeLists.txt
src/gromacs/ewald/pme.h
src/gromacs/ewald/pme_gather.clh
src/gromacs/ewald/pme_gather.cu
src/gromacs/ewald/pme_gpu.cpp
src/gromacs/ewald/pme_gpu_calculate_splines.clh
src/gromacs/ewald/pme_gpu_calculate_splines.cuh
src/gromacs/ewald/pme_gpu_constants.h
src/gromacs/ewald/pme_gpu_internal.cpp
src/gromacs/ewald/pme_gpu_internal.h
src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp
src/gromacs/ewald/pme_gpu_types_host.h
src/gromacs/ewald/pme_only.cpp
src/gromacs/ewald/pme_spread.clh
src/gromacs/ewald/pme_spread.cu
src/gromacs/ewald/tests/pmetestcommon.cpp
src/gromacs/mdrun/runner.cpp
src/gromacs/mdtypes/state_propagator_data_gpu.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp

index 7583e1db50edab86143cede8f1a6b9aabf3570d3..a1d3bcf2f7cb7262a1917f74911b1ae808543be4 100644 (file)
@@ -124,7 +124,6 @@ foreach(VENDOR AMD NVIDIA INTEL)
         -Dorder=4
         -DthreadsPerAtom=16
         -Dc_pmeMaxUnitcellShift=2
-        -Dc_usePadding=true
         -Dc_skipNeutralAtoms=false
         -Dc_virialAndEnergyCount=7
         -Dc_spreadWorkGroupSize=${SPREAD_WG_SIZE}
index 8aa2c079b130b2d9047cbb6e21025f0dab37fe0c..bfc79b88e99bfa5ef037fdf64e519b6da385f539 100644 (file)
@@ -285,11 +285,14 @@ inline bool pme_gpu_task_enabled(const gmx_pme_t* pme)
     return (pme != nullptr) && (pme_run_mode(pme) != PmeRunMode::CPU);
 }
 
-/*! \brief Returns the size of the padding needed by GPU version of PME in the coordinates array.
+/*! \brief Returns the block size requirement
+ *
+ * The GPU version of PME requires that the coordinates array have a
+ * size divisible by the returned number.
  *
  * \param[in]  pme  The PME data structure.
  */
-GPU_FUNC_QUALIFIER int pme_gpu_get_padding_size(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
+GPU_FUNC_QUALIFIER int pme_gpu_get_block_size(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
         GPU_FUNC_TERM_WITH_RETURN(0);
 
 // The following functions are all the PME GPU entry points,
index 10b14dbe09553b783030bb3d8ba235a8ce59d663..2d510207632716cbfcc3ce576279a00f7d3058b3 100644 (file)
@@ -246,9 +246,7 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
     const int localGridlineIndicesIndex = threadLocalId;
     const int globalGridlineIndicesIndex =
             (int)get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex;
-    const int globalCheckIndices =
-            pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
-    if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
+    if (localGridlineIndicesIndex < gridlineIndicesSize)
     {
         sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
         assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
@@ -256,9 +254,7 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
     /* Staging the spline parameters, DIM * order * atomsPerBlock threads */
     const int localSplineParamsIndex = threadLocalId;
     const int globalSplineParamsIndex = (int)get_group_id(XX) * splineParamsSize + localSplineParamsIndex;
-    const int globalCheckSplineParams = pme_gpu_check_atom_data_index(
-            globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
-    if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
+    if (localSplineParamsIndex < splineParamsSize)
     {
         sm_splineParams[localSplineParamsIndex].x = gm_theta[globalSplineParamsIndex];
         sm_splineParams[localSplineParamsIndex].y = gm_dtheta[globalSplineParamsIndex];
@@ -271,10 +267,9 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
     float fy = 0.0F;
     float fz = 0.0F;
 
-    const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
     const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);
 
-    if (chargeCheck & globalCheck)
+    if (chargeCheck)
     {
         const int nx  = kernelParams.grid.realGridSize[XX];
         const int ny  = kernelParams.grid.realGridSize[YY];
@@ -339,8 +334,7 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
     /* Calculating the final forces with no component branching, atomsPerBlock threads */
     const int forceIndexLocal  = threadLocalId;
     const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
-    const int calcIndexCheck = pme_gpu_check_atom_data_index(forceIndexGlobal, kernelParams.atoms.nAtoms);
-    if ((forceIndexLocal < atomsPerBlock) & calcIndexCheck)
+    if (forceIndexLocal < atomsPerBlock)
     {
         const float3 atomForces     = vload3(forceIndexLocal, sm_forces);
         const float  negCoefficient = -gm_coefficients[forceIndexGlobal];
@@ -376,13 +370,8 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
         {
             const int outputIndexLocal  = i * iterThreads + threadLocalId;
             const int outputIndexGlobal = (int)get_group_id(XX) * blockForcesSize + outputIndexLocal;
-            const int globalOutputCheck =
-                    pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
-            if (globalOutputCheck)
-            {
-                const float outputForceComponent = sm_forces[outputIndexLocal];
-                gm_forces[outputIndexGlobal]     = outputForceComponent;
-            }
+            const float outputForceComponent = sm_forces[outputIndexLocal];
+            gm_forces[outputIndexGlobal]     = outputForceComponent;
         }
     }
 }
index e3d97b98445510ed3efcf586309199ad31421bd9..e6648086a44702535243e70df346683876c0fd0e 100644 (file)
@@ -287,9 +287,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
         /* Read splines */
         const int localGridlineIndicesIndex = threadLocalId;
         const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
-        const int globalCheckIndices         = pme_gpu_check_atom_data_index(
-                globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
-        if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
+        if (localGridlineIndicesIndex < gridlineIndicesSize)
         {
             sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
             assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
@@ -306,9 +304,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
                     threadLocalId
                     + i * threadLocalIdMax; /* i will always be zero for order*order threads per atom */
             int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
-            int globalCheckSplineParams = pme_gpu_check_atom_data_index(
-                    globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
-            if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
+            if (localSplineParamsIndex < splineParamsSize)
             {
                 sm_theta[localSplineParamsIndex]  = gm_theta[globalSplineParamsIndex];
                 sm_dtheta[localSplineParamsIndex] = gm_dtheta[globalSplineParamsIndex];
@@ -329,10 +325,10 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
             // Coordinates
             __shared__ float3 sm_coordinates[atomsPerBlock];
             /* Staging coefficients/charges */
-            pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, gm_coefficients);
+            pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients, gm_coefficients);
 
             /* Staging coordinates */
-            pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(kernelParams, sm_coordinates, gm_coordinates);
+            pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(sm_coordinates, gm_coordinates);
             __syncthreads();
             atomX      = sm_coordinates[atomIndexLocal];
             atomCharge = sm_coefficients[atomIndexLocal];
@@ -350,10 +346,9 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
     float fy = 0.0f;
     float fz = 0.0f;
 
-    const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
     const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);
 
-    if (chargeCheck & globalCheck)
+    if (chargeCheck)
     {
         const int nx  = kernelParams.grid.realGridSize[XX];
         const int ny  = kernelParams.grid.realGridSize[YY];
@@ -424,8 +419,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
     /* Calculating the final forces with no component branching, atomsPerBlock threads */
     const int forceIndexLocal  = threadLocalId;
     const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
-    const int calcIndexCheck = pme_gpu_check_atom_data_index(forceIndexGlobal, kernelParams.atoms.nAtoms);
-    if ((forceIndexLocal < atomsPerBlock) & calcIndexCheck)
+    if (forceIndexLocal < atomsPerBlock)
     {
         const float3 atomForces     = sm_forces[forceIndexLocal];
         const float  negCoefficient = -gm_coefficients[forceIndexGlobal];
@@ -453,15 +447,10 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
 #pragma unroll
         for (int i = 0; i < numIter; i++)
         {
-            int       outputIndexLocal  = i * iterThreads + threadLocalId;
-            int       outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
-            const int globalOutputCheck =
-                    pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
-            if (globalOutputCheck)
-            {
-                const float outputForceComponent = ((float*)sm_forces)[outputIndexLocal];
-                gm_forces[outputIndexGlobal]     = outputForceComponent;
-            }
+            int         outputIndexLocal     = i * iterThreads + threadLocalId;
+            int         outputIndexGlobal    = blockIndex * blockForcesSize + outputIndexLocal;
+            const float outputForceComponent = ((float*)sm_forces)[outputIndexLocal];
+            gm_forces[outputIndexGlobal]     = outputForceComponent;
         }
     }
 }
index cbcab23b1a16dce5c7b5a0e9ab6e62d1009df7f1..91596e77d483be31bca315568ae91da3671ea087 100644 (file)
@@ -99,7 +99,7 @@ void pme_gpu_get_timings(const gmx_pme_t* pme, gmx_wallclock_gpu_pme_t* timings)
     }
 }
 
-int pme_gpu_get_padding_size(const gmx_pme_t* pme)
+int pme_gpu_get_block_size(const gmx_pme_t* pme)
 {
 
     if (!pme || !pme_gpu_active(pme))
@@ -108,7 +108,7 @@ int pme_gpu_get_padding_size(const gmx_pme_t* pme)
     }
     else
     {
-        return pme_gpu_get_atom_data_alignment(pme->gpu);
+        return pme_gpu_get_atom_data_block_size();
     }
 }
 
index 8fb056b12b0075fff44cec5b51517aa94e3456fb..6485a62a43e5f81cd11fdb990bda3aeb0531d689 100644 (file)
@@ -41,7 +41,7 @@
  * Instead of templated parameters this file expects following defines during compilation:
  * - order - PME interpolation order;
  * - atomsPerWarp - number of atoms processed by a warp (fixed for spread and gather kernels to be the same);
- * - c_usePadding and c_skipNeutralAtoms - same as in pme_gpu_constants.h.
+ * - c_skipNeutralAtoms - same as in pme_gpu_constants.h.
  *
  * \author Aleksei Iupinov <a.yupinov@gmail.com>
  * \ingroup module_ewald
@@ -88,20 +88,6 @@ inline int getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex
     return (paramIndexBase + (splineIndex * DIM + dimIndex) * atomsPerWarp);
 }
 
-/*! \brief
- * A function for checking the global atom data indices against the atom data array sizes.
- *
- * \param[in] nAtomData            The atom data array element count.
- * \returns                        Non-0 if index is within bounds (or PME data padding is enabled), 0 otherwise.
- *
- * This is called from the spline_and_spread and gather PME kernels.
- * The goal is to isolate the global range checks, and allow avoiding them with c_usePadding being true.
- */
-inline int pme_gpu_check_atom_data_index(const size_t atomDataIndex, const size_t nAtomData)
-{
-    return c_usePadding ? 1 : (atomDataIndex < nAtomData);
-}
-
 /*! \brief
  * A function for optionally skipping neutral charges, depending on c_skipNeutralAtoms.
  *
index e305359158bc6aff18b520dc110dc16adc5c7ca4..08ab82fb51c57b578db5330a6c907ad9ee01cc21 100644 (file)
@@ -95,21 +95,6 @@ int __device__ __forceinline__ getSplineParamIndex(int paramIndexBase, int dimIn
     return (paramIndexBase + (splineIndex * DIM + dimIndex) * atomsPerWarp);
 }
 
-/*! \internal \brief
- * An inline CUDA function for checking the global atom data indices against the atom data array sizes.
- *
- * \param[in] atomDataIndex        The atom data index.
- * \param[in] nAtomData            The atom data array element count.
- * \returns                        Non-0 if index is within bounds (or PME data padding is enabled), 0 otherwise.
- *
- * This is called from the spline_and_spread and gather PME kernels.
- * The goal is to isolate the global range checks, and allow avoiding them with c_usePadding enabled.
- */
-int __device__ __forceinline__ pme_gpu_check_atom_data_index(const int atomDataIndex, const int nAtomData)
-{
-    return c_usePadding ? 1 : (atomDataIndex < nAtomData);
-}
-
 /*! \internal \brief
  * An inline CUDA function for skipping the zero-charge atoms.
  *
@@ -155,28 +140,23 @@ __device__ inline void assertIsFinite(T arg)
  * General purpose function for loading atom-related data from global to shared memory.
  *
  * \tparam[in] T                 Data type (float/int/...)
- * \tparam[in] atomsPerBlock     Number of atoms processed by a block - should be accounted for in the size of the shared memory array.
- * \tparam[in] dataCountPerAtom  Number of data elements per single atom (e.g. DIM for an rvec coordinates array).
- * \param[in]  kernelParams      Input PME CUDA data in constant memory.
+ * \tparam[in] atomsPerBlock     Number of atoms processed by a block - should be
+ *                               accounted for in the size of the shared memory array.
+ * \tparam[in] dataCountPerAtom  Number of data elements per single atom (e.g. DIM for
+ *                               an rvec coordinates array).
  * \param[out] sm_destination    Shared memory array for output.
  * \param[in]  gm_source         Global memory array for input.
  */
 template<typename T, const int atomsPerBlock, const int dataCountPerAtom>
-__device__ __forceinline__ void pme_gpu_stage_atom_data(const PmeGpuCudaKernelParams kernelParams,
-                                                        T* __restrict__ sm_destination,
+__device__ __forceinline__ void pme_gpu_stage_atom_data(T* __restrict__ sm_destination,
                                                         const T* __restrict__ gm_source)
 {
-    static_assert(c_usePadding,
-                  "With padding disabled, index checking should be fixed to account for spline "
-                  "theta/dtheta pr-warp alignment");
     const int blockIndex       = blockIdx.y * gridDim.x + blockIdx.x;
     const int threadLocalIndex = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x;
     const int localIndex       = threadLocalIndex;
     const int globalIndexBase = blockIndex * atomsPerBlock * dataCountPerAtom;
     const int globalIndex     = globalIndexBase + localIndex;
-    const int globalCheck =
-            pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
-    if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
+    if (localIndex < atomsPerBlock * dataCountPerAtom)
     {
         assertIsFinite(gm_source[globalIndex]);
         sm_destination[localIndex] = gm_source[globalIndex];
@@ -230,8 +210,6 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k
     /* Atom index w.r.t. block/shared memory */
     const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
 
-    /* Atom index w.r.t. global memory */
-    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
     /* Spline contribution index in one dimension */
     const int threadLocalIdXY = (threadIdx.y * blockDim.x) + threadIdx.x;
     const int orderIndex      = threadLocalIdXY / DIM;
@@ -244,10 +222,9 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k
     float splineData[order];
 
     const int localCheck = (dimIndex < DIM) && (orderIndex < 1);
-    const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
 
     /* we have 4 threads per atom, but can only use 3 here for the dimensions */
-    if (localCheck && globalCheck)
+    if (localCheck)
     {
         /* Indices interpolation */
 
index 35299ca6b1198e7e17da5e02039663132a1cd4b5..200fafc43196e47d8209bbae3c425f87f2bfb782 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 
 /* General settings for PME GPU behaviour */
 
-/*! \brief
- * false: The atom data GPU buffers are sized precisely according to the number of atoms.
- *        (Except GPU spline data layout which is regardless intertwined for 2 atoms per warp).
- *        The atom index checks in the spread/gather code potentially hinder the performance.
- * true:  The atom data GPU buffers are padded with zeroes so that the possible number of atoms
- *        fitting in is divisible by c_pmeAtomDataAlignment.
- *        The atom index checks are not performed. There should be a performance win, but how big is it, remains to be seen.
- *        Additional cudaMemsetAsync calls are done occasionally (only charges/coordinates; spline data is always recalculated now).
- * \todo Estimate performance differences
- */
-constexpr bool c_usePadding = true;
-
 /*! \brief
  * false: Atoms with zero charges are processed by PME. Could introduce some overhead.
  * true:  Atoms with zero charges are not processed by PME. Adds branching to the spread/gather.
@@ -144,11 +132,11 @@ constexpr int c_pmeSpreadGatherMinWarpSize4ThPerAtom = c_pmeSpreadGatherThreadsP
  * Atom data alignment (in terms of number of atoms).
  * This is the least common multiple of number of atoms processed by
  * a single block/workgroup of the spread and gather kernels.
- * If the GPU atom data buffers are padded (c_usePadding == true),
- * Then the numbers of atoms which would fit in the padded GPU buffers have to be divisible by this.
- * There are debug asserts for this divisibility in pme_gpu_spread() and pme_gpu_gather().
+ * The GPU atom data buffers must be padded, which means that
+ * the numbers of atoms used for determining the size of the memory
+ * allocation must be divisible by this.
  */
-constexpr int c_pmeAtomDataAlignment = 64;
+constexpr int c_pmeAtomDataBlockSize = 64;
 
 /*
  * The execution widths for PME GPU kernels, used both on host and device for correct scheduling.
index ae308bf57066aa91853a777d46b6bf96c4881710..8c61eac86dc245ffb93469da79e5009361ab1dac 100644 (file)
@@ -108,17 +108,9 @@ static PmeGpuKernelParamsBase* pme_gpu_get_kernel_params_base_ptr(const PmeGpu*
     return kernelParamsPtr;
 }
 
-int pme_gpu_get_atom_data_alignment(const PmeGpu* /*unused*/)
+int pme_gpu_get_atom_data_block_size()
 {
-    // TODO: this can be simplified, as c_pmeAtomDataAlignment is now constant
-    if (c_usePadding)
-    {
-        return c_pmeAtomDataAlignment;
-    }
-    else
-    {
-        return 0;
-    }
+    return c_pmeAtomDataBlockSize;
 }
 
 int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu)
@@ -244,15 +236,13 @@ void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_
     copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients,
                        const_cast<float*>(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms,
                        pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
-    if (c_usePadding)
+
+    const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms;
+    const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex;
+    if (paddingCount > 0)
     {
-        const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms;
-        const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex;
-        if (paddingCount > 0)
-        {
-            clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex,
-                                   paddingCount, pmeGpu->archSpecific->pmeStream_);
-        }
+        clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex,
+                               paddingCount, pmeGpu->archSpecific->pmeStream_);
     }
 }
 
@@ -263,10 +253,8 @@ void pme_gpu_free_coefficients(const PmeGpu* pmeGpu)
 
 void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu)
 {
-    const int    order        = pmeGpu->common->pme_order;
-    const int    alignment    = pme_gpu_get_atoms_per_warp(pmeGpu);
-    const size_t nAtomsPadded = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
-    const int    newSplineDataSize = DIM * order * nAtomsPadded;
+    const int order             = pmeGpu->common->pme_order;
+    const int newSplineDataSize = DIM * order * pmeGpu->nAtomsAlloc;
     GMX_ASSERT(newSplineDataSize > 0, "Bad number of atoms in PME GPU");
     /* Two arrays of the same size */
     const bool shouldRealloc        = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize);
@@ -436,9 +424,7 @@ void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid)
 
 void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu)
 {
-    const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
-    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
-    const size_t splinesCount    = DIM * nAtomsPadded * pmeGpu->common->pme_order;
+    const size_t splinesCount    = DIM * pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order;
     auto*        kernelParamsPtr = pmeGpu->kernelParams.get();
     copyFromDeviceBuffer(pmeGpu->staging.h_dtheta, &kernelParamsPtr->atoms.d_dtheta, 0, splinesCount,
                          pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
@@ -451,22 +437,19 @@ void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu)
 
 void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu)
 {
-    const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
-    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
-    const size_t splinesCount    = DIM * nAtomsPadded * pmeGpu->common->pme_order;
+    const size_t splinesCount    = DIM * pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order;
     auto*        kernelParamsPtr = pmeGpu->kernelParams.get();
-    if (c_usePadding)
-    {
-        // TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway
-        clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0,
-                               pmeGpu->nAtomsAlloc * DIM, pmeGpu->archSpecific->pmeStream_);
-        clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0,
-                               pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
-                               pmeGpu->archSpecific->pmeStream_);
-        clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0,
-                               pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
-                               pmeGpu->archSpecific->pmeStream_);
-    }
+
+    // TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway
+    clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0, pmeGpu->nAtomsAlloc * DIM,
+                           pmeGpu->archSpecific->pmeStream_);
+    clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0,
+                           pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
+                           pmeGpu->archSpecific->pmeStream_);
+    clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0,
+                           pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
+                           pmeGpu->archSpecific->pmeStream_);
+
     copyToDeviceBuffer(&kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, 0, splinesCount,
                        pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
     copyToDeviceBuffer(&kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, 0, splinesCount,
@@ -954,12 +937,10 @@ void pme_gpu_reinit_atoms(PmeGpu* pmeGpu, const int nAtoms, const real* charges)
 {
     auto* kernelParamsPtr         = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
     kernelParamsPtr->atoms.nAtoms = nAtoms;
-    const int alignment           = pme_gpu_get_atom_data_alignment(pmeGpu);
-    pmeGpu->nAtomsPadded          = ((nAtoms + alignment - 1) / alignment) * alignment;
-    const int  nAtomsAlloc        = c_usePadding ? pmeGpu->nAtomsPadded : nAtoms;
-    const bool haveToRealloc =
-            (pmeGpu->nAtomsAlloc < nAtomsAlloc); /* This check might be redundant, but is logical */
-    pmeGpu->nAtomsAlloc = nAtomsAlloc;
+    const int  block_size         = pme_gpu_get_atom_data_block_size();
+    const int  nAtomsNewPadded    = ((nAtoms + block_size - 1) / block_size) * block_size;
+    const bool haveToRealloc      = (pmeGpu->nAtomsAlloc < nAtomsNewPadded);
+    pmeGpu->nAtomsAlloc           = nAtomsNewPadded;
 
 #if GMX_DOUBLE
     GMX_RELEASE_ASSERT(false, "Only single precision supported");
@@ -1159,7 +1140,7 @@ void pme_gpu_spread(const PmeGpu*         pmeGpu,
     // TODO: test varying block sizes on modern arch-s as well
     // TODO: also consider using cudaFuncSetCacheConfig() for preferring shared memory on older architectures
     //(for spline data mostly)
-    GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock),
+    GMX_ASSERT(!(c_pmeAtomDataBlockSize % atomsPerBlock),
                "inconsistent atom data padding vs. spreading block size");
 
     // Ensure that coordinates are ready on the device before launching spread;
@@ -1173,7 +1154,7 @@ void pme_gpu_spread(const PmeGpu*         pmeGpu,
         xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_);
     }
 
-    const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
+    const int blockCount = pmeGpu->nAtomsAlloc / atomsPerBlock;
     auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);
 
     KernelLaunchConfig config;
@@ -1407,10 +1388,10 @@ void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid)
     const int atomsPerBlock = useOrderThreadsPerAtom ? blockSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom
                                                      : blockSize / c_pmeSpreadGatherThreadsPerAtom;
 
-    GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock),
+    GMX_ASSERT(!(c_pmeAtomDataBlockSize % atomsPerBlock),
                "inconsistent atom data padding vs. gathering block size");
 
-    const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
+    const int blockCount = pmeGpu->nAtomsAlloc / atomsPerBlock;
     auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);
 
     const int order = pmeGpu->common->pme_order;
index b515e3b222d5e2520c31280bf6fb341166480465..93ffa77416a4a3c3ad475c577512db7b4a8c4680 100644 (file)
@@ -88,13 +88,15 @@ enum class GridOrdering
 };
 
 /*! \libinternal \brief
- * Returns the number of atoms per chunk in the atom charges/coordinates data layout.
- * Depends on CUDA-specific block sizes, needed for the atom data padding.
+ * Returns the block size requirement
  *
- * \param[in] pmeGpu            The PME GPU structure.
- * \returns   Number of atoms in a single GPU atom data chunk.
+ * The GPU version of PME requires that the coordinates array have a
+ * size divisible by the returned number.
+ *
+ * \returns Number of atoms in a single GPU atom data chunk, which
+ * determines a minimum divisor of the size of the memory allocated.
  */
-int pme_gpu_get_atom_data_alignment(const PmeGpu* pmeGpu);
+int pme_gpu_get_atom_data_block_size();
 
 /*! \libinternal \brief
  * Returns the number of atoms per chunk in the atom spline theta/dtheta data layout.
index 1fa443ee4e916ffd6a00a15b290a0af2aa5bb007..5e82fec6dfa1d58e08105e8b1cedc58c7307e8ae 100644 (file)
@@ -127,7 +127,6 @@ void PmeGpuProgramImpl::compileKernels(const DeviceInformation& deviceInfo)
                 // forwarding from pme_grid.h, used for spline computation table sizes only
                 "-Dc_pmeMaxUnitcellShift=%f "
                 // forwarding PME behavior constants from pme_gpu_constants.h
-                "-Dc_usePadding=%d "
                 "-Dc_skipNeutralAtoms=%d "
                 "-Dc_virialAndEnergyCount=%d "
                 // forwarding kernel work sizes
@@ -139,9 +138,9 @@ void PmeGpuProgramImpl::compileKernels(const DeviceInformation& deviceInfo)
                 // decomposition parameter placeholders
                 "-DwrapX=true -DwrapY=true ",
                 warpSize, c_pmeGpuOrder, c_pmeSpreadGatherThreadsPerAtom,
-                static_cast<float>(c_pmeMaxUnitcellShift), static_cast<int>(c_usePadding),
-                static_cast<int>(c_skipNeutralAtoms), c_virialAndEnergyCount, spreadWorkGroupSize,
-                solveMaxWorkGroupSize, gatherWorkGroupSize, DIM, XX, YY, ZZ);
+                static_cast<float>(c_pmeMaxUnitcellShift), static_cast<int>(c_skipNeutralAtoms),
+                c_virialAndEnergyCount, spreadWorkGroupSize, solveMaxWorkGroupSize,
+                gatherWorkGroupSize, DIM, XX, YY, ZZ);
         try
         {
             /* TODO when we have a proper MPI-aware logging module,
index acdf24bf6d622edbb82a21dd2f244da9fcb91b4a..481c3a302d8151f53f3b613099ff5cfe8637bf52 100644 (file)
@@ -152,18 +152,13 @@ struct PmeGpu
     PmeGpuStaging staging;
 
     /*! \brief Number of local atoms, padded to be divisible by c_pmeAtomDataAlignment.
-     * Used for kernel scheduling.
-     * kernelParams.atoms.nAtoms is the actual atom count to be used for data copying.
-     * TODO: this and the next member represent a memory allocation/padding properties -
-     * what a container type should do ideally.
-     */
-    int nAtomsPadded;
-    /*! \brief Number of local atoms, padded to be divisible by c_pmeAtomDataAlignment
-     * if c_usePadding is true.
+     *
      * Used only as a basic size for almost all the atom data allocations
      * (spline parameter data is also aligned by PME_SPREADGATHER_PARTICLES_PER_WARP).
-     * This should be the same as (c_usePadding ? nAtomsPadded : kernelParams.atoms.nAtoms).
      * kernelParams.atoms.nAtoms is the actual atom count to be used for most data copying.
+     *
+     * TODO: memory allocation/padding properties should be handled by
+     * something like a container
      */
     int nAtomsAlloc;
 
index fe51deb5fc4dc322b51db151006f6ad4e1eeadc9..b85629b28af6ef4a630573a5b37466e4b1ac062f 100644 (file)
@@ -645,9 +645,9 @@ int gmx_pmeonly(struct gmx_pme_t*         pme,
                 "Device context can not be nullptr when building GPU propagator data object.");
         // TODO: Special PME-only constructor is used here. There is no mechanism to prevent from using the other constructor here.
         //       This should be made safer.
-        stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
-                &deviceStream, *deviceContext, GpuApiCallBehavior::Async,
-                pme_gpu_get_padding_size(pme), wcycle);
+        stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(&deviceStream, *deviceContext,
+                                                                 GpuApiCallBehavior::Async,
+                                                                 pme_gpu_get_block_size(pme), wcycle);
     }
 
     clear_nrnb(mynrnb);
index 478bc0acdf9ff74cdaaa14382c70e7d3ab3772d6..be9bcbbb03f1715ffce9ba01b577f7244dede4fd 100644 (file)
 /*! \brief
  * General purpose function for loading atom-related data from global to shared memory.
  *
- * \param[in]  kernelParams      Input PME GPU data in constant memory.
  * \param[out] sm_destination    Local memory array for output.
  * \param[in]  gm_source         Global memory array for input.
  * \param[in] dataCountPerAtom   Number of data elements per single atom (e.g. DIM for an rvec coordinates array).
  *
  */
-inline void pme_gpu_stage_atom_data(const struct PmeOpenCLKernelParams kernelParams,
-                                    __local float* __restrict__ sm_destination,
+inline void pme_gpu_stage_atom_data(__local float* __restrict__ sm_destination,
                                     __global const float* __restrict__ gm_source,
                                     const int dataCountPerAtom)
 {
@@ -92,9 +90,7 @@ inline void pme_gpu_stage_atom_data(const struct PmeOpenCLKernelParams kernelPar
     const int localIndex      = threadLocalIndex;
     const int globalIndexBase = (int)get_group_id(XX) * atomsPerBlock * dataCountPerAtom;
     const int globalIndex     = globalIndexBase + localIndex;
-    const int globalCheck =
-            pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
-    if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
+    if (localIndex < atomsPerBlock * dataCountPerAtom)
     {
         assert(isfinite(float(gm_source[globalIndex])));
         sm_destination[localIndex] = gm_source[globalIndex];
@@ -147,8 +143,6 @@ gmx_opencl_inline void calculate_splines(const struct PmeOpenCLKernelParams kern
     /* Atom index w.r.t. block/shared memory */
     const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
 
-    /* Atom index w.r.t. global memory */
-    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
     /* Spline contribution index in one dimension */
     const int orderIndex = threadWarpIndex / (atomsPerWarp * DIM);
     /* Dimension index */
@@ -179,9 +173,8 @@ gmx_opencl_inline void calculate_splines(const struct PmeOpenCLKernelParams kern
 #    define SPLINE_DATA(i) (*SPLINE_DATA_PTR(i))
 
     const int localCheck = (dimIndex < DIM) && (orderIndex < (PME_GPU_PARALLEL_SPLINE ? order : 1));
-    const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
 
-    if (localCheck && globalCheck)
+    if (localCheck)
     {
         /* Indices interpolation */
         if (orderIndex == 0)
@@ -350,12 +343,10 @@ gmx_opencl_inline void spread_charges(const struct PmeOpenCLKernelParams kernelP
     const int offy = 0;
     const int offz = 0;
 
-    const int atomIndexLocal  = get_local_id(ZZ);
-    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+    const int atomIndexLocal = get_local_id(ZZ);
 
-    const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
     const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]);
-    if (chargeCheck & globalCheck)
+    if (chargeCheck)
     {
         // Spline Y/Z coordinates
         const int ithy   = get_local_id(YY);
@@ -445,12 +436,12 @@ __attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void
     const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock;
 
     /* Staging coefficients/charges for both spline and spread */
-    pme_gpu_stage_atom_data(kernelParams, sm_coefficients, gm_coefficients, 1);
+    pme_gpu_stage_atom_data(sm_coefficients, gm_coefficients, 1);
 
     if (computeSplines)
     {
         /* Staging coordinates */
-        pme_gpu_stage_atom_data(kernelParams, sm_coordinates, gm_coordinates, DIM);
+        pme_gpu_stage_atom_data(sm_coordinates, gm_coordinates, DIM);
 
         barrier(CLK_LOCAL_MEM_FENCE);
         calculate_splines(kernelParams, atomIndexOffset, sm_coordinates, sm_coefficients, sm_theta,
@@ -470,9 +461,9 @@ __attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void
          * as in after running the spline kernel)
          */
         /* Spline data - only thetas (dthetas will only be needed in gather) */
-        pme_gpu_stage_atom_data(kernelParams, sm_theta, gm_theta, DIM * order);
+        pme_gpu_stage_atom_data(sm_theta, gm_theta, DIM * order);
         /* Gridline indices - they're actually int and not float, but C99 is angry about overloads */
-        pme_gpu_stage_atom_data(kernelParams, (__local float*)sm_gridlineIndices,
+        pme_gpu_stage_atom_data((__local float*)sm_gridlineIndices,
                                 (__global const float*)gm_gridlineIndices, DIM);
 
         barrier(CLK_LOCAL_MEM_FENCE);
index 287bfaec55bde14c2f3aa00c2851a8ed4b10e2f9..9bf3462b1e18283ac5f3cebce18ea55a84fb48b0 100644 (file)
  * \tparam[in] useOrderThreads      Whether we should use order threads per atom (order*order used if false).
  *
  * \param[in]  kernelParams         Input PME CUDA data in constant memory.
- * \param[in]  atomIndexOffset      Starting atom index for the execution block w.r.t. global memory.
  * \param[in]  atomCharge           Atom charge/coefficient of atom processed by thread.
  * \param[in]  sm_gridlineIndices   Atom gridline indices in the shared memory.
  * \param[in]  sm_theta             Atom spline values in the shared memory.
  */
 template<const int order, const bool wrapX, const bool wrapY, const bool useOrderThreads>
 __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams kernelParams,
-                                               int                          atomIndexOffset,
                                                const float*                 atomCharge,
                                                const int* __restrict__ sm_gridlineIndices,
                                                const float* __restrict__ sm_theta)
@@ -91,12 +89,10 @@ __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams kern
 
     const int offx = 0, offy = 0, offz = 0; // unused for now
 
-    const int atomIndexLocal  = threadIdx.z;
-    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+    const int atomIndexLocal = threadIdx.z;
 
-    const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
     const int chargeCheck = pme_gpu_check_atom_charge(*atomCharge);
-    if (chargeCheck & globalCheck)
+    if (chargeCheck)
     {
         // Spline Z coordinates
         const int ithz = threadIdx.x;
@@ -217,8 +213,7 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU
     if (c_useAtomDataPrefetch)
     {
         __shared__ float sm_coefficients[atomsPerBlock];
-        pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients,
-                                                         kernelParams.atoms.d_coefficients);
+        pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients, kernelParams.atoms.d_coefficients);
         __syncthreads();
         atomCharge = sm_coefficients[atomIndexLocal];
     }
@@ -236,7 +231,7 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU
             __shared__ float3 sm_coordinates[atomsPerBlock];
 
             /* Staging coordinates */
-            pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(kernelParams, sm_coordinates, gm_coordinates);
+            pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(sm_coordinates, gm_coordinates);
             __syncthreads();
             atomX = sm_coordinates[atomIndexLocal];
         }
@@ -255,10 +250,9 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU
          * as in after running the spline kernel)
          */
         /* Spline data - only thetas (dthetas will only be needed in gather) */
-        pme_gpu_stage_atom_data<float, atomsPerBlock, DIM * order>(kernelParams, sm_theta,
-                                                                   kernelParams.atoms.d_theta);
+        pme_gpu_stage_atom_data<float, atomsPerBlock, DIM * order>(sm_theta, kernelParams.atoms.d_theta);
         /* Gridline indices */
-        pme_gpu_stage_atom_data<int, atomsPerBlock, DIM>(kernelParams, sm_gridlineIndices,
+        pme_gpu_stage_atom_data<int, atomsPerBlock, DIM>(sm_gridlineIndices,
                                                          kernelParams.atoms.d_gridlineIndices);
 
         __syncthreads();
@@ -267,8 +261,8 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU
     /* Spreading */
     if (spreadCharges)
     {
-        spread_charges<order, wrapX, wrapY, useOrderThreads>(
-                kernelParams, atomIndexOffset, &atomCharge, sm_gridlineIndices, sm_theta);
+        spread_charges<order, wrapX, wrapY, useOrderThreads>(kernelParams, &atomCharge,
+                                                             sm_gridlineIndices, sm_theta);
     }
 }
 
index 787f3e9f424951df85d7fe26d7d72f3ec217112d..80960b647ae94d76d75846eb3549bfac6c74d489 100644 (file)
@@ -168,7 +168,7 @@ std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme
     //       restrict one from using other constructor here.
     return std::make_unique<StatePropagatorDataGpu>(pme_gpu_get_device_stream(&pme), deviceContext,
                                                     GpuApiCallBehavior::Sync,
-                                                    pme_gpu_get_padding_size(&pme), nullptr);
+                                                    pme_gpu_get_block_size(&pme), nullptr);
 }
 
 //! PME initialization with atom data
index 753a43ab3553aea38a798f3502d496da4fcdc006..eddbe5efa74c4ca241683bd3a6ce11061f2528d6 100644 (file)
@@ -1595,7 +1595,6 @@ int Mdrunner::mdrunner()
                     fr->nbv->gpu_nbv != nullptr
                             ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal)
                             : nullptr;
-            const int          paddingSize = pme_gpu_get_padding_size(fr->pmedata);
             GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)
                                                       ? GpuApiCallBehavior::Async
                                                       : GpuApiCallBehavior::Sync;
@@ -1604,7 +1603,7 @@ int Mdrunner::mdrunner()
                     "Device context can not be nullptr when building GPU propagator data object.");
             stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
                     pmeStream, localStream, nonLocalStream, *deviceContext, transferKind,
-                    paddingSize, wcycle);
+                    pme_gpu_get_block_size(fr->pmedata), wcycle);
             fr->stateGpu = stateGpu.get();
         }
 
index 678fa3368151ab335fd984215071ea72f874757b..a4f77cbf164d2fcd36f5d0ae25c934bfb2955f6f 100644 (file)
@@ -105,7 +105,7 @@ public:
      *  \param[in] nonLocalStream  Device NBNXM non-local stream, nullptr allowed.
      *  \param[in] deviceContext   Device context, nullptr allowed.
      *  \param[in] transferKind    H2D/D2H transfer call behavior (synchronous or not).
-     *  \param[in] paddingSize     Padding size for coordinates buffer.
+     *  \param[in] allocationBlockSizeDivisor  Determines padding size for coordinates buffer.
      *  \param[in] wcycle          Wall cycle counter data.
      */
     StatePropagatorDataGpu(const DeviceStream*  pmeStream,
@@ -113,7 +113,7 @@ public:
                            const DeviceStream*  nonLocalStream,
                            const DeviceContext& deviceContext,
                            GpuApiCallBehavior   transferKind,
-                           int                  paddingSize,
+                           int                  allocationBlockSizeDivisor,
                            gmx_wallcycle*       wcycle);
 
     /*! \brief Constructor to use in PME-only rank and in tests.
@@ -129,13 +129,13 @@ public:
      *  \param[in] pmeStream       Device PME stream, nullptr is not allowed.
      *  \param[in] deviceContext   Device context, nullptr allowed for non-OpenCL builds.
      *  \param[in] transferKind    H2D/D2H transfer call behavior (synchronous or not).
-     *  \param[in] paddingSize     Padding size for coordinates buffer.
+     *  \param[in] allocationBlockSizeDivisor Determines padding size for coordinates buffer.
      *  \param[in] wcycle          Wall cycle counter data.
      */
     StatePropagatorDataGpu(const DeviceStream*  pmeStream,
                            const DeviceContext& deviceContext,
                            GpuApiCallBehavior   transferKind,
-                           int                  paddingSize,
+                           int                  allocationBlockSizeDivisor,
                            gmx_wallcycle*       wcycle);
 
     //! Move constructor
index 78b1fd3a4ad13962756121af39078114ad7186ee..68c884f99b6e448adba3205219ac68b20e1f978a 100644 (file)
@@ -59,7 +59,7 @@ StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream
                                                const DeviceStream* /* nonLocalStream  */,
                                                const DeviceContext& /* deviceContext   */,
                                                GpuApiCallBehavior /* transferKind    */,
-                                               int /* paddingSize     */,
+                                               int /* allocationBlockSizeDivisor */,
                                                gmx_wallcycle* /*   wcycle */) :
     impl_(nullptr)
 {
@@ -68,7 +68,7 @@ StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream
 StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream       */,
                                                const DeviceContext& /* deviceContext   */,
                                                GpuApiCallBehavior /* transferKind    */,
-                                               int /* paddingSize     */,
+                                               int /* allocationBlockSizeDivisor */,
                                                gmx_wallcycle* /*   wcycle */) :
     impl_(nullptr)
 {
index 1b2c91d2e2bf5ead86a0fca68d9e1095dc92a47e..b0576925479832645669ca9d2fd0e7e6e754ab60 100644 (file)
@@ -104,7 +104,7 @@ public:
      *  \param[in] nonLocalStream  Device NBNXM non-local stream, nullptr allowed.
      *  \param[in] deviceContext   Device context, nullptr allowed.
      *  \param[in] transferKind    H2D/D2H transfer call behavior (synchronous or not).
-     *  \param[in] paddingSize     Padding size for coordinates buffer.
+     *  \param[in] allocationBlockSizeDivisor  Determines the padding size for coordinates buffer.
      *  \param[in] wcycle          Wall cycle counter data.
      */
     Impl(const DeviceStream*  pmeStream,
@@ -112,7 +112,7 @@ public:
          const DeviceStream*  nonLocalStream,
          const DeviceContext& deviceContext,
          GpuApiCallBehavior   transferKind,
-         int                  paddingSize,
+         int                  allocationBlockSizeDivisor,
          gmx_wallcycle*       wcycle);
 
     /*! \brief Constructor to use in PME-only rank and in tests.
@@ -128,13 +128,13 @@ public:
      *  \param[in] pmeStream       Device PME stream, nullptr is not allowed.
      *  \param[in] deviceContext   Device context, nullptr allowed for non-OpenCL builds.
      *  \param[in] transferKind    H2D/D2H transfer call behavior (synchronous or not).
-     *  \param[in] paddingSize     Padding size for coordinates buffer.
+     *  \param[in] allocationBlockSizeDivisor  Determines the padding size for coordinates buffer.
      *  \param[in] wcycle          Wall cycle counter data.
      */
     Impl(const DeviceStream*  pmeStream,
          const DeviceContext& deviceContext,
          GpuApiCallBehavior   transferKind,
-         int                  paddingSize,
+         int                  allocationBlockSizeDivisor,
          gmx_wallcycle*       wcycle);
 
     ~Impl();
@@ -382,8 +382,8 @@ private:
     const DeviceContext& deviceContext_;
     //! Default GPU calls behavior
     GpuApiCallBehavior transferKind_ = GpuApiCallBehavior::Async;
-    //! Padding size for the coordinates buffer
-    int paddingSize_ = 0;
+    //! Required minimum divisor of the allocation size of the coordinates buffer
+    int allocationBlockSizeDivisor_ = 0;
 
     //! Number of local atoms
     int numAtomsLocal_ = -1;
index d0027852eea08101e98810cbfdca251894d2c1cb..fca3ae474d182f193c3d257577b284c3ca7f32f4 100644 (file)
@@ -70,11 +70,11 @@ StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
                                    const DeviceStream*  nonLocalStream,
                                    const DeviceContext& deviceContext,
                                    GpuApiCallBehavior   transferKind,
-                                   int                  paddingSize,
+                                   int                  allocationBlockSizeDivisor,
                                    gmx_wallcycle*       wcycle) :
     deviceContext_(deviceContext),
     transferKind_(transferKind),
-    paddingSize_(paddingSize),
+    allocationBlockSizeDivisor_(allocationBlockSizeDivisor),
     wcycle_(wcycle)
 {
     static_assert(GMX_GPU != GMX_GPU_NONE,
@@ -135,11 +135,11 @@ StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
 StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
                                    const DeviceContext& deviceContext,
                                    GpuApiCallBehavior   transferKind,
-                                   int                  paddingSize,
+                                   int                  allocationBlockSizeDivisor,
                                    gmx_wallcycle*       wcycle) :
     deviceContext_(deviceContext),
     transferKind_(transferKind),
-    paddingSize_(paddingSize),
+    allocationBlockSizeDivisor_(allocationBlockSizeDivisor),
     wcycle_(wcycle)
 {
     static_assert(GMX_GPU != GMX_GPU_NONE,
@@ -178,9 +178,10 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
     numAtomsAll_   = numAtomsAll;
 
     int numAtomsPadded;
-    if (paddingSize_ > 0)
+    if (allocationBlockSizeDivisor_ > 0)
     {
-        numAtomsPadded = ((numAtomsAll_ + paddingSize_ - 1) / paddingSize_) * paddingSize_;
+        numAtomsPadded = ((numAtomsAll_ + allocationBlockSizeDivisor_ - 1) / allocationBlockSizeDivisor_)
+                         * allocationBlockSizeDivisor_;
     }
     else
     {
@@ -550,18 +551,18 @@ StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream*  pmeStream,
                                                const DeviceStream*  nonLocalStream,
                                                const DeviceContext& deviceContext,
                                                GpuApiCallBehavior   transferKind,
-                                               int                  paddingSize,
+                                               int                  allocationBlockSizeDivisor,
                                                gmx_wallcycle*       wcycle) :
-    impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize, wcycle))
+    impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, allocationBlockSizeDivisor, wcycle))
 {
 }
 
 StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream*  pmeStream,
                                                const DeviceContext& deviceContext,
                                                GpuApiCallBehavior   transferKind,
-                                               int                  paddingSize,
+                                               int                  allocationBlockSizeDivisor,
                                                gmx_wallcycle*       wcycle) :
-    impl_(new Impl(pmeStream, deviceContext, transferKind, paddingSize, wcycle))
+    impl_(new Impl(pmeStream, deviceContext, transferKind, allocationBlockSizeDivisor, wcycle))
 {
 }