From fa5c870e5b46a9e965559316e867278fa7fb5090 Mon Sep 17 00:00:00 2001 From: Mark Abraham Date: Fri, 6 Mar 2020 20:43:01 +0100 Subject: [PATCH] Require padded atom data for PME GPU The unpadded case doesn't work and is hard-coded not to run, so we should remove it. The constant c_usePadding has been true ever since 379b2954fd3efbd1b2c724964932c2ef03078939 introduced it in 2016. This piece of unnecessary complexity probably comes from the earlier introduction of the optimization of padding the atom data arrays so that device-side loads don't have to check boundaries. That optimization is unlikely to ever change again, and the unpadded code path is unused, untested, and poorly understood. (It is also currently broken: if c_usePadding == false were attempted, the alignment getter would return zero, which would then be used as a divisor.) In some places the words alignment and padding were used interchangeably, which was confusing and is now fixed. The code now refers to a block size that sets the minimum divisor for the memory allocation. Padding size is a misnomer, because the size of the padding can be interpreted as the number of extra array elements. The resulting code is a bit simpler and easier to understand. Change-Id: I51a28c1c722e3ee5a9f3e1787892d81cb9df00fb --- src/gromacs/ewald/CMakeLists.txt | 1 - src/gromacs/ewald/pme.h | 7 +- src/gromacs/ewald/pme_gather.clh | 23 ++---- src/gromacs/ewald/pme_gather.cu | 31 +++---- src/gromacs/ewald/pme_gpu.cpp | 4 +- .../ewald/pme_gpu_calculate_splines.clh | 16 +--- .../ewald/pme_gpu_calculate_splines.cuh | 37 ++------- src/gromacs/ewald/pme_gpu_constants.h | 22 ++--- src/gromacs/ewald/pme_gpu_internal.cpp | 81 +++++++------------ src/gromacs/ewald/pme_gpu_internal.h | 12 +-- .../ewald/pme_gpu_program_impl_ocl.cpp | 7 +- src/gromacs/ewald/pme_gpu_types_host.h | 13 +-- src/gromacs/ewald/pme_only.cpp | 6 +- src/gromacs/ewald/pme_spread.clh | 27 +++---- src/gromacs/ewald/pme_spread.cu | 22 ++--- src/gromacs/ewald/tests/pmetestcommon.cpp | 2 +- src/gromacs/mdrun/runner.cpp | 3 +- .../mdtypes/state_propagator_data_gpu.h | 8 +- .../state_propagator_data_gpu_impl.cpp | 4 +- .../mdtypes/state_propagator_data_gpu_impl.h | 12 +-- .../state_propagator_data_gpu_impl_gpu.cpp | 21 ++--- 21 files changed, 126 insertions(+), 233 deletions(-) diff --git a/src/gromacs/ewald/CMakeLists.txt b/src/gromacs/ewald/CMakeLists.txt index 7583e1db50..a1d3bcf2f7 100644 --- a/src/gromacs/ewald/CMakeLists.txt +++ b/src/gromacs/ewald/CMakeLists.txt @@ -124,7 +124,6 @@ foreach(VENDOR AMD NVIDIA INTEL) -Dorder=4 -DthreadsPerAtom=16 -Dc_pmeMaxUnitcellShift=2 - -Dc_usePadding=true -Dc_skipNeutralAtoms=false -Dc_virialAndEnergyCount=7 -Dc_spreadWorkGroupSize=${SPREAD_WG_SIZE} diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index 8aa2c079b1..bfc79b88e9 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -285,11 +285,14 @@ inline bool pme_gpu_task_enabled(const gmx_pme_t* pme) return (pme != nullptr) && (pme_run_mode(pme) != PmeRunMode::CPU); } -/*! \brief Returns the size of the padding needed by GPU version of PME in the coordinates array. +/*! \brief Returns the block size requirement + * + * The GPU version of PME requires that the coordinates array have a + * size divisible by the returned number. * * \param[in] pme The PME data structure. 
*/ -GPU_FUNC_QUALIFIER int pme_gpu_get_padding_size(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme)) +GPU_FUNC_QUALIFIER int pme_gpu_get_block_size(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(0); // The following functions are all the PME GPU entry points, diff --git a/src/gromacs/ewald/pme_gather.clh b/src/gromacs/ewald/pme_gather.clh index 10b14dbe09..2d51020763 100644 --- a/src/gromacs/ewald/pme_gather.clh +++ b/src/gromacs/ewald/pme_gather.clh @@ -246,9 +246,7 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe const int localGridlineIndicesIndex = threadLocalId; const int globalGridlineIndicesIndex = (int)get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex; - const int globalCheckIndices = - pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM); - if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices) + if (localGridlineIndicesIndex < gridlineIndicesSize) { sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex]; assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0); @@ -256,9 +254,7 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe /* Staging the spline parameters, DIM * order * atomsPerBlock threads */ const int localSplineParamsIndex = threadLocalId; const int globalSplineParamsIndex = (int)get_group_id(XX) * splineParamsSize + localSplineParamsIndex; - const int globalCheckSplineParams = pme_gpu_check_atom_data_index( - globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order); - if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams) + if (localSplineParamsIndex < splineParamsSize) { sm_splineParams[localSplineParamsIndex].x = gm_theta[globalSplineParamsIndex]; sm_splineParams[localSplineParamsIndex].y = gm_dtheta[globalSplineParamsIndex]; @@ -271,10 +267,9 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe float fy = 0.0F; float fz = 0.0F; - const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms); const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]); - if (chargeCheck & globalCheck) + if (chargeCheck) { const int nx = kernelParams.grid.realGridSize[XX]; const int ny = kernelParams.grid.realGridSize[YY]; @@ -339,8 +334,7 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe /* Calculating the final forces with no component branching, atomsPerBlock threads */ const int forceIndexLocal = threadLocalId; const int forceIndexGlobal = atomIndexOffset + forceIndexLocal; - const int calcIndexCheck = pme_gpu_check_atom_data_index(forceIndexGlobal, kernelParams.atoms.nAtoms); - if ((forceIndexLocal < atomsPerBlock) & calcIndexCheck) + if (forceIndexLocal < atomsPerBlock) { const float3 atomForces = vload3(forceIndexLocal, sm_forces); const float negCoefficient = -gm_coefficients[forceIndexGlobal]; @@ -376,13 +370,8 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe { const int outputIndexLocal = i * iterThreads + threadLocalId; const int outputIndexGlobal = (int)get_group_id(XX) * blockForcesSize + outputIndexLocal; - const int globalOutputCheck = - pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM); - if (globalOutputCheck) - { - const float outputForceComponent = sm_forces[outputIndexLocal]; - gm_forces[outputIndexGlobal] = outputForceComponent; - } + const float 
outputForceComponent = sm_forces[outputIndexLocal]; + gm_forces[outputIndexGlobal] = outputForceComponent; } } } diff --git a/src/gromacs/ewald/pme_gather.cu b/src/gromacs/ewald/pme_gather.cu index e3d97b9844..e6648086a4 100644 --- a/src/gromacs/ewald/pme_gather.cu +++ b/src/gromacs/ewald/pme_gather.cu @@ -287,9 +287,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ /* Read splines */ const int localGridlineIndicesIndex = threadLocalId; const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex; - const int globalCheckIndices = pme_gpu_check_atom_data_index( - globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM); - if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices) + if (localGridlineIndicesIndex < gridlineIndicesSize) { sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex]; assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0); @@ -306,9 +304,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ threadLocalId + i * threadLocalIdMax; /* i will always be zero for order*order threads per atom */ int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex; - int globalCheckSplineParams = pme_gpu_check_atom_data_index( - globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order); - if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams) + if (localSplineParamsIndex < splineParamsSize) { sm_theta[localSplineParamsIndex] = gm_theta[globalSplineParamsIndex]; sm_dtheta[localSplineParamsIndex] = gm_dtheta[globalSplineParamsIndex]; @@ -329,10 +325,10 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ // Coordinates __shared__ float3 sm_coordinates[atomsPerBlock]; /* Staging coefficients/charges */ - pme_gpu_stage_atom_data(kernelParams, sm_coefficients, gm_coefficients); + pme_gpu_stage_atom_data(sm_coefficients, gm_coefficients); /* Staging coordinates */ - pme_gpu_stage_atom_data(kernelParams, sm_coordinates, gm_coordinates); + pme_gpu_stage_atom_data(sm_coordinates, gm_coordinates); __syncthreads(); atomX = sm_coordinates[atomIndexLocal]; atomCharge = sm_coefficients[atomIndexLocal]; @@ -350,10 +346,9 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ float fy = 0.0f; float fz = 0.0f; - const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms); const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]); - if (chargeCheck & globalCheck) + if (chargeCheck) { const int nx = kernelParams.grid.realGridSize[XX]; const int ny = kernelParams.grid.realGridSize[YY]; @@ -424,8 +419,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ /* Calculating the final forces with no component branching, atomsPerBlock threads */ const int forceIndexLocal = threadLocalId; const int forceIndexGlobal = atomIndexOffset + forceIndexLocal; - const int calcIndexCheck = pme_gpu_check_atom_data_index(forceIndexGlobal, kernelParams.atoms.nAtoms); - if ((forceIndexLocal < atomsPerBlock) & calcIndexCheck) + if (forceIndexLocal < atomsPerBlock) { const float3 atomForces = sm_forces[forceIndexLocal]; const float negCoefficient = -gm_coefficients[forceIndexGlobal]; @@ -453,15 +447,10 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ #pragma unroll for (int i = 0; i < numIter; i++) { - int 
outputIndexLocal = i * iterThreads + threadLocalId; - int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal; - const int globalOutputCheck = - pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM); - if (globalOutputCheck) - { - const float outputForceComponent = ((float*)sm_forces)[outputIndexLocal]; - gm_forces[outputIndexGlobal] = outputForceComponent; - } + int outputIndexLocal = i * iterThreads + threadLocalId; + int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal; + const float outputForceComponent = ((float*)sm_forces)[outputIndexLocal]; + gm_forces[outputIndexGlobal] = outputForceComponent; } } } diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp index cbcab23b1a..91596e77d4 100644 --- a/src/gromacs/ewald/pme_gpu.cpp +++ b/src/gromacs/ewald/pme_gpu.cpp @@ -99,7 +99,7 @@ void pme_gpu_get_timings(const gmx_pme_t* pme, gmx_wallclock_gpu_pme_t* timings) } } -int pme_gpu_get_padding_size(const gmx_pme_t* pme) +int pme_gpu_get_block_size(const gmx_pme_t* pme) { if (!pme || !pme_gpu_active(pme)) @@ -108,7 +108,7 @@ int pme_gpu_get_padding_size(const gmx_pme_t* pme) } else { - return pme_gpu_get_atom_data_alignment(pme->gpu); + return pme_gpu_get_atom_data_block_size(); } } diff --git a/src/gromacs/ewald/pme_gpu_calculate_splines.clh b/src/gromacs/ewald/pme_gpu_calculate_splines.clh index 8fb056b12b..6485a62a43 100644 --- a/src/gromacs/ewald/pme_gpu_calculate_splines.clh +++ b/src/gromacs/ewald/pme_gpu_calculate_splines.clh @@ -41,7 +41,7 @@ * Instead of templated parameters this file expects following defines during compilation: * - order - PME interpolation order; * - atomsPerWarp - number of atoms processed by a warp (fixed for spread and gather kernels to be the same); - * - c_usePadding and c_skipNeutralAtoms - same as in pme_gpu_constants.h. + * - c_skipNeutralAtoms - same as in pme_gpu_constants.h. * * \author Aleksei Iupinov * \ingroup module_ewald @@ -88,20 +88,6 @@ inline int getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex return (paramIndexBase + (splineIndex * DIM + dimIndex) * atomsPerWarp); } -/*! \brief - * A function for checking the global atom data indices against the atom data array sizes. - * - * \param[in] nAtomData The atom data array element count. - * \returns Non-0 if index is within bounds (or PME data padding is enabled), 0 otherwise. - * - * This is called from the spline_and_spread and gather PME kernels. - * The goal is to isolate the global range checks, and allow avoiding them with c_usePadding being true. - */ -inline int pme_gpu_check_atom_data_index(const size_t atomDataIndex, const size_t nAtomData) -{ - return c_usePadding ? 1 : (atomDataIndex < nAtomData); -} - /*! \brief * A function for optionally skipping neutral charges, depending on c_skipNeutralAtoms. * diff --git a/src/gromacs/ewald/pme_gpu_calculate_splines.cuh b/src/gromacs/ewald/pme_gpu_calculate_splines.cuh index e305359158..08ab82fb51 100644 --- a/src/gromacs/ewald/pme_gpu_calculate_splines.cuh +++ b/src/gromacs/ewald/pme_gpu_calculate_splines.cuh @@ -95,21 +95,6 @@ int __device__ __forceinline__ getSplineParamIndex(int paramIndexBase, int dimIn return (paramIndexBase + (splineIndex * DIM + dimIndex) * atomsPerWarp); } -/*! \internal \brief - * An inline CUDA function for checking the global atom data indices against the atom data array sizes. - * - * \param[in] atomDataIndex The atom data index. - * \param[in] nAtomData The atom data array element count. 
- * \returns Non-0 if index is within bounds (or PME data padding is enabled), 0 otherwise. - * - * This is called from the spline_and_spread and gather PME kernels. - * The goal is to isolate the global range checks, and allow avoiding them with c_usePadding enabled. - */ -int __device__ __forceinline__ pme_gpu_check_atom_data_index(const int atomDataIndex, const int nAtomData) -{ - return c_usePadding ? 1 : (atomDataIndex < nAtomData); -} - /*! \internal \brief * An inline CUDA function for skipping the zero-charge atoms. * @@ -155,28 +140,23 @@ __device__ inline void assertIsFinite(T arg) * General purpose function for loading atom-related data from global to shared memory. * * \tparam[in] T Data type (float/int/...) - * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be accounted for in the size of the shared memory array. - * \tparam[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for an rvec coordinates array). - * \param[in] kernelParams Input PME CUDA data in constant memory. + * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be + * accounted for in the size of the shared memory array. + * \tparam[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for + * an rvec coordinates array). * \param[out] sm_destination Shared memory array for output. * \param[in] gm_source Global memory array for input. */ template -__device__ __forceinline__ void pme_gpu_stage_atom_data(const PmeGpuCudaKernelParams kernelParams, - T* __restrict__ sm_destination, +__device__ __forceinline__ void pme_gpu_stage_atom_data(T* __restrict__ sm_destination, const T* __restrict__ gm_source) { - static_assert(c_usePadding, - "With padding disabled, index checking should be fixed to account for spline " - "theta/dtheta pr-warp alignment"); const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x; const int threadLocalIndex = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x; const int localIndex = threadLocalIndex; const int globalIndexBase = blockIndex * atomsPerBlock * dataCountPerAtom; const int globalIndex = globalIndexBase + localIndex; - const int globalCheck = - pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom); - if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck) + if (localIndex < atomsPerBlock * dataCountPerAtom) { assertIsFinite(gm_source[globalIndex]); sm_destination[localIndex] = gm_source[globalIndex]; @@ -230,8 +210,6 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k /* Atom index w.r.t. block/shared memory */ const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex; - /* Atom index w.r.t. 
global memory */ - const int atomIndexGlobal = atomIndexOffset + atomIndexLocal; /* Spline contribution index in one dimension */ const int threadLocalIdXY = (threadIdx.y * blockDim.x) + threadIdx.x; const int orderIndex = threadLocalIdXY / DIM; @@ -244,10 +222,9 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k float splineData[order]; const int localCheck = (dimIndex < DIM) && (orderIndex < 1); - const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms); /* we have 4 threads per atom, but can only use 3 here for the dimensions */ - if (localCheck && globalCheck) + if (localCheck) { /* Indices interpolation */ diff --git a/src/gromacs/ewald/pme_gpu_constants.h b/src/gromacs/ewald/pme_gpu_constants.h index 35299ca6b1..200fafc431 100644 --- a/src/gromacs/ewald/pme_gpu_constants.h +++ b/src/gromacs/ewald/pme_gpu_constants.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -59,18 +59,6 @@ /* General settings for PME GPU behaviour */ -/*! \brief - * false: The atom data GPU buffers are sized precisely according to the number of atoms. - * (Except GPU spline data layout which is regardless intertwined for 2 atoms per warp). - * The atom index checks in the spread/gather code potentially hinder the performance. - * true: The atom data GPU buffers are padded with zeroes so that the possible number of atoms - * fitting in is divisible by c_pmeAtomDataAlignment. - * The atom index checks are not performed. There should be a performance win, but how big is it, remains to be seen. - * Additional cudaMemsetAsync calls are done occasionally (only charges/coordinates; spline data is always recalculated now). - * \todo Estimate performance differences - */ -constexpr bool c_usePadding = true; - /*! \brief * false: Atoms with zero charges are processed by PME. Could introduce some overhead. * true: Atoms with zero charges are not processed by PME. Adds branching to the spread/gather. @@ -144,11 +132,11 @@ constexpr int c_pmeSpreadGatherMinWarpSize4ThPerAtom = c_pmeSpreadGatherThreadsP * Atom data alignment (in terms of number of atoms). * This is the least common multiple of number of atoms processed by * a single block/workgroup of the spread and gather kernels. - * If the GPU atom data buffers are padded (c_usePadding == true), - * Then the numbers of atoms which would fit in the padded GPU buffers have to be divisible by this. - * There are debug asserts for this divisibility in pme_gpu_spread() and pme_gpu_gather(). + * The GPU atom data buffers must be padded, which means that + * the numbers of atoms used for determining the size of the memory + * allocation must be divisible by this. */ -constexpr int c_pmeAtomDataAlignment = 64; +constexpr int c_pmeAtomDataBlockSize = 64; /* * The execution widths for PME GPU kernels, used both on host and device for correct scheduling. 
diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index ae308bf570..8c61eac86d 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -108,17 +108,9 @@ static PmeGpuKernelParamsBase* pme_gpu_get_kernel_params_base_ptr(const PmeGpu* return kernelParamsPtr; } -int pme_gpu_get_atom_data_alignment(const PmeGpu* /*unused*/) +int pme_gpu_get_atom_data_block_size() { - // TODO: this can be simplified, as c_pmeAtomDataAlignment is now constant - if (c_usePadding) - { - return c_pmeAtomDataAlignment; - } - else - { - return 0; - } + return c_pmeAtomDataBlockSize; } int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu) @@ -244,15 +236,13 @@ void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_ copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, const_cast(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); - if (c_usePadding) + + const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms; + const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex; + if (paddingCount > 0) { - const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms; - const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex; - if (paddingCount > 0) - { - clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex, - paddingCount, pmeGpu->archSpecific->pmeStream_); - } + clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex, + paddingCount, pmeGpu->archSpecific->pmeStream_); } } @@ -263,10 +253,8 @@ void pme_gpu_free_coefficients(const PmeGpu* pmeGpu) void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu) { - const int order = pmeGpu->common->pme_order; - const int alignment = pme_gpu_get_atoms_per_warp(pmeGpu); - const size_t nAtomsPadded = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment; - const int newSplineDataSize = DIM * order * nAtomsPadded; + const int order = pmeGpu->common->pme_order; + const int newSplineDataSize = DIM * order * pmeGpu->nAtomsAlloc; GMX_ASSERT(newSplineDataSize > 0, "Bad number of atoms in PME GPU"); /* Two arrays of the same size */ const bool shouldRealloc = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize); @@ -436,9 +424,7 @@ void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid) void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu) { - const int alignment = pme_gpu_get_atoms_per_warp(pmeGpu); - const size_t nAtomsPadded = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment; - const size_t splinesCount = DIM * nAtomsPadded * pmeGpu->common->pme_order; + const size_t splinesCount = DIM * pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order; auto* kernelParamsPtr = pmeGpu->kernelParams.get(); copyFromDeviceBuffer(pmeGpu->staging.h_dtheta, &kernelParamsPtr->atoms.d_dtheta, 0, splinesCount, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); @@ -451,22 +437,19 @@ void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu) void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu) { - const int alignment = pme_gpu_get_atoms_per_warp(pmeGpu); - const size_t nAtomsPadded = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment; - const size_t splinesCount = DIM * nAtomsPadded * pmeGpu->common->pme_order; + const size_t splinesCount = DIM * pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order; auto* kernelParamsPtr = pmeGpu->kernelParams.get(); - if 
(c_usePadding) - { - // TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway - clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0, - pmeGpu->nAtomsAlloc * DIM, pmeGpu->archSpecific->pmeStream_); - clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0, - pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM, - pmeGpu->archSpecific->pmeStream_); - clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0, - pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM, - pmeGpu->archSpecific->pmeStream_); - } + + // TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway + clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0, pmeGpu->nAtomsAlloc * DIM, + pmeGpu->archSpecific->pmeStream_); + clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0, + pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM, + pmeGpu->archSpecific->pmeStream_); + clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0, + pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM, + pmeGpu->archSpecific->pmeStream_); + copyToDeviceBuffer(&kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, 0, splinesCount, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); copyToDeviceBuffer(&kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, 0, splinesCount, @@ -954,12 +937,10 @@ void pme_gpu_reinit_atoms(PmeGpu* pmeGpu, const int nAtoms, const real* charges) { auto* kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu); kernelParamsPtr->atoms.nAtoms = nAtoms; - const int alignment = pme_gpu_get_atom_data_alignment(pmeGpu); - pmeGpu->nAtomsPadded = ((nAtoms + alignment - 1) / alignment) * alignment; - const int nAtomsAlloc = c_usePadding ? pmeGpu->nAtomsPadded : nAtoms; - const bool haveToRealloc = - (pmeGpu->nAtomsAlloc < nAtomsAlloc); /* This check might be redundant, but is logical */ - pmeGpu->nAtomsAlloc = nAtomsAlloc; + const int block_size = pme_gpu_get_atom_data_block_size(); + const int nAtomsNewPadded = ((nAtoms + block_size - 1) / block_size) * block_size; + const bool haveToRealloc = (pmeGpu->nAtomsAlloc < nAtomsNewPadded); + pmeGpu->nAtomsAlloc = nAtomsNewPadded; #if GMX_DOUBLE GMX_RELEASE_ASSERT(false, "Only single precision supported"); @@ -1159,7 +1140,7 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, // TODO: test varying block sizes on modern arch-s as well // TODO: also consider using cudaFuncSetCacheConfig() for preferring shared memory on older architectures //(for spline data mostly) - GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock), + GMX_ASSERT(!(c_pmeAtomDataBlockSize % atomsPerBlock), "inconsistent atom data padding vs. spreading block size"); // Ensure that coordinates are ready on the device before launching spread; @@ -1173,7 +1154,7 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_); } - const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock; + const int blockCount = pmeGpu->nAtomsAlloc / atomsPerBlock; auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount); KernelLaunchConfig config; @@ -1407,10 +1388,10 @@ void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid) const int atomsPerBlock = useOrderThreadsPerAtom ? 
blockSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom : blockSize / c_pmeSpreadGatherThreadsPerAtom; - GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock), + GMX_ASSERT(!(c_pmeAtomDataBlockSize % atomsPerBlock), "inconsistent atom data padding vs. gathering block size"); - const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock; + const int blockCount = pmeGpu->nAtomsAlloc / atomsPerBlock; auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount); const int order = pmeGpu->common->pme_order; diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h index b515e3b222..93ffa77416 100644 --- a/src/gromacs/ewald/pme_gpu_internal.h +++ b/src/gromacs/ewald/pme_gpu_internal.h @@ -88,13 +88,15 @@ enum class GridOrdering }; /*! \libinternal \brief - * Returns the number of atoms per chunk in the atom charges/coordinates data layout. - * Depends on CUDA-specific block sizes, needed for the atom data padding. + * Returns the size of the block size requirement * - * \param[in] pmeGpu The PME GPU structure. - * \returns Number of atoms in a single GPU atom data chunk. + * The GPU version of PME requires that the coordinates array have a + * size divisible by the returned number. + * + * \returns Number of atoms in a single GPU atom data chunk, which + * determines a minimum divisior of the size of the memory allocated. */ -int pme_gpu_get_atom_data_alignment(const PmeGpu* pmeGpu); +int pme_gpu_get_atom_data_block_size(); /*! \libinternal \brief * Returns the number of atoms per chunk in the atom spline theta/dtheta data layout. diff --git a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp index 1fa443ee4e..5e82fec6df 100644 --- a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp +++ b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp @@ -127,7 +127,6 @@ void PmeGpuProgramImpl::compileKernels(const DeviceInformation& deviceInfo) // forwarding from pme_grid.h, used for spline computation table sizes only "-Dc_pmeMaxUnitcellShift=%f " // forwarding PME behavior constants from pme_gpu_constants.h - "-Dc_usePadding=%d " "-Dc_skipNeutralAtoms=%d " "-Dc_virialAndEnergyCount=%d " // forwarding kernel work sizes @@ -139,9 +138,9 @@ void PmeGpuProgramImpl::compileKernels(const DeviceInformation& deviceInfo) // decomposition parameter placeholders "-DwrapX=true -DwrapY=true ", warpSize, c_pmeGpuOrder, c_pmeSpreadGatherThreadsPerAtom, - static_cast(c_pmeMaxUnitcellShift), static_cast(c_usePadding), - static_cast(c_skipNeutralAtoms), c_virialAndEnergyCount, spreadWorkGroupSize, - solveMaxWorkGroupSize, gatherWorkGroupSize, DIM, XX, YY, ZZ); + static_cast(c_pmeMaxUnitcellShift), static_cast(c_skipNeutralAtoms), + c_virialAndEnergyCount, spreadWorkGroupSize, solveMaxWorkGroupSize, + gatherWorkGroupSize, DIM, XX, YY, ZZ); try { /* TODO when we have a proper MPI-aware logging module, diff --git a/src/gromacs/ewald/pme_gpu_types_host.h b/src/gromacs/ewald/pme_gpu_types_host.h index acdf24bf6d..481c3a302d 100644 --- a/src/gromacs/ewald/pme_gpu_types_host.h +++ b/src/gromacs/ewald/pme_gpu_types_host.h @@ -152,18 +152,13 @@ struct PmeGpu PmeGpuStaging staging; /*! \brief Number of local atoms, padded to be divisible by c_pmeAtomDataAlignment. - * Used for kernel scheduling. - * kernelParams.atoms.nAtoms is the actual atom count to be used for data copying. - * TODO: this and the next member represent a memory allocation/padding properties - - * what a container type should do ideally. - */ - int nAtomsPadded; - /*! 
\brief Number of local atoms, padded to be divisible by c_pmeAtomDataAlignment - * if c_usePadding is true. + * * Used only as a basic size for almost all the atom data allocations * (spline parameter data is also aligned by PME_SPREADGATHER_PARTICLES_PER_WARP). - * This should be the same as (c_usePadding ? nAtomsPadded : kernelParams.atoms.nAtoms). * kernelParams.atoms.nAtoms is the actual atom count to be used for most data copying. + * + * TODO: memory allocation/padding properties should be handled by + * something like a container */ int nAtomsAlloc; diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp index fe51deb5fc..b85629b28a 100644 --- a/src/gromacs/ewald/pme_only.cpp +++ b/src/gromacs/ewald/pme_only.cpp @@ -645,9 +645,9 @@ int gmx_pmeonly(struct gmx_pme_t* pme, "Device context can not be nullptr when building GPU propagator data object."); // TODO: Special PME-only constructor is used here. There is no mechanism to prevent from using the other constructor here. // This should be made safer. - stateGpu = std::make_unique( - &deviceStream, *deviceContext, GpuApiCallBehavior::Async, - pme_gpu_get_padding_size(pme), wcycle); + stateGpu = std::make_unique(&deviceStream, *deviceContext, + GpuApiCallBehavior::Async, + pme_gpu_get_block_size(pme), wcycle); } clear_nrnb(mynrnb); diff --git a/src/gromacs/ewald/pme_spread.clh b/src/gromacs/ewald/pme_spread.clh index 478bc0acdf..be9bcbbb03 100644 --- a/src/gromacs/ewald/pme_spread.clh +++ b/src/gromacs/ewald/pme_spread.clh @@ -73,14 +73,12 @@ /*! \brief * General purpose function for loading atom-related data from global to shared memory. * - * \param[in] kernelParams Input PME GPU data in constant memory. * \param[out] sm_destination Local memory array for output. * \param[in] gm_source Global memory array for input. * \param[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for an rvec coordinates array). * */ -inline void pme_gpu_stage_atom_data(const struct PmeOpenCLKernelParams kernelParams, - __local float* __restrict__ sm_destination, +inline void pme_gpu_stage_atom_data(__local float* __restrict__ sm_destination, __global const float* __restrict__ gm_source, const int dataCountPerAtom) { @@ -92,9 +90,7 @@ inline void pme_gpu_stage_atom_data(const struct PmeOpenCLKernelParams kernelPar const int localIndex = threadLocalIndex; const int globalIndexBase = (int)get_group_id(XX) * atomsPerBlock * dataCountPerAtom; const int globalIndex = globalIndexBase + localIndex; - const int globalCheck = - pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom); - if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck) + if (localIndex < atomsPerBlock * dataCountPerAtom) { assert(isfinite(float(gm_source[globalIndex]))); sm_destination[localIndex] = gm_source[globalIndex]; @@ -147,8 +143,6 @@ gmx_opencl_inline void calculate_splines(const struct PmeOpenCLKernelParams kern /* Atom index w.r.t. block/shared memory */ const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex; - /* Atom index w.r.t. global memory */ - const int atomIndexGlobal = atomIndexOffset + atomIndexLocal; /* Spline contribution index in one dimension */ const int orderIndex = threadWarpIndex / (atomsPerWarp * DIM); /* Dimension index */ @@ -179,9 +173,8 @@ gmx_opencl_inline void calculate_splines(const struct PmeOpenCLKernelParams kern # define SPLINE_DATA(i) (*SPLINE_DATA_PTR(i)) const int localCheck = (dimIndex < DIM) && (orderIndex < (PME_GPU_PARALLEL_SPLINE ? 
order : 1)); - const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms); - if (localCheck && globalCheck) + if (localCheck) { /* Indices interpolation */ if (orderIndex == 0) @@ -350,12 +343,10 @@ gmx_opencl_inline void spread_charges(const struct PmeOpenCLKernelParams kernelP const int offy = 0; const int offz = 0; - const int atomIndexLocal = get_local_id(ZZ); - const int atomIndexGlobal = atomIndexOffset + atomIndexLocal; + const int atomIndexLocal = get_local_id(ZZ); - const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms); const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]); - if (chargeCheck & globalCheck) + if (chargeCheck) { // Spline Y/Z coordinates const int ithy = get_local_id(YY); @@ -445,12 +436,12 @@ __attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock; /* Staging coefficients/charges for both spline and spread */ - pme_gpu_stage_atom_data(kernelParams, sm_coefficients, gm_coefficients, 1); + pme_gpu_stage_atom_data(sm_coefficients, gm_coefficients, 1); if (computeSplines) { /* Staging coordinates */ - pme_gpu_stage_atom_data(kernelParams, sm_coordinates, gm_coordinates, DIM); + pme_gpu_stage_atom_data(sm_coordinates, gm_coordinates, DIM); barrier(CLK_LOCAL_MEM_FENCE); calculate_splines(kernelParams, atomIndexOffset, sm_coordinates, sm_coefficients, sm_theta, @@ -470,9 +461,9 @@ __attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void * as in after running the spline kernel) */ /* Spline data - only thetas (dthetas will only be needed in gather) */ - pme_gpu_stage_atom_data(kernelParams, sm_theta, gm_theta, DIM * order); + pme_gpu_stage_atom_data(sm_theta, gm_theta, DIM * order); /* Gridline indices - they're actually int and not float, but C99 is angry about overloads */ - pme_gpu_stage_atom_data(kernelParams, (__local float*)sm_gridlineIndices, + pme_gpu_stage_atom_data((__local float*)sm_gridlineIndices, (__global const float*)gm_gridlineIndices, DIM); barrier(CLK_LOCAL_MEM_FENCE); diff --git a/src/gromacs/ewald/pme_spread.cu b/src/gromacs/ewald/pme_spread.cu index 287bfaec55..9bf3462b1e 100644 --- a/src/gromacs/ewald/pme_spread.cu +++ b/src/gromacs/ewald/pme_spread.cu @@ -64,14 +64,12 @@ * \tparam[in] useOrderThreads Whether we should use order threads per atom (order*order used if false). * * \param[in] kernelParams Input PME CUDA data in constant memory. - * \param[in] atomIndexOffset Starting atom index for the execution block w.r.t. global memory. * \param[in] atomCharge Atom charge/coefficient of atom processed by thread. * \param[in] sm_gridlineIndices Atom gridline indices in the shared memory. * \param[in] sm_theta Atom spline values in the shared memory. 
*/ template __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams kernelParams, - int atomIndexOffset, const float* atomCharge, const int* __restrict__ sm_gridlineIndices, const float* __restrict__ sm_theta) @@ -91,12 +89,10 @@ __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams kern const int offx = 0, offy = 0, offz = 0; // unused for now - const int atomIndexLocal = threadIdx.z; - const int atomIndexGlobal = atomIndexOffset + atomIndexLocal; + const int atomIndexLocal = threadIdx.z; - const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms); const int chargeCheck = pme_gpu_check_atom_charge(*atomCharge); - if (chargeCheck & globalCheck) + if (chargeCheck) { // Spline Z coordinates const int ithz = threadIdx.x; @@ -217,8 +213,7 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU if (c_useAtomDataPrefetch) { __shared__ float sm_coefficients[atomsPerBlock]; - pme_gpu_stage_atom_data(kernelParams, sm_coefficients, - kernelParams.atoms.d_coefficients); + pme_gpu_stage_atom_data(sm_coefficients, kernelParams.atoms.d_coefficients); __syncthreads(); atomCharge = sm_coefficients[atomIndexLocal]; } @@ -236,7 +231,7 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU __shared__ float3 sm_coordinates[atomsPerBlock]; /* Staging coordinates */ - pme_gpu_stage_atom_data(kernelParams, sm_coordinates, gm_coordinates); + pme_gpu_stage_atom_data(sm_coordinates, gm_coordinates); __syncthreads(); atomX = sm_coordinates[atomIndexLocal]; } @@ -255,10 +250,9 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU * as in after running the spline kernel) */ /* Spline data - only thetas (dthetas will only be needed in gather) */ - pme_gpu_stage_atom_data(kernelParams, sm_theta, - kernelParams.atoms.d_theta); + pme_gpu_stage_atom_data(sm_theta, kernelParams.atoms.d_theta); /* Gridline indices */ - pme_gpu_stage_atom_data(kernelParams, sm_gridlineIndices, + pme_gpu_stage_atom_data(sm_gridlineIndices, kernelParams.atoms.d_gridlineIndices); __syncthreads(); @@ -267,8 +261,8 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU /* Spreading */ if (spreadCharges) { - spread_charges( - kernelParams, atomIndexOffset, &atomCharge, sm_gridlineIndices, sm_theta); + spread_charges(kernelParams, &atomCharge, + sm_gridlineIndices, sm_theta); } } diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp index 787f3e9f42..80960b647a 100644 --- a/src/gromacs/ewald/tests/pmetestcommon.cpp +++ b/src/gromacs/ewald/tests/pmetestcommon.cpp @@ -168,7 +168,7 @@ std::unique_ptr makeStatePropagatorDataGpu(const gmx_pme // restrict one from using other constructor here. return std::make_unique(pme_gpu_get_device_stream(&pme), deviceContext, GpuApiCallBehavior::Sync, - pme_gpu_get_padding_size(&pme), nullptr); + pme_gpu_get_block_size(&pme), nullptr); } //! PME initialization with atom data diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index 753a43ab35..eddbe5efa7 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -1595,7 +1595,6 @@ int Mdrunner::mdrunner() fr->nbv->gpu_nbv != nullptr ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal) : nullptr; - const int paddingSize = pme_gpu_get_padding_size(fr->pmedata); GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator) ? 
GpuApiCallBehavior::Async : GpuApiCallBehavior::Sync; @@ -1604,7 +1603,7 @@ int Mdrunner::mdrunner() "Device context can not be nullptr when building GPU propagator data object."); stateGpu = std::make_unique( pmeStream, localStream, nonLocalStream, *deviceContext, transferKind, - paddingSize, wcycle); + pme_gpu_get_block_size(fr->pmedata), wcycle); fr->stateGpu = stateGpu.get(); } diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h index 678fa33681..a4f77cbf16 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h @@ -105,7 +105,7 @@ public: * \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed. * \param[in] deviceContext Device context, nullptr allowed. * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not). - * \param[in] paddingSize Padding size for coordinates buffer. + * \param[in] allocationBlockSizeDivisor Deterines padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ StatePropagatorDataGpu(const DeviceStream* pmeStream, @@ -113,7 +113,7 @@ public: const DeviceStream* nonLocalStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, - int paddingSize, + int allocationBlockSizeDivisor, gmx_wallcycle* wcycle); /*! \brief Constructor to use in PME-only rank and in tests. @@ -129,13 +129,13 @@ public: * \param[in] pmeStream Device PME stream, nullptr is not allowed. * \param[in] deviceContext Device context, nullptr allowed for non-OpenCL builds. * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not). - * \param[in] paddingSize Padding size for coordinates buffer. + * \param[in] allocationBlockSizeDivisor Determines padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ StatePropagatorDataGpu(const DeviceStream* pmeStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, - int paddingSize, + int allocationBlockSizeDivisor, gmx_wallcycle* wcycle); //! Move constructor diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp index 78b1fd3a4a..68c884f99b 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp @@ -59,7 +59,7 @@ StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream const DeviceStream* /* nonLocalStream */, const DeviceContext& /* deviceContext */, GpuApiCallBehavior /* transferKind */, - int /* paddingSize */, + int /* allocationBlockSizeDivisor */, gmx_wallcycle* /* wcycle */) : impl_(nullptr) { @@ -68,7 +68,7 @@ StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream */, const DeviceContext& /* deviceContext */, GpuApiCallBehavior /* transferKind */, - int /* paddingSize */, + int /* allocationBlockSizeDivisor */, gmx_wallcycle* /* wcycle */) : impl_(nullptr) { diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h index 1b2c91d2e2..b057692547 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h @@ -104,7 +104,7 @@ public: * \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed. * \param[in] deviceContext Device context, nullptr allowed. 
* \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not). - * \param[in] paddingSize Padding size for coordinates buffer. + * \param[in] allocationBlockSizeDivisor Determines the padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ Impl(const DeviceStream* pmeStream, @@ -112,7 +112,7 @@ public: const DeviceStream* nonLocalStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, - int paddingSize, + int allocationBlockSizeDivisor, gmx_wallcycle* wcycle); /*! \brief Constructor to use in PME-only rank and in tests. @@ -128,13 +128,13 @@ public: * \param[in] pmeStream Device PME stream, nullptr is not allowed. * \param[in] deviceContext Device context, nullptr allowed for non-OpenCL builds. * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not). - * \param[in] paddingSize Padding size for coordinates buffer. + * \param[in] allocationBlockSizeDivisor Determines the padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ Impl(const DeviceStream* pmeStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, - int paddingSize, + int allocationBlockSizeDivisor, gmx_wallcycle* wcycle); ~Impl(); @@ -382,8 +382,8 @@ private: const DeviceContext& deviceContext_; //! Default GPU calls behavior GpuApiCallBehavior transferKind_ = GpuApiCallBehavior::Async; - //! Padding size for the coordinates buffer - int paddingSize_ = 0; + //! Required minimum divisor of the allocation size of the coordinates buffer + int allocationBlockSizeDivisor_ = 0; //! Number of local atoms int numAtomsLocal_ = -1; diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp index d0027852ee..fca3ae474d 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp @@ -70,11 +70,11 @@ StatePropagatorDataGpu::Impl::Impl(const DeviceStream* pmeStream, const DeviceStream* nonLocalStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, - int paddingSize, + int allocationBlockSizeDivisor, gmx_wallcycle* wcycle) : deviceContext_(deviceContext), transferKind_(transferKind), - paddingSize_(paddingSize), + allocationBlockSizeDivisor_(allocationBlockSizeDivisor), wcycle_(wcycle) { static_assert(GMX_GPU != GMX_GPU_NONE, @@ -135,11 +135,11 @@ StatePropagatorDataGpu::Impl::Impl(const DeviceStream* pmeStream, StatePropagatorDataGpu::Impl::Impl(const DeviceStream* pmeStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, - int paddingSize, + int allocationBlockSizeDivisor, gmx_wallcycle* wcycle) : deviceContext_(deviceContext), transferKind_(transferKind), - paddingSize_(paddingSize), + allocationBlockSizeDivisor_(allocationBlockSizeDivisor), wcycle_(wcycle) { static_assert(GMX_GPU != GMX_GPU_NONE, @@ -178,9 +178,10 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll) numAtomsAll_ = numAtomsAll; int numAtomsPadded; - if (paddingSize_ > 0) + if (allocationBlockSizeDivisor_ > 0) { - numAtomsPadded = ((numAtomsAll_ + paddingSize_ - 1) / paddingSize_) * paddingSize_; + numAtomsPadded = ((numAtomsAll_ + allocationBlockSizeDivisor_ - 1) / allocationBlockSizeDivisor_) + * allocationBlockSizeDivisor_; } else { @@ -550,18 +551,18 @@ StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* pmeStream, const DeviceStream* nonLocalStream, const DeviceContext& deviceContext, GpuApiCallBehavior 
transferKind, - int paddingSize, + int allocationBlockSizeDivisor, gmx_wallcycle* wcycle) : - impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize, wcycle)) + impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, allocationBlockSizeDivisor, wcycle)) { } StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* pmeStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, - int paddingSize, + int allocationBlockSizeDivisor, gmx_wallcycle* wcycle) : - impl_(new Impl(pmeStream, deviceContext, transferKind, paddingSize, wcycle)) + impl_(new Impl(pmeStream, deviceContext, transferKind, allocationBlockSizeDivisor, wcycle)) { } -- 2.22.0
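
For readers reviewing the change, the sketch below illustrates the rounding rule used in pme_gpu_reinit_atoms() and (when the divisor is nonzero) in StatePropagatorDataGpu::Impl::reinit(): the allocation size is rounded up to a whole number of blocks and the padded tail of the charge buffer is cleared, which together with the always-on padding is what lets the spread/gather kernels drop the per-thread pme_gpu_check_atom_data_index() bounds checks. This is a minimal standalone example, not GROMACS code; the helper name roundUpToBlockSize and the sample atom count are illustrative only.

    // Standalone sketch of the padding rule this patch commits to.
    // Only the arithmetic mirrors the patch; names and values here are illustrative.
    #include <cassert>
    #include <cstdio>
    #include <vector>

    // Round nAtoms up to the next multiple of blockSize, as pme_gpu_reinit_atoms()
    // does after this change.
    static int roundUpToBlockSize(int nAtoms, int blockSize)
    {
        assert(blockSize > 0); // the unpadded (zero divisor) case is exactly what the patch removes
        return ((nAtoms + blockSize - 1) / blockSize) * blockSize;
    }

    int main()
    {
        const int c_pmeAtomDataBlockSize = 64;   // value from pme_gpu_constants.h
        const int nAtoms                 = 1000; // arbitrary example atom count
        const int nAtomsAlloc            = roundUpToBlockSize(nAtoms, c_pmeAtomDataBlockSize);

        // Allocate a whole number of blocks and zero-fill, so the padded tail holds
        // zero charges and kernels never index past the allocation.
        std::vector<float> charges(nAtomsAlloc, 0.0f);
        std::printf("nAtoms=%d nAtomsAlloc=%d padding=%d\n", nAtoms, nAtomsAlloc, nAtomsAlloc - nAtoms);
        return 0;
    }

With a block size of 64 this prints nAtoms=1000 nAtomsAlloc=1024 padding=24; the padding atoms carry zero charge, so they contribute nothing to the grid or forces even though the kernels no longer bounds-check them.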