-Dorder=4
-DthreadsPerAtom=16
-Dc_pmeMaxUnitcellShift=2
- -Dc_usePadding=true
-Dc_skipNeutralAtoms=false
-Dc_virialAndEnergyCount=7
-Dc_spreadWorkGroupSize=${SPREAD_WG_SIZE}
return (pme != nullptr) && (pme_run_mode(pme) != PmeRunMode::CPU);
}
-/*! \brief Returns the size of the padding needed by GPU version of PME in the coordinates array.
+/*! \brief Returns the block size requirement
+ *
+ * The GPU version of PME requires that the coordinates array have a
+ * size divisible by the returned number.
*
* \param[in] pme The PME data structure.
*/
-GPU_FUNC_QUALIFIER int pme_gpu_get_padding_size(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
+GPU_FUNC_QUALIFIER int pme_gpu_get_block_size(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
GPU_FUNC_TERM_WITH_RETURN(0);
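For context, a minimal sketch (hypothetical caller code, not part of this change) of how a consumer of pme_gpu_get_block_size() satisfies the divisibility requirement when sizing its coordinate buffer:

    // Round the atom count up to the next multiple of the required block size.
    // A return value of 0 means PME does not run on a GPU, so no padding is needed.
    const int blockSize = pme_gpu_get_block_size(pme);
    const int numAtomsPadded =
            (blockSize > 0) ? ((numAtoms + blockSize - 1) / blockSize) * blockSize : numAtoms;
    coordinatesBuffer.resize(numAtomsPadded); // hypothetical buffer

This mirrors the rounding that StatePropagatorDataGpu::Impl performs with its allocationBlockSizeDivisor further down in this patch.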
// The following functions are all the PME GPU entry points,
const int localGridlineIndicesIndex = threadLocalId;
const int globalGridlineIndicesIndex =
(int)get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex;
- const int globalCheckIndices =
- pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
- if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
+ if (localGridlineIndicesIndex < gridlineIndicesSize)
{
sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
/* Staging the spline parameters, DIM * order * atomsPerBlock threads */
const int localSplineParamsIndex = threadLocalId;
const int globalSplineParamsIndex = (int)get_group_id(XX) * splineParamsSize + localSplineParamsIndex;
- const int globalCheckSplineParams = pme_gpu_check_atom_data_index(
- globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
- if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
+ if (localSplineParamsIndex < splineParamsSize)
{
sm_splineParams[localSplineParamsIndex].x = gm_theta[globalSplineParamsIndex];
sm_splineParams[localSplineParamsIndex].y = gm_dtheta[globalSplineParamsIndex];
float fy = 0.0F;
float fz = 0.0F;
- const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);
- if (chargeCheck & globalCheck)
+ if (chargeCheck)
{
const int nx = kernelParams.grid.realGridSize[XX];
const int ny = kernelParams.grid.realGridSize[YY];
/* Calculating the final forces with no component branching, atomsPerBlock threads */
const int forceIndexLocal = threadLocalId;
const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
- const int calcIndexCheck = pme_gpu_check_atom_data_index(forceIndexGlobal, kernelParams.atoms.nAtoms);
- if ((forceIndexLocal < atomsPerBlock) & calcIndexCheck)
+ if (forceIndexLocal < atomsPerBlock)
{
const float3 atomForces = vload3(forceIndexLocal, sm_forces);
const float negCoefficient = -gm_coefficients[forceIndexGlobal];
{
const int outputIndexLocal = i * iterThreads + threadLocalId;
const int outputIndexGlobal = (int)get_group_id(XX) * blockForcesSize + outputIndexLocal;
- const int globalOutputCheck =
- pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
- if (globalOutputCheck)
- {
- const float outputForceComponent = sm_forces[outputIndexLocal];
- gm_forces[outputIndexGlobal] = outputForceComponent;
- }
+ const float outputForceComponent = sm_forces[outputIndexLocal];
+ gm_forces[outputIndexGlobal] = outputForceComponent;
}
}
}
/* Read splines */
const int localGridlineIndicesIndex = threadLocalId;
const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
- const int globalCheckIndices = pme_gpu_check_atom_data_index(
- globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
- if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
+ if (localGridlineIndicesIndex < gridlineIndicesSize)
{
sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
threadLocalId
+ i * threadLocalIdMax; /* i will always be zero for order*order threads per atom */
int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
- int globalCheckSplineParams = pme_gpu_check_atom_data_index(
- globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
- if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
+ if (localSplineParamsIndex < splineParamsSize)
{
sm_theta[localSplineParamsIndex] = gm_theta[globalSplineParamsIndex];
sm_dtheta[localSplineParamsIndex] = gm_dtheta[globalSplineParamsIndex];
// Coordinates
__shared__ float3 sm_coordinates[atomsPerBlock];
/* Staging coefficients/charges */
- pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, gm_coefficients);
+ pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients, gm_coefficients);
/* Staging coordinates */
- pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(kernelParams, sm_coordinates, gm_coordinates);
+ pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(sm_coordinates, gm_coordinates);
__syncthreads();
atomX = sm_coordinates[atomIndexLocal];
atomCharge = sm_coefficients[atomIndexLocal];
float fy = 0.0f;
float fz = 0.0f;
- const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);
- if (chargeCheck & globalCheck)
+ if (chargeCheck)
{
const int nx = kernelParams.grid.realGridSize[XX];
const int ny = kernelParams.grid.realGridSize[YY];
/* Calculating the final forces with no component branching, atomsPerBlock threads */
const int forceIndexLocal = threadLocalId;
const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
- const int calcIndexCheck = pme_gpu_check_atom_data_index(forceIndexGlobal, kernelParams.atoms.nAtoms);
- if ((forceIndexLocal < atomsPerBlock) & calcIndexCheck)
+ if (forceIndexLocal < atomsPerBlock)
{
const float3 atomForces = sm_forces[forceIndexLocal];
const float negCoefficient = -gm_coefficients[forceIndexGlobal];
#pragma unroll
for (int i = 0; i < numIter; i++)
{
- int outputIndexLocal = i * iterThreads + threadLocalId;
- int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
- const int globalOutputCheck =
- pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
- if (globalOutputCheck)
- {
- const float outputForceComponent = ((float*)sm_forces)[outputIndexLocal];
- gm_forces[outputIndexGlobal] = outputForceComponent;
- }
+ int outputIndexLocal = i * iterThreads + threadLocalId;
+ int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
+ const float outputForceComponent = ((float*)sm_forces)[outputIndexLocal];
+ gm_forces[outputIndexGlobal] = outputForceComponent;
}
}
}
}
}
-int pme_gpu_get_padding_size(const gmx_pme_t* pme)
+int pme_gpu_get_block_size(const gmx_pme_t* pme)
{
    if (!pme || !pme_gpu_active(pme))
    {
        return 0;
    }
else
{
- return pme_gpu_get_atom_data_alignment(pme->gpu);
+ return pme_gpu_get_atom_data_block_size();
}
}
* Instead of templated parameters this file expects following defines during compilation:
* - order - PME interpolation order;
* - atomsPerWarp - number of atoms processed by a warp (fixed for spread and gather kernels to be the same);
- * - c_usePadding and c_skipNeutralAtoms - same as in pme_gpu_constants.h.
+ * - c_skipNeutralAtoms - same as in pme_gpu_constants.h.
*
* \author Aleksei Iupinov <a.yupinov@gmail.com>
* \ingroup module_ewald
return (paramIndexBase + (splineIndex * DIM + dimIndex) * atomsPerWarp);
}
-/*! \brief
- * A function for checking the global atom data indices against the atom data array sizes.
- *
- * \param[in] nAtomData The atom data array element count.
- * \returns Non-0 if index is within bounds (or PME data padding is enabled), 0 otherwise.
- *
- * This is called from the spline_and_spread and gather PME kernels.
- * The goal is to isolate the global range checks, and allow avoiding them with c_usePadding being true.
- */
-inline int pme_gpu_check_atom_data_index(const size_t atomDataIndex, const size_t nAtomData)
-{
- return c_usePadding ? 1 : (atomDataIndex < nAtomData);
-}
-
/*! \brief
* A function for optionally skipping neutral charges, depending on c_skipNeutralAtoms.
*
return (paramIndexBase + (splineIndex * DIM + dimIndex) * atomsPerWarp);
}
-/*! \internal \brief
- * An inline CUDA function for checking the global atom data indices against the atom data array sizes.
- *
- * \param[in] atomDataIndex The atom data index.
- * \param[in] nAtomData The atom data array element count.
- * \returns Non-0 if index is within bounds (or PME data padding is enabled), 0 otherwise.
- *
- * This is called from the spline_and_spread and gather PME kernels.
- * The goal is to isolate the global range checks, and allow avoiding them with c_usePadding enabled.
- */
-int __device__ __forceinline__ pme_gpu_check_atom_data_index(const int atomDataIndex, const int nAtomData)
-{
- return c_usePadding ? 1 : (atomDataIndex < nAtomData);
-}
-
/*! \internal \brief
* An inline CUDA function for skipping the zero-charge atoms.
*
* General purpose function for loading atom-related data from global to shared memory.
*
* \tparam[in] T Data type (float/int/...)
- * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be accounted for in the size of the shared memory array.
- * \tparam[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for an rvec coordinates array).
- * \param[in] kernelParams Input PME CUDA data in constant memory.
+ * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be
+ * accounted for in the size of the shared memory array.
+ * \tparam[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for
+ * an rvec coordinates array).
* \param[out] sm_destination Shared memory array for output.
* \param[in] gm_source Global memory array for input.
*/
template<typename T, const int atomsPerBlock, const int dataCountPerAtom>
-__device__ __forceinline__ void pme_gpu_stage_atom_data(const PmeGpuCudaKernelParams kernelParams,
- T* __restrict__ sm_destination,
+__device__ __forceinline__ void pme_gpu_stage_atom_data(T* __restrict__ sm_destination,
const T* __restrict__ gm_source)
{
- static_assert(c_usePadding,
- "With padding disabled, index checking should be fixed to account for spline "
- "theta/dtheta pr-warp alignment");
const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
const int threadLocalIndex = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x;
const int localIndex = threadLocalIndex;
const int globalIndexBase = blockIndex * atomsPerBlock * dataCountPerAtom;
const int globalIndex = globalIndexBase + localIndex;
- const int globalCheck =
- pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
- if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
+ if (localIndex < atomsPerBlock * dataCountPerAtom)
{
assertIsFinite(gm_source[globalIndex]);
sm_destination[localIndex] = gm_source[globalIndex];
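With the kernelParams argument gone from pme_gpu_stage_atom_data(), call sites pass only the two pointers; for example, the charge staging in the CUDA spread kernel (updated later in this patch) now reads:

    /* Stage one float per atom: charges from global to shared memory */
    pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients, kernelParams.atoms.d_coefficients);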
/* Atom index w.r.t. block/shared memory */
const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
- /* Atom index w.r.t. global memory */
- const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
/* Spline contribution index in one dimension */
const int threadLocalIdXY = (threadIdx.y * blockDim.x) + threadIdx.x;
const int orderIndex = threadLocalIdXY / DIM;
float splineData[order];
const int localCheck = (dimIndex < DIM) && (orderIndex < 1);
- const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
/* we have 4 threads per atom, but can only use 3 here for the dimensions */
- if (localCheck && globalCheck)
+ if (localCheck)
{
/* Indices interpolation */
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
/* General settings for PME GPU behaviour */
-/*! \brief
- * false: The atom data GPU buffers are sized precisely according to the number of atoms.
- * (Except GPU spline data layout which is regardless intertwined for 2 atoms per warp).
- * The atom index checks in the spread/gather code potentially hinder the performance.
- * true: The atom data GPU buffers are padded with zeroes so that the possible number of atoms
- * fitting in is divisible by c_pmeAtomDataAlignment.
- * The atom index checks are not performed. There should be a performance win, but how big is it, remains to be seen.
- * Additional cudaMemsetAsync calls are done occasionally (only charges/coordinates; spline data is always recalculated now).
- * \todo Estimate performance differences
- */
-constexpr bool c_usePadding = true;
-
/*! \brief
* false: Atoms with zero charges are processed by PME. Could introduce some overhead.
* true: Atoms with zero charges are not processed by PME. Adds branching to the spread/gather.
- * Atom data alignment (in terms of number of atoms).
+ * Atom data block size (in terms of number of atoms).
* This is the least common multiple of number of atoms processed by
* a single block/workgroup of the spread and gather kernels.
- * If the GPU atom data buffers are padded (c_usePadding == true),
- * Then the numbers of atoms which would fit in the padded GPU buffers have to be divisible by this.
- * There are debug asserts for this divisibility in pme_gpu_spread() and pme_gpu_gather().
+ * The GPU atom data buffers must be padded, which means that
+ * the number of atoms used for determining the size of the memory
+ * allocation must be divisible by this.
*/
-constexpr int c_pmeAtomDataAlignment = 64;
+constexpr int c_pmeAtomDataBlockSize = 64;
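As a worked example of the invariant this constant now enforces (numbers are illustrative): with c_pmeAtomDataBlockSize = 64, a system of 1000 atoms is allocated as

    nAtomsAlloc = ((1000 + 64 - 1) / 64) * 64 = 1024   // 24 zero-padded entries

and the per-kernel atomsPerBlock must divide 64, which the GMX_ASSERTs updated elsewhere in this patch continue to check.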
/*
* The execution widths for PME GPU kernels, used both on host and device for correct scheduling.
return kernelParamsPtr;
}
-int pme_gpu_get_atom_data_alignment(const PmeGpu* /*unused*/)
+int pme_gpu_get_atom_data_block_size()
{
- // TODO: this can be simplified, as c_pmeAtomDataAlignment is now constant
- if (c_usePadding)
- {
- return c_pmeAtomDataAlignment;
- }
- else
- {
- return 0;
- }
+ return c_pmeAtomDataBlockSize;
}
int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu)
copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients,
const_cast<float*>(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms,
pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
- if (c_usePadding)
+
+ const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms;
+ const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex;
+ if (paddingCount > 0)
{
- const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms;
- const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex;
- if (paddingCount > 0)
- {
- clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex,
- paddingCount, pmeGpu->archSpecific->pmeStream_);
- }
+ clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex,
+ paddingCount, pmeGpu->archSpecific->pmeStream_);
}
}
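This unconditional clearing is what allows the kernels above to drop their bounds checks: the padded coefficient entries are zero, so the pme_gpu_check_atom_charge() test (or the zero charge itself) guarantees that padded atoms contribute nothing to the grid. Continuing the illustrative numbers from above:

    paddingIndex = nAtoms             = 1000
    paddingCount = nAtomsAlloc - 1000 = 24   // these trailing coefficients are zeroed asynchronously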
void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu)
{
- const int order = pmeGpu->common->pme_order;
- const int alignment = pme_gpu_get_atoms_per_warp(pmeGpu);
- const size_t nAtomsPadded = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
- const int newSplineDataSize = DIM * order * nAtomsPadded;
+ const int order = pmeGpu->common->pme_order;
+ const int newSplineDataSize = DIM * order * pmeGpu->nAtomsAlloc;
GMX_ASSERT(newSplineDataSize > 0, "Bad number of atoms in PME GPU");
/* Two arrays of the same size */
const bool shouldRealloc = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize);
void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu)
{
- const int alignment = pme_gpu_get_atoms_per_warp(pmeGpu);
- const size_t nAtomsPadded = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
- const size_t splinesCount = DIM * nAtomsPadded * pmeGpu->common->pme_order;
+ const size_t splinesCount = DIM * pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order;
auto* kernelParamsPtr = pmeGpu->kernelParams.get();
copyFromDeviceBuffer(pmeGpu->staging.h_dtheta, &kernelParamsPtr->atoms.d_dtheta, 0, splinesCount,
pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu)
{
- const int alignment = pme_gpu_get_atoms_per_warp(pmeGpu);
- const size_t nAtomsPadded = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
- const size_t splinesCount = DIM * nAtomsPadded * pmeGpu->common->pme_order;
+ const size_t splinesCount = DIM * pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order;
auto* kernelParamsPtr = pmeGpu->kernelParams.get();
- if (c_usePadding)
- {
- // TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway
- clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0,
- pmeGpu->nAtomsAlloc * DIM, pmeGpu->archSpecific->pmeStream_);
- clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0,
- pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
- pmeGpu->archSpecific->pmeStream_);
- clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0,
- pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
- pmeGpu->archSpecific->pmeStream_);
- }
+
+ // TODO: could clear only the padding and not the whole thing, but this is test-exclusive code anyway
+ clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0, pmeGpu->nAtomsAlloc * DIM,
+ pmeGpu->archSpecific->pmeStream_);
+ clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0,
+ pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
+ pmeGpu->archSpecific->pmeStream_);
+ clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0,
+ pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
+ pmeGpu->archSpecific->pmeStream_);
+
copyToDeviceBuffer(&kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, 0, splinesCount,
pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
copyToDeviceBuffer(&kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, 0, splinesCount,
{
auto* kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
kernelParamsPtr->atoms.nAtoms = nAtoms;
- const int alignment = pme_gpu_get_atom_data_alignment(pmeGpu);
- pmeGpu->nAtomsPadded = ((nAtoms + alignment - 1) / alignment) * alignment;
- const int nAtomsAlloc = c_usePadding ? pmeGpu->nAtomsPadded : nAtoms;
- const bool haveToRealloc =
- (pmeGpu->nAtomsAlloc < nAtomsAlloc); /* This check might be redundant, but is logical */
- pmeGpu->nAtomsAlloc = nAtomsAlloc;
+ const int blockSize = pme_gpu_get_atom_data_block_size();
+ const int nAtomsNewPadded = ((nAtoms + blockSize - 1) / blockSize) * blockSize;
+ const bool haveToRealloc = (pmeGpu->nAtomsAlloc < nAtomsNewPadded);
+ pmeGpu->nAtomsAlloc = nAtomsNewPadded;
#if GMX_DOUBLE
GMX_RELEASE_ASSERT(false, "Only single precision supported");
// TODO: test varying block sizes on modern arch-s as well
// TODO: also consider using cudaFuncSetCacheConfig() for preferring shared memory on older architectures
//(for spline data mostly)
- GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock),
+ GMX_ASSERT(!(c_pmeAtomDataBlockSize % atomsPerBlock),
"inconsistent atom data padding vs. spreading block size");
// Ensure that coordinates are ready on the device before launching spread;
xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_);
}
- const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
+ const int blockCount = pmeGpu->nAtomsAlloc / atomsPerBlock;
auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
KernelLaunchConfig config;
const int atomsPerBlock = useOrderThreadsPerAtom ? blockSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom
: blockSize / c_pmeSpreadGatherThreadsPerAtom;
- GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock),
+ GMX_ASSERT(!(c_pmeAtomDataBlockSize % atomsPerBlock),
"inconsistent atom data padding vs. gathering block size");
- const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
+ const int blockCount = pmeGpu->nAtomsAlloc / atomsPerBlock;
auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
const int order = pmeGpu->common->pme_order;
};
/*! \libinternal \brief
- * Returns the number of atoms per chunk in the atom charges/coordinates data layout.
- * Depends on CUDA-specific block sizes, needed for the atom data padding.
+ * Returns the block size requirement
*
- * \param[in] pmeGpu The PME GPU structure.
- * \returns Number of atoms in a single GPU atom data chunk.
+ * The GPU version of PME requires that the coordinates array have a
+ * size divisible by the returned number.
+ *
+ * \returns Number of atoms in a single GPU atom data chunk, which
+ * determines the minimum divisor of the size of the memory allocated.
*/
-int pme_gpu_get_atom_data_alignment(const PmeGpu* pmeGpu);
+int pme_gpu_get_atom_data_block_size();
/*! \libinternal \brief
* Returns the number of atoms per chunk in the atom spline theta/dtheta data layout.
// forwarding from pme_grid.h, used for spline computation table sizes only
"-Dc_pmeMaxUnitcellShift=%f "
// forwarding PME behavior constants from pme_gpu_constants.h
- "-Dc_usePadding=%d "
"-Dc_skipNeutralAtoms=%d "
"-Dc_virialAndEnergyCount=%d "
// forwarding kernel work sizes
// decomposition parameter placeholders
"-DwrapX=true -DwrapY=true ",
warpSize, c_pmeGpuOrder, c_pmeSpreadGatherThreadsPerAtom,
- static_cast<float>(c_pmeMaxUnitcellShift), static_cast<int>(c_usePadding),
- static_cast<int>(c_skipNeutralAtoms), c_virialAndEnergyCount, spreadWorkGroupSize,
- solveMaxWorkGroupSize, gatherWorkGroupSize, DIM, XX, YY, ZZ);
+ static_cast<float>(c_pmeMaxUnitcellShift), static_cast<int>(c_skipNeutralAtoms),
+ c_virialAndEnergyCount, spreadWorkGroupSize, solveMaxWorkGroupSize,
+ gatherWorkGroupSize, DIM, XX, YY, ZZ);
try
{
/* TODO when we have a proper MPI-aware logging module,
PmeGpuStaging staging;
-/*! \brief Number of local atoms, padded to be divisible by c_pmeAtomDataAlignment.
+/*! \brief Number of local atoms, padded to be divisible by c_pmeAtomDataBlockSize.
- * Used for kernel scheduling.
- * kernelParams.atoms.nAtoms is the actual atom count to be used for data copying.
- * TODO: this and the next member represent a memory allocation/padding properties -
- * what a container type should do ideally.
- */
- int nAtomsPadded;
- /*! \brief Number of local atoms, padded to be divisible by c_pmeAtomDataAlignment
- * if c_usePadding is true.
+ *
* Used only as a basic size for almost all the atom data allocations
* (spline parameter data is also aligned by PME_SPREADGATHER_PARTICLES_PER_WARP).
- * This should be the same as (c_usePadding ? nAtomsPadded : kernelParams.atoms.nAtoms).
* kernelParams.atoms.nAtoms is the actual atom count to be used for most data copying.
+ *
+ * TODO: memory allocation/padding properties should be handled by
+ * something like a container
*/
int nAtomsAlloc;
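With nAtomsPadded gone, every atom-data allocation is expressed directly in terms of nAtomsAlloc; for instance, the spline data sizing elsewhere in this patch reduces to:

    // One theta/dtheta value per dimension, per interpolation-order index,
    // per (padded) atom.
    const int newSplineDataSize = DIM * order * pmeGpu->nAtomsAlloc;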
"Device context can not be nullptr when building GPU propagator data object.");
// TODO: Special PME-only constructor is used here. There is no mechanism to prevent from using the other constructor here.
// This should be made safer.
- stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
- &deviceStream, *deviceContext, GpuApiCallBehavior::Async,
- pme_gpu_get_padding_size(pme), wcycle);
+ stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(&deviceStream, *deviceContext,
+ GpuApiCallBehavior::Async,
+ pme_gpu_get_block_size(pme), wcycle);
}
clear_nrnb(mynrnb);
/*! \brief
* General purpose function for loading atom-related data from global to shared memory.
*
- * \param[in] kernelParams Input PME GPU data in constant memory.
* \param[out] sm_destination Local memory array for output.
* \param[in] gm_source Global memory array for input.
* \param[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for an rvec coordinates array).
*
*/
-inline void pme_gpu_stage_atom_data(const struct PmeOpenCLKernelParams kernelParams,
- __local float* __restrict__ sm_destination,
+inline void pme_gpu_stage_atom_data(__local float* __restrict__ sm_destination,
__global const float* __restrict__ gm_source,
const int dataCountPerAtom)
{
const int localIndex = threadLocalIndex;
const int globalIndexBase = (int)get_group_id(XX) * atomsPerBlock * dataCountPerAtom;
const int globalIndex = globalIndexBase + localIndex;
- const int globalCheck =
- pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
- if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
+ if (localIndex < atomsPerBlock * dataCountPerAtom)
{
assert(isfinite(float(gm_source[globalIndex])));
sm_destination[localIndex] = gm_source[globalIndex];
/* Atom index w.r.t. block/shared memory */
const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
- /* Atom index w.r.t. global memory */
- const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
/* Spline contribution index in one dimension */
const int orderIndex = threadWarpIndex / (atomsPerWarp * DIM);
/* Dimension index */
# define SPLINE_DATA(i) (*SPLINE_DATA_PTR(i))
const int localCheck = (dimIndex < DIM) && (orderIndex < (PME_GPU_PARALLEL_SPLINE ? order : 1));
- const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
- if (localCheck && globalCheck)
+ if (localCheck)
{
/* Indices interpolation */
if (orderIndex == 0)
const int offy = 0;
const int offz = 0;
- const int atomIndexLocal = get_local_id(ZZ);
- const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+ const int atomIndexLocal = get_local_id(ZZ);
- const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]);
- if (chargeCheck & globalCheck)
+ if (chargeCheck)
{
// Spline Y/Z coordinates
const int ithy = get_local_id(YY);
const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock;
/* Staging coefficients/charges for both spline and spread */
- pme_gpu_stage_atom_data(kernelParams, sm_coefficients, gm_coefficients, 1);
+ pme_gpu_stage_atom_data(sm_coefficients, gm_coefficients, 1);
if (computeSplines)
{
/* Staging coordinates */
- pme_gpu_stage_atom_data(kernelParams, sm_coordinates, gm_coordinates, DIM);
+ pme_gpu_stage_atom_data(sm_coordinates, gm_coordinates, DIM);
barrier(CLK_LOCAL_MEM_FENCE);
calculate_splines(kernelParams, atomIndexOffset, sm_coordinates, sm_coefficients, sm_theta,
* as in after running the spline kernel)
*/
/* Spline data - only thetas (dthetas will only be needed in gather) */
- pme_gpu_stage_atom_data(kernelParams, sm_theta, gm_theta, DIM * order);
+ pme_gpu_stage_atom_data(sm_theta, gm_theta, DIM * order);
/* Gridline indices - they're actually int and not float, but C99 is angry about overloads */
- pme_gpu_stage_atom_data(kernelParams, (__local float*)sm_gridlineIndices,
+ pme_gpu_stage_atom_data((__local float*)sm_gridlineIndices,
(__global const float*)gm_gridlineIndices, DIM);
barrier(CLK_LOCAL_MEM_FENCE);
* \tparam[in] useOrderThreads Whether we should use order threads per atom (order*order used if false).
*
* \param[in] kernelParams Input PME CUDA data in constant memory.
- * \param[in] atomIndexOffset Starting atom index for the execution block w.r.t. global memory.
* \param[in] atomCharge Atom charge/coefficient of atom processed by thread.
* \param[in] sm_gridlineIndices Atom gridline indices in the shared memory.
* \param[in] sm_theta Atom spline values in the shared memory.
*/
template<const int order, const bool wrapX, const bool wrapY, const bool useOrderThreads>
__device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams kernelParams,
- int atomIndexOffset,
const float* atomCharge,
const int* __restrict__ sm_gridlineIndices,
const float* __restrict__ sm_theta)
const int offx = 0, offy = 0, offz = 0; // unused for now
- const int atomIndexLocal = threadIdx.z;
- const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+ const int atomIndexLocal = threadIdx.z;
- const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
const int chargeCheck = pme_gpu_check_atom_charge(*atomCharge);
- if (chargeCheck & globalCheck)
+ if (chargeCheck)
{
// Spline Z coordinates
const int ithz = threadIdx.x;
if (c_useAtomDataPrefetch)
{
__shared__ float sm_coefficients[atomsPerBlock];
- pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients,
- kernelParams.atoms.d_coefficients);
+ pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients, kernelParams.atoms.d_coefficients);
__syncthreads();
atomCharge = sm_coefficients[atomIndexLocal];
}
__shared__ float3 sm_coordinates[atomsPerBlock];
/* Staging coordinates */
- pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(kernelParams, sm_coordinates, gm_coordinates);
+ pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(sm_coordinates, gm_coordinates);
__syncthreads();
atomX = sm_coordinates[atomIndexLocal];
}
* as in after running the spline kernel)
*/
/* Spline data - only thetas (dthetas will only be needed in gather) */
- pme_gpu_stage_atom_data<float, atomsPerBlock, DIM * order>(kernelParams, sm_theta,
- kernelParams.atoms.d_theta);
+ pme_gpu_stage_atom_data<float, atomsPerBlock, DIM * order>(sm_theta, kernelParams.atoms.d_theta);
/* Gridline indices */
- pme_gpu_stage_atom_data<int, atomsPerBlock, DIM>(kernelParams, sm_gridlineIndices,
+ pme_gpu_stage_atom_data<int, atomsPerBlock, DIM>(sm_gridlineIndices,
kernelParams.atoms.d_gridlineIndices);
__syncthreads();
/* Spreading */
if (spreadCharges)
{
- spread_charges<order, wrapX, wrapY, useOrderThreads>(
- kernelParams, atomIndexOffset, &atomCharge, sm_gridlineIndices, sm_theta);
+ spread_charges<order, wrapX, wrapY, useOrderThreads>(kernelParams, &atomCharge,
+ sm_gridlineIndices, sm_theta);
}
}
// restrict one from using other constructor here.
return std::make_unique<StatePropagatorDataGpu>(pme_gpu_get_device_stream(&pme), deviceContext,
GpuApiCallBehavior::Sync,
- pme_gpu_get_padding_size(&pme), nullptr);
+ pme_gpu_get_block_size(&pme), nullptr);
}
//! PME initialization with atom data
fr->nbv->gpu_nbv != nullptr
? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal)
: nullptr;
- const int paddingSize = pme_gpu_get_padding_size(fr->pmedata);
GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)
? GpuApiCallBehavior::Async
: GpuApiCallBehavior::Sync;
"Device context can not be nullptr when building GPU propagator data object.");
stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
pmeStream, localStream, nonLocalStream, *deviceContext, transferKind,
- paddingSize, wcycle);
+ pme_gpu_get_block_size(fr->pmedata), wcycle);
fr->stateGpu = stateGpu.get();
}
* \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed.
* \param[in] deviceContext Device context, nullptr allowed.
* \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
- * \param[in] paddingSize Padding size for coordinates buffer.
+ * \param[in] allocationBlockSizeDivisor Determines the padding size for the coordinates buffer.
* \param[in] wcycle Wall cycle counter data.
*/
StatePropagatorDataGpu(const DeviceStream* pmeStream,
const DeviceStream* nonLocalStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
- int paddingSize,
+ int allocationBlockSizeDivisor,
gmx_wallcycle* wcycle);
/*! \brief Constructor to use in PME-only rank and in tests.
* \param[in] pmeStream Device PME stream, nullptr is not allowed.
* \param[in] deviceContext Device context, nullptr allowed for non-OpenCL builds.
* \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
- * \param[in] paddingSize Padding size for coordinates buffer.
+ * \param[in] allocationBlockSizeDivisor Determines the padding size for the coordinates buffer.
* \param[in] wcycle Wall cycle counter data.
*/
StatePropagatorDataGpu(const DeviceStream* pmeStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
- int paddingSize,
+ int allocationBlockSizeDivisor,
gmx_wallcycle* wcycle);
//! Move constructor
const DeviceStream* /* nonLocalStream */,
const DeviceContext& /* deviceContext */,
GpuApiCallBehavior /* transferKind */,
- int /* paddingSize */,
+ int /* allocationBlockSizeDivisor */,
gmx_wallcycle* /* wcycle */) :
impl_(nullptr)
{
StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream */,
const DeviceContext& /* deviceContext */,
GpuApiCallBehavior /* transferKind */,
- int /* paddingSize */,
+ int /* allocationBlockSizeDivisor */,
gmx_wallcycle* /* wcycle */) :
impl_(nullptr)
{
* \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed.
* \param[in] deviceContext Device context, nullptr allowed.
* \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
- * \param[in] paddingSize Padding size for coordinates buffer.
+ * \param[in] allocationBlockSizeDivisor Determines the padding size for the coordinates buffer.
* \param[in] wcycle Wall cycle counter data.
*/
Impl(const DeviceStream* pmeStream,
const DeviceStream* nonLocalStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
- int paddingSize,
+ int allocationBlockSizeDivisor,
gmx_wallcycle* wcycle);
/*! \brief Constructor to use in PME-only rank and in tests.
* \param[in] pmeStream Device PME stream, nullptr is not allowed.
* \param[in] deviceContext Device context, nullptr allowed for non-OpenCL builds.
* \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
- * \param[in] paddingSize Padding size for coordinates buffer.
+ * \param[in] allocationBlockSizeDivisor Determines the padding size for coordinates buffer.
* \param[in] wcycle Wall cycle counter data.
*/
Impl(const DeviceStream* pmeStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
- int paddingSize,
+ int allocationBlockSizeDivisor,
gmx_wallcycle* wcycle);
~Impl();
const DeviceContext& deviceContext_;
//! Default GPU calls behavior
GpuApiCallBehavior transferKind_ = GpuApiCallBehavior::Async;
- //! Padding size for the coordinates buffer
- int paddingSize_ = 0;
+ //! Required minimum divisor of the allocation size of the coordinates buffer
+ int allocationBlockSizeDivisor_ = 0;
//! Number of local atoms
int numAtomsLocal_ = -1;
const DeviceStream* nonLocalStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
- int paddingSize,
+ int allocationBlockSizeDivisor,
gmx_wallcycle* wcycle) :
deviceContext_(deviceContext),
transferKind_(transferKind),
- paddingSize_(paddingSize),
+ allocationBlockSizeDivisor_(allocationBlockSizeDivisor),
wcycle_(wcycle)
{
static_assert(GMX_GPU != GMX_GPU_NONE,
StatePropagatorDataGpu::Impl::Impl(const DeviceStream* pmeStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
- int paddingSize,
+ int allocationBlockSizeDivisor,
gmx_wallcycle* wcycle) :
deviceContext_(deviceContext),
transferKind_(transferKind),
- paddingSize_(paddingSize),
+ allocationBlockSizeDivisor_(allocationBlockSizeDivisor),
wcycle_(wcycle)
{
static_assert(GMX_GPU != GMX_GPU_NONE,
numAtomsAll_ = numAtomsAll;
int numAtomsPadded;
- if (paddingSize_ > 0)
+ if (allocationBlockSizeDivisor_ > 0)
{
- numAtomsPadded = ((numAtomsAll_ + paddingSize_ - 1) / paddingSize_) * paddingSize_;
+ numAtomsPadded = ((numAtomsAll_ + allocationBlockSizeDivisor_ - 1) / allocationBlockSizeDivisor_)
+ * allocationBlockSizeDivisor_;
}
else
{
const DeviceStream* nonLocalStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
- int paddingSize,
+ int allocationBlockSizeDivisor,
gmx_wallcycle* wcycle) :
- impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize, wcycle))
+ impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, allocationBlockSizeDivisor, wcycle))
{
}
StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* pmeStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
- int paddingSize,
+ int allocationBlockSizeDivisor,
gmx_wallcycle* wcycle) :
- impl_(new Impl(pmeStream, deviceContext, transferKind, paddingSize, wcycle))
+ impl_(new Impl(pmeStream, deviceContext, transferKind, allocationBlockSizeDivisor, wcycle))
{
}
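Taken together, a caller constructing the propagator on a PME-only rank now threads the divisor through as shown earlier in this patch; abbreviated here for reference:

    stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(&deviceStream, *deviceContext,
                                                             GpuApiCallBehavior::Async,
                                                             pme_gpu_get_block_size(pme), wcycle);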