Fix PME for large systems with Fermi GPUs

author Aleksei Iupinov <a.yupinov@gmail.com>

Thu, 15 Feb 2018 12:27:48 +0000 (13:27 +0100)

committer Aleksei Iupinov <a.yupinov@gmail.com>

Fri, 16 Feb 2018 09:53:30 +0000 (10:53 +0100)
author Aleksei Iupinov <a.yupinov@gmail.com>
Thu, 15 Feb 2018 12:27:48 +0000 (13:27 +0100)
committer Aleksei Iupinov <a.yupinov@gmail.com>
Fri, 16 Feb 2018 09:53:30 +0000 (10:53 +0100)
diff --git a/src/gromacs/ewald/pme-gather.cu b/src/gromacs/ewald/pme-gather.cu

index b5acaa50b3262ce33fd829163dd5dd734a8ac9f9..ae0667d0bf54e5ffdee5af7ebd6e1bb2bd36d765 100644 (file)
--- a/src/gromacs/ewald/pme-gather.cu
+++ b/src/gromacs/ewald/pme-gather.cu
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -250,11 +250,21 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
      const int    atomDataSize   = PME_SPREADGATHER_THREADS_PER_ATOM; /* Number of data components and threads for a single atom */
      const int    blockSize      = atomsPerBlock * atomDataSize;
  
+    const int    blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
+
      /* These are the atom indices - for the shared and global memory */
      const int         atomIndexLocal    = threadIdx.z;
-    const int         atomIndexOffset   = blockIdx.x * atomsPerBlock;
+    const int         atomIndexOffset   = blockIndex * atomsPerBlock;
      const int         atomIndexGlobal   = atomIndexOffset + atomIndexLocal;
  
+    /* Early return for fully empty blocks at the end
+     * (should only happen on Fermi or billions of input atoms)
+     */
+    if (atomIndexOffset >= kernelParams.atoms.nAtoms)
+    {
+        return;
+    }
+
      const int         splineParamsSize             = atomsPerBlock * DIM * order;
      const int         gridlineIndicesSize          = atomsPerBlock * DIM;
      __shared__ int    sm_gridlineIndices[gridlineIndicesSize];
@@ -274,7 +284,7 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
  
      /* Staging the atom gridline indices, DIM * atomsPerBlock threads */
      const int localGridlineIndicesIndex  = threadLocalId;
-    const int globalGridlineIndicesIndex = blockIdx.x * gridlineIndicesSize + localGridlineIndicesIndex;
+    const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
      const int globalCheckIndices         = pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
      if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
      {
@@ -283,7 +293,7 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
      }
      /* Staging the spline parameters, DIM * order * atomsPerBlock threads */
      const int localSplineParamsIndex  = threadLocalId;
-    const int globalSplineParamsIndex = blockIdx.x * splineParamsSize + localSplineParamsIndex;
+    const int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
      const int globalCheckSplineParams = pme_gpu_check_atom_data_index(globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
      if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
      {
@@ -392,7 +402,7 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
          for (int i = 0; i < numIter; i++)
          {
              int         outputIndexLocal  = i * iterThreads + threadLocalId;
-            int         outputIndexGlobal = blockIdx.x * blockForcesSize + outputIndexLocal;
+            int         outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
              const int   globalOutputCheck = pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
              if (globalOutputCheck)
              {
@@ -438,7 +448,8 @@ void pme_gpu_gather(PmeGpu                *pmeGpu,
      const int atomsPerBlock  =  (c_gatherMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM);
      GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. gathering block size");
  
-    dim3 nBlocks(pmeGpu->nAtomsPadded / atomsPerBlock);
+    const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
+    auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);
      dim3 dimBlock(order, order, atomsPerBlock);
  
      const bool wrapX = true;
@@ -453,11 +464,11 @@ void pme_gpu_gather(PmeGpu                *pmeGpu,
      {
          if (forceTreatment == PmeForceOutputHandling::Set)
          {
-            pme_gather_kernel<4, true, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+            pme_gather_kernel<4, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
          }
          else
          {
-            pme_gather_kernel<4, false, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+            pme_gather_kernel<4, false, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
          }
      }
      else
diff --git a/src/gromacs/ewald/pme-gpu-types.h b/src/gromacs/ewald/pme-gpu-types.h

index 14b72427028ef497c8300bb89a8127ae3e84bc62..5d689c1cf9d6ca4ca6584be42ba371f1992b08b7 100644 (file)
--- a/src/gromacs/ewald/pme-gpu-types.h
+++ b/src/gromacs/ewald/pme-gpu-types.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -350,6 +350,11 @@ struct PmeGpu
      /*! \brief A pointer to the device used during the execution. */
      gmx_device_info_t *deviceInfo;
  
+    /*! \brief Kernel scheduling grid width limit in X - derived from deviceinfo compute capability in CUDA.
+     * Declared as very large int to make it useful in computations with type promotion, to avoid overflows.
+     */
+    std::intmax_t maxGridWidthX;
+
      /*! \brief A single structure encompassing all the PME data used on GPU.
       * Its value is the only argument to all the PME GPU kernels.
       * \todo Test whether this should be copied to the constant GPU memory once for each computation
diff --git a/src/gromacs/ewald/pme-spread.cu b/src/gromacs/ewald/pme-spread.cu

index c13179fdf9c54e7b2bdc573f734055c6dd73c18e..20bb4e509063dd4c6faf5247c39722dae24677eb 100644 (file)
--- a/src/gromacs/ewald/pme-spread.cu
+++ b/src/gromacs/ewald/pme-spread.cu
@@ -3,7 +3,7 @@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013-2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2013-2016,2017,2018, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -112,9 +112,10 @@ void pme_gpu_stage_atom_data(const PmeGpuCudaKernelParams       kernelParams,
                               const T * __restrict__             gm_source)
  {
      static_assert(c_usePadding, "With padding disabled, index checking should be fixed to account for spline theta/dtheta per-warp alignment");
+    const int blockIndex       = blockIdx.y * gridDim.x + blockIdx.x;
      const int threadLocalIndex = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x;
      const int localIndex       = threadLocalIndex;
-    const int globalIndexBase  = blockIdx.x * atomsPerBlock * dataCountPerAtom;
+    const int globalIndexBase  = blockIndex * atomsPerBlock * dataCountPerAtom;
      const int globalIndex      = globalIndexBase + localIndex;
      const int globalCheck      = pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
      if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
@@ -454,7 +455,16 @@ __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernel
      // Spline values
      __shared__ float sm_theta[atomsPerBlock * DIM * order];
  
-    const int        atomIndexOffset = blockIdx.x * atomsPerBlock;
+    const int        blockIndex      = blockIdx.y * gridDim.x + blockIdx.x;
+    const int        atomIndexOffset = blockIndex * atomsPerBlock;
+
+    /* Early return for fully empty blocks at the end
+     * (should only happen on Fermi or billions of input atoms)
+     */
+    if (atomIndexOffset >= kernelParams.atoms.nAtoms)
+    {
+        return;
+    }
  
      /* Staging coefficients/charges for both spline and spread */
      pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, kernelParams.atoms.d_coefficients);
@@ -512,7 +522,8 @@ void pme_gpu_spread(const PmeGpu    *pmeGpu,
      //(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define)
      GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
  
-    dim3 nBlocks(pmeGpu->nAtomsPadded / atomsPerBlock);
+    const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
+    auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);
      dim3 dimBlock(order, order, atomsPerBlock);
  
      // These should later check for PME decomposition
@@ -530,14 +541,14 @@ void pme_gpu_spread(const PmeGpu    *pmeGpu,
                  if (spreadCharges)
                  {
                      pme_gpu_start_timing(pmeGpu, gtPME_SPLINEANDSPREAD);
-                    pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+                    pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
                      CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
                      pme_gpu_stop_timing(pmeGpu, gtPME_SPLINEANDSPREAD);
                  }
                  else
                  {
                      pme_gpu_start_timing(pmeGpu, gtPME_SPLINE);
-                    pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+                    pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
                      CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
                      pme_gpu_stop_timing(pmeGpu, gtPME_SPLINE);
                  }
@@ -545,7 +556,7 @@ void pme_gpu_spread(const PmeGpu    *pmeGpu,
              else
              {
                  pme_gpu_start_timing(pmeGpu, gtPME_SPREAD);
-                pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+                pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
                  CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
                  pme_gpu_stop_timing(pmeGpu, gtPME_SPREAD);
              }
diff --git a/src/gromacs/ewald/pme.cu b/src/gromacs/ewald/pme.cu

index f677c7de37440cbc90e8a4c29a68e2ac9ededfb7..735ff5f9b6d619c3616c29b167dd0f6ddb14a804 100644 (file)
--- a/src/gromacs/ewald/pme.cu
+++ b/src/gromacs/ewald/pme.cu
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -452,6 +452,8 @@ void pme_gpu_init_internal(PmeGpu *pmeGpu)
      // TODO: Consider turning on by default when we can detect nr of streams.
      pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
  
+    pmeGpu->maxGridWidthX = pmeGpu->deviceInfo->prop.maxGridSize[0];
+
      /* Creating a PME CUDA stream */
      cudaError_t stat;
      int         highest_priority, lowest_priority;
diff --git a/src/gromacs/ewald/pme.cuh b/src/gromacs/ewald/pme.cuh

index 5f3b04b794c58897e2223060cde274eb2a8bc99e..3c018d7f3cac4837bbd7973837af61e2a2356854 100644 (file)
--- a/src/gromacs/ewald/pme.cuh
+++ b/src/gromacs/ewald/pme.cuh
@@ -137,6 +137,22 @@ int __device__ __forceinline__ pme_gpu_check_atom_charge(const float coefficient
      return c_skipNeutralAtoms ? (coefficient != 0.0f) : 1;
  }
  
+/*! \brief \internal
+ * Given possibly large \p blockCount, returns a compact 1D or 2D grid for kernel scheduling,
+ * to minimize number of unused blocks.
+ */
+template <typename PmeGpu>
+dim3 __host__ inline pmeGpuCreateGrid(const PmeGpu *pmeGpu, int blockCount)
+{
+    // How many maximum widths in X do we need (hopefully just one)
+    const int minRowCount = (blockCount + pmeGpu->maxGridWidthX - 1) / pmeGpu->maxGridWidthX;
+    // Trying to make things even
+    const int colCount = (blockCount + minRowCount - 1) / minRowCount;
+    GMX_ASSERT((colCount * minRowCount - blockCount) >= 0, "pmeGpuCreateGrid: totally wrong");
+    GMX_ASSERT((colCount * minRowCount - blockCount) < minRowCount, "pmeGpuCreateGrid: excessive blocks");
+    return dim3(colCount, minRowCount);
+}
+
  /*! \brief \internal
   * The main PME CUDA-specific host data structure, included in the PME GPU structure by the archSpecific pointer.
   */
author	Aleksei Iupinov <a.yupinov@gmail.com>
	Thu, 15 Feb 2018 12:27:48 +0000 (13:27 +0100)
committer	Aleksei Iupinov <a.yupinov@gmail.com>
	Fri, 16 Feb 2018 09:53:30 +0000 (10:53 +0100)
src/gromacs/ewald/pme-gather.cu		patch \| blob \| history
src/gromacs/ewald/pme-gpu-types.h		patch \| blob \| history
src/gromacs/ewald/pme-spread.cu		patch \| blob \| history
src/gromacs/ewald/pme.cu		patch \| blob \| history
src/gromacs/ewald/pme.cuh		patch \| blob \| history