const T * __restrict__ gm_source)
{
static_assert(c_usePadding, "With padding disabled, index checking should be fixed to account for spline theta/dtheta per-warp alignment");
+ const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
const int threadLocalIndex = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x;
const int localIndex = threadLocalIndex;
- const int globalIndexBase = blockIdx.x * atomsPerBlock * dataCountPerAtom;
+ const int globalIndexBase = blockIndex * atomsPerBlock * dataCountPerAtom;
const int globalIndex = globalIndexBase + localIndex;
const int globalCheck = pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
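/* Note: the single '&' is deliberate -- globalCheck is an int, so both
 * operands are evaluated without a short-circuit branch. For reference,
 * pme_gpu_check_atom_data_index presumably reduces to a trivial bounds
 * check that the compiler can drop entirely when padding guarantees the
 * access is safe -- a sketch, not necessarily the exact implementation:
 *
 *   int __device__ __forceinline__ pme_gpu_check_atom_data_index(const int atomDataIndex, const int nAtomData)
 *   {
 *       // With padding enabled, the atom data arrays are over-allocated
 *       // to a block-aligned size, so any in-grid index is readable.
 *       return c_usePadding ? 1 : (atomDataIndex < nAtomData);
 *   }
 */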
// Spline values
__shared__ float sm_theta[atomsPerBlock * DIM * order];
- const int atomIndexOffset = blockIdx.x * atomsPerBlock;
+ const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
+ const int atomIndexOffset = blockIndex * atomsPerBlock;
+
+ /* Early return for fully empty blocks at the end
+  * (this should only happen on Fermi, or with billions of input atoms)
+  */
+ if (atomIndexOffset >= kernelParams.atoms.nAtoms)
+ {
+ return;
+ }
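/* Illustration with hypothetical numbers: launching 65537 blocks on a
 * device whose gridDim.x is capped at 65535 forces a 2D grid; the smallest
 * even rectangle is 32769 x 2 = 65538 blocks, so the surplus trailing block
 * necessarily gets an atomIndexOffset past nAtoms and exits here without
 * touching the charge grid. */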
/* Staging coefficients/charges for both spline and spread */
pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, kernelParams.atoms.d_coefficients);
// (mostly for the spline data, used together with the varying PME_GPU_PARALLEL_SPLINE define)
GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
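/* Worked example (hypothetical numbers): with PME_ATOM_DATA_ALIGNMENT = 32
 * and atomsPerBlock = 8, nAtomsPadded is a multiple of 32 and hence of 8,
 * so the division below is exact. With atomsPerBlock = 12, 32 % 12 == 8
 * would trip this assertion: nAtomsPadded / atomsPerBlock could then
 * truncate and leave trailing atoms without a block. */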
- dim3 nBlocks(pmeGpu->nAtomsPadded / atomsPerBlock);
+ const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
+ auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
dim3 dimBlock(order, order, atomsPerBlock);
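/* pmeGpuCreateGrid is expected to fold the 1D block count into a 2D grid
 * that respects the device's gridDim.x limit; a sketch under the assumption
 * that pmeGpu->maxGridWidthX caches that limit:
 *
 *   dim3 __host__ inline pmeGpuCreateGrid(const PmeGpu *pmeGpu, const int blockCount)
 *   {
 *       // Number of rows of maximal width needed (usually just one)
 *       const int minRowCount = (blockCount + pmeGpu->maxGridWidthX - 1) / pmeGpu->maxGridWidthX;
 *       // Spread the blocks evenly across the rows; any surplus cells in
 *       // the last row become the empty blocks the kernel returns from early
 *       const int colCount = (blockCount + minRowCount - 1) / minRowCount;
 *       return dim3(colCount, minRowCount);
 *   }
 *
 * This matches the kernel-side flattening blockIdx.y * gridDim.x + blockIdx.x.
 */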
// These launch branches should later also check for PME decomposition
if (spreadCharges)
{
pme_gpu_start_timing(pmeGpu, gtPME_SPLINEANDSPREAD);
- pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+ pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
pme_gpu_stop_timing(pmeGpu, gtPME_SPLINEANDSPREAD);
}
else
{
pme_gpu_start_timing(pmeGpu, gtPME_SPLINE);
- pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+ pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
pme_gpu_stop_timing(pmeGpu, gtPME_SPLINE);
}
}
else // spread-only path (splines not computed)
{
pme_gpu_start_timing(pmeGpu, gtPME_SPREAD);
- pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+ pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
pme_gpu_stop_timing(pmeGpu, gtPME_SPREAD);
}
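/* For reference, the template arguments appear to map to
 * <order, computeSplines, spreadCharges, wrapX, wrapY>: splines and spread
 * together use <4, true, true>, splines alone <4, true, false>, and spread
 * alone <4, false, true>, which is why only three of the four combinations
 * are launched above. */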