/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
const int atomDataSize = PME_SPREADGATHER_THREADS_PER_ATOM; /* Number of data components and threads for a single atom */
const int blockSize = atomsPerBlock * atomDataSize;
+ const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
+
/* These are the atom indices - for the shared and global memory */
const int atomIndexLocal = threadIdx.z;
- const int atomIndexOffset = blockIdx.x * atomsPerBlock;
+ const int atomIndexOffset = blockIndex * atomsPerBlock;
const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+ /* Early return for fully empty blocks at the end
+ * (should only happen on Fermi or billions of input atoms)
+ */
+ if (atomIndexOffset >= kernelParams.atoms.nAtoms)
+ {
+ return;
+ }
+
const int splineParamsSize = atomsPerBlock * DIM * order;
const int gridlineIndicesSize = atomsPerBlock * DIM;
__shared__ int sm_gridlineIndices[gridlineIndicesSize];
/* Staging the atom gridline indices, DIM * atomsPerBlock threads */
const int localGridlineIndicesIndex = threadLocalId;
- const int globalGridlineIndicesIndex = blockIdx.x * gridlineIndicesSize + localGridlineIndicesIndex;
+ const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
const int globalCheckIndices = pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
{
}
/* Staging the spline parameters, DIM * order * atomsPerBlock threads */
const int localSplineParamsIndex = threadLocalId;
- const int globalSplineParamsIndex = blockIdx.x * splineParamsSize + localSplineParamsIndex;
+ const int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
const int globalCheckSplineParams = pme_gpu_check_atom_data_index(globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
{
for (int i = 0; i < numIter; i++)
{
int outputIndexLocal = i * iterThreads + threadLocalId;
- int outputIndexGlobal = blockIdx.x * blockForcesSize + outputIndexLocal;
+ int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
const int globalOutputCheck = pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
if (globalOutputCheck)
{
const int atomsPerBlock = (c_gatherMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM);
GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. gathering block size");
- dim3 nBlocks(pmeGpu->nAtomsPadded / atomsPerBlock);
+ const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
+ auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
dim3 dimBlock(order, order, atomsPerBlock);
const bool wrapX = true;
{
if (forceTreatment == PmeForceOutputHandling::Set)
{
- pme_gather_kernel<4, true, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+ pme_gather_kernel<4, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
}
else
{
- pme_gather_kernel<4, false, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+ pme_gather_kernel<4, false, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
}
}
else
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
/*! \brief A pointer to the device used during the execution. */
gmx_device_info_t *deviceInfo;
+ /*! \brief Kernel scheduling grid width limit in X - derived from deviceinfo compute capability in CUDA.
+ * Declared as very large int to make it useful in computations with type promotion, to avoid overflows.
+ */
+ std::intmax_t maxGridWidthX;
+
/*! \brief A single structure encompassing all the PME data used on GPU.
* Its value is the only argument to all the PME GPU kernels.
* \todo Test whether this should be copied to the constant GPU memory once for each computation
*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013-2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2013-2016,2017,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
const T * __restrict__ gm_source)
{
static_assert(c_usePadding, "With padding disabled, index checking should be fixed to account for spline theta/dtheta per-warp alignment");
+ const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
const int threadLocalIndex = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x;
const int localIndex = threadLocalIndex;
- const int globalIndexBase = blockIdx.x * atomsPerBlock * dataCountPerAtom;
+ const int globalIndexBase = blockIndex * atomsPerBlock * dataCountPerAtom;
const int globalIndex = globalIndexBase + localIndex;
const int globalCheck = pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
// Spline values
__shared__ float sm_theta[atomsPerBlock * DIM * order];
- const int atomIndexOffset = blockIdx.x * atomsPerBlock;
+ const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
+ const int atomIndexOffset = blockIndex * atomsPerBlock;
+
+ /* Early return for fully empty blocks at the end
+ * (should only happen on Fermi or billions of input atoms)
+ */
+ if (atomIndexOffset >= kernelParams.atoms.nAtoms)
+ {
+ return;
+ }
/* Staging coefficients/charges for both spline and spread */
pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, kernelParams.atoms.d_coefficients);
//(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define)
GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
- dim3 nBlocks(pmeGpu->nAtomsPadded / atomsPerBlock);
+ const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
+ auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
dim3 dimBlock(order, order, atomsPerBlock);
// These should later check for PME decomposition
if (spreadCharges)
{
pme_gpu_start_timing(pmeGpu, gtPME_SPLINEANDSPREAD);
- pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+ pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
pme_gpu_stop_timing(pmeGpu, gtPME_SPLINEANDSPREAD);
}
else
{
pme_gpu_start_timing(pmeGpu, gtPME_SPLINE);
- pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+ pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
pme_gpu_stop_timing(pmeGpu, gtPME_SPLINE);
}
else
{
pme_gpu_start_timing(pmeGpu, gtPME_SPREAD);
- pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+ pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
pme_gpu_stop_timing(pmeGpu, gtPME_SPREAD);
}
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
// TODO: Consider turning on by default when we can detect nr of streams.
pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
+ pmeGpu->maxGridWidthX = pmeGpu->deviceInfo->prop.maxGridSize[0];
+
/* Creating a PME CUDA stream */
cudaError_t stat;
int highest_priority, lowest_priority;
return c_skipNeutralAtoms ? (coefficient != 0.0f) : 1;
}
+/*! \brief \internal
+ * Given possibly large \p blockCount, returns a compact 1D or 2D grid for kernel scheduling,
+ * to minimize number of unused blocks.
+ */
+template <typename PmeGpu>
+dim3 __host__ inline pmeGpuCreateGrid(const PmeGpu *pmeGpu, int blockCount)
+{
+ // How many maximum widths in X do we need (hopefully just one)
+ const int minRowCount = (blockCount + pmeGpu->maxGridWidthX - 1) / pmeGpu->maxGridWidthX;
+ // Trying to make things even
+ const int colCount = (blockCount + minRowCount - 1) / minRowCount;
+ GMX_ASSERT((colCount * minRowCount - blockCount) >= 0, "pmeGpuCreateGrid: totally wrong");
+ GMX_ASSERT((colCount * minRowCount - blockCount) < minRowCount, "pmeGpuCreateGrid: excessive blocks");
+ return dim3(colCount, minRowCount);
+}
+
/*! \brief \internal
* The main PME CUDA-specific host data structure, included in the PME GPU structure by the archSpecific pointer.
*/