const T * __restrict__ gm_source)
{
static_assert(c_usePadding, "With padding disabled, index checking should be fixed to account for spline theta/dtheta per-warp alignment");
+ const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
const int threadLocalIndex = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x;
const int localIndex = threadLocalIndex;
- const int globalIndexBase = blockIdx.x * atomsPerBlock * dataCountPerAtom;
+ const int globalIndexBase = blockIndex * atomsPerBlock * dataCountPerAtom;
const int globalIndex = globalIndexBase + localIndex;
const int globalCheck = pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
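/* Note: the single '&' is deliberate -- globalCheck is an int, so both
 * operands are evaluated without a short-circuit branch. For reference,
 * pme_gpu_check_atom_data_index presumably reduces to a trivial bounds
 * check that the compiler can drop entirely when padding guarantees the
 * access is safe -- a sketch, not necessarily the exact implementation:
 *
 *   int __device__ __forceinline__ pme_gpu_check_atom_data_index(const int atomDataIndex, const int nAtomData)
 *   {
 *       // With padding enabled, the atom data arrays are over-allocated
 *       // to a block-aligned size, so any in-grid index is readable.
 *       return c_usePadding ? 1 : (atomDataIndex < nAtomData);
 *   }
 */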
// Spline values
__shared__ float sm_theta[atomsPerBlock * DIM * order];
- const int atomIndexOffset = blockIdx.x * atomsPerBlock;
+ const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
+ const int atomIndexOffset = blockIndex * atomsPerBlock;
+
+ /* Early return for fully empty blocks at the end
+  * (this should only happen on Fermi, or with billions of input atoms)
+  */
+ if (atomIndexOffset >= kernelParams.atoms.nAtoms)
+ {
+ return;
+ }
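/* Illustration with hypothetical numbers: launching 65537 blocks on a
 * device whose gridDim.x is capped at 65535 forces a 2D grid; the smallest
 * even rectangle is 32769 x 2 = 65538 blocks, so the surplus trailing block
 * necessarily gets an atomIndexOffset past nAtoms and exits here without
 * touching the charge grid. */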
/* Staging coefficients/charges for both spline and spread */
pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, kernelParams.atoms.d_coefficients);
// (mostly for the spline data, used together with the varying PME_GPU_PARALLEL_SPLINE define)
GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
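/* Worked example (hypothetical numbers): with PME_ATOM_DATA_ALIGNMENT = 32
 * and atomsPerBlock = 8, nAtomsPadded is a multiple of 32 and hence of 8,
 * so the division below is exact. With atomsPerBlock = 12, 32 % 12 == 8
 * would trip this assertion: nAtomsPadded / atomsPerBlock could then
 * truncate and leave trailing atoms without a block. */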
- dim3 nBlocks(pmeGpu->nAtomsPadded / atomsPerBlock);
+ const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
+ auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
dim3 dimBlock(order, order, atomsPerBlock);
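/* pmeGpuCreateGrid is expected to fold the 1D block count into a 2D grid
 * that respects the device's gridDim.x limit; a sketch under the assumption
 * that pmeGpu->maxGridWidthX caches that limit:
 *
 *   dim3 __host__ inline pmeGpuCreateGrid(const PmeGpu *pmeGpu, const int blockCount)
 *   {
 *       // Number of rows of maximal width needed (usually just one)
 *       const int minRowCount = (blockCount + pmeGpu->maxGridWidthX - 1) / pmeGpu->maxGridWidthX;
 *       // Spread the blocks evenly across the rows; any surplus cells in
 *       // the last row become the empty blocks the kernel returns from early
 *       const int colCount = (blockCount + minRowCount - 1) / minRowCount;
 *       return dim3(colCount, minRowCount);
 *   }
 *
 * This matches the kernel-side flattening blockIdx.y * gridDim.x + blockIdx.x.
 */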
// These launch branches should later also check for PME decomposition
if (spreadCharges)
{
pme_gpu_start_timing(pmeGpu, gtPME_SPLINEANDSPREAD);
- pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+ pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
pme_gpu_stop_timing(pmeGpu, gtPME_SPLINEANDSPREAD);
}
else
{
pme_gpu_start_timing(pmeGpu, gtPME_SPLINE);
- pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+ pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
pme_gpu_stop_timing(pmeGpu, gtPME_SPLINE);
}
}
else // spread-only path (splines not computed)
{
pme_gpu_start_timing(pmeGpu, gtPME_SPREAD);
- pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+ pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
pme_gpu_stop_timing(pmeGpu, gtPME_SPREAD);
}
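/* For reference, the template arguments appear to map to
 * <order, computeSplines, spreadCharges, wrapX, wrapY>: splines and spread
 * together use <4, true, true>, splines alone <4, true, false>, and spread
 * alone <4, false, true>, which is why only three of the four combinations
 * are launched above. */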