return std::pair<int, int>(colCount, minRowCount);
}
-void pme_gpu_spread(const PmeGpu *pmeGpu,
- int gmx_unused gridIndex,
- real *h_grid,
- bool computeSplines,
- bool spreadCharges)
+void pme_gpu_spread(const PmeGpu *pmeGpu,
+ GpuEventSynchronizer *xReadyOnDevice,
+ int gmx_unused gridIndex,
+ real *h_grid,
+ bool computeSplines,
+ bool spreadCharges)
{
GMX_ASSERT(computeSplines || spreadCharges, "PME spline/spread kernel has invalid input (nothing to do)");
const auto *kernelParamsPtr = pmeGpu->kernelParams.get();
//(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define)
GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
+ // Ensure that coordinates are ready on the device before launching spread;
+ // only needed with CUDA on PP+PME ranks, not on separate PME ranks, in unit tests
+ // nor in OpenCL as these cases use a single stream (hence xReadyOnDevice == nullptr).
+ // Note: Consider adding an assertion on xReadyOnDevice when we can detect
+ // here separate PME ranks.
+ if (xReadyOnDevice)
+ {
+ xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream);
+ }
+
const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);