* Launches first stage of PME on GPU - spreading kernel.
*
* \param[in] pme The PME data structure.
- * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory; nullptr allowed only on separate PME ranks.
* \param[in] wcycle The wallclock counter.
*/
GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
gmx_wallcycle *wcycle)
{
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
+ GMX_ASSERT(xReadyOnDevice || !pme->bPPnode || (GMX_GPU != GMX_GPU_CUDA), "Need a valid xReadyOnDevice on PP+PME ranks with CUDA.");
PmeGpu *pmeGpu = pme->gpu;
pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nnx, pme->nnx + cellCount * pme->nkx);
pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nny, pme->nny + cellCount * pme->nky);
pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nnz, pme->nnz + cellCount * pme->nkz);
- pmeGpu->common->runMode = pme->runMode;
- pmeGpu->common->boxScaler = pme->boxScaler;
+ pmeGpu->common->runMode = pme->runMode;
+ pmeGpu->common->isRankPmeOnly = !pme->bPPnode;
+ pmeGpu->common->boxScaler = pme->boxScaler;
}
/*! \libinternal \brief
// Ensure that coordinates are ready on the device before launching spread.
// This is only needed with CUDA on PP+PME ranks; it is not needed on separate PME ranks,
// in unit tests, or with OpenCL, as these cases use a single stream (hence xReadyOnDevice == nullptr).
- // Note: Consider adding an assertion on xReadyOnDevice when we can detect
- // here separate PME ranks.
+ GMX_ASSERT(xReadyOnDevice != nullptr ||
+ (GMX_GPU != GMX_GPU_CUDA) || pmeGpu->common->isRankPmeOnly || pme_gpu_is_testing(pmeGpu),
+ "Need a valid coordinate synchronizer on PP+PME ranks with CUDA.");
if (xReadyOnDevice)
{
xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream);
std::vector<real> bsp_mod[DIM];
/*! \brief The PME codepath being taken */
PmeRunMode runMode;
+ /*! \brief Whether PME execution is happening on a PME-only rank (the negation of gmx_pme_t.bPPnode). */
+ bool isRankPmeOnly;
/*! \brief The box scaler based on inputrec - created in pme_init and managed by CPU structure */
class EwaldBoxZScaler *boxScaler;
/*! \brief The previous computation box to know if we even need to update the current box params.