fr->gpuForceReduction[gmx::AtomLocality::Local]->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv));
if (runScheduleWork->simulationWork.useGpuPme
- && (thisRankHasDuty(cr, DUTY_PME) || runScheduleWork->simulationWork.useGpuPmePpCommunication))
+ && (!runScheduleWork->simulationWork.haveSeparatePmeRank
+ || runScheduleWork->simulationWork.useGpuPmePpCommunication))
{
DeviceBuffer<gmx::RVec> forcePtr =
- thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata)
- : // PME force buffer on same GPU
- fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU
+ runScheduleWork->simulationWork.haveSeparatePmeRank
+ ? fr->pmePpCommGpu->getGpuForceStagingPtr() // buffer received from other GPU
+ : pme_gpu_get_device_f(fr->pmedata); // PME force buffer on same GPU
fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr);
GpuEventSynchronizer* const pmeSynchronizer =
- (thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_f_ready_synchronizer(fr->pmedata)
- : // PME force buffer on same GPU
- fr->pmePpCommGpu->getForcesReadySynchronizer()); // buffer received from other GPU
-
+ (runScheduleWork->simulationWork.haveSeparatePmeRank
+ ? fr->pmePpCommGpu->getForcesReadySynchronizer() // buffer received from other GPU
+ : pme_gpu_get_f_ready_synchronizer(fr->pmedata)); // PME force buffer on same GPU
if (GMX_THREAD_MPI)
{
GMX_ASSERT(pmeSynchronizer != nullptr, "PME force ready cuda event should not be NULL");
// to a remote task for halo exchange or PME-PP communication. At
// search steps the current coordinates are already on the host,
// hence copy is not needed.
- const bool haveHostPmePpComms =
- !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
&& (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
- || haveHostPmePpComms || simulationWork.useCpuHaloExchange || simulationWork.computeMuTot))
+ || simulationWork.useCpuPmePpCommunication || simulationWork.useCpuHaloExchange
+ || simulationWork.computeMuTot))
{
stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
haveCopiedXFromGpu = true;
}
}
- if (!thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces)
+ if (simulationWork.haveSeparatePmeRank && stepWork.computeSlowForces)
{
/* Send particle coordinates to the pme nodes */
if (!pmeSendCoordinatesFromGpu && !stepWork.doNeighborSearch && simulationWork.useGpuUpdate)
/* Reset energies */
reset_enerdata(enerd);
- if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
+ if (DOMAINDECOMP(cr) && simulationWork.haveSeparatePmeRank)
{
wallcycle_start(wcycle, WallCycleCounter::PpDuringPme);
dd_force_flop_start(cr->dd, nrnb);
// If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
// TODO refactor this and unify with below default-path call to the same function
- if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && simulationWork.useGpuPmePpCommunication
+ if (PAR(cr) && simulationWork.haveSeparatePmeRank && simulationWork.useGpuPmePpCommunication
&& stepWork.computeSlowForces)
{
/* In case of node-splitting, the PP nodes receive the long-range
// NOTE: If there are virtual sites, the forces are modified on host after this D2H copy. Hence,
// they should not be copied in do_md(...) for the output.
if (!simulationWork.useGpuUpdate
- || (simulationWork.useGpuUpdate && DOMAINDECOMP(cr) && haveHostPmePpComms) || vsite)
+ || (simulationWork.useGpuUpdate && DOMAINDECOMP(cr) && simulationWork.useCpuPmePpCommunication)
+ || vsite)
{
stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
}
// TODO refactor this and unify with above GPU PME-PP / GPU update path call to the same function
- if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication
+ if (PAR(cr) && simulationWork.haveSeparatePmeRank && simulationWork.useCpuPmePpCommunication
&& stepWork.computeSlowForces)
{
/* In case of node-splitting, the PP nodes receive the long-range
const bool disableNonbondedCalculation,
const DevelopmentFeatureFlags& devFlags,
bool havePpDomainDecomposition,
+ bool haveSeparatePmeRank,
bool useGpuForNonbonded,
PmeRunMode pmeRunMode,
bool useGpuForBonded,
simulationWorkload.havePpDomainDecomposition = havePpDomainDecomposition;
simulationWorkload.useCpuHaloExchange = havePpDomainDecomposition && !useGpuDirectHalo;
simulationWorkload.useGpuHaloExchange = useGpuDirectHalo;
+ if (pmeRunMode == PmeRunMode::None) // consistency check: without PME there can be no separate PME rank
+ {
+ GMX_RELEASE_ASSERT(!haveSeparatePmeRank, "Can not have separate PME rank(s) without PME.");
+ }
+ simulationWorkload.haveSeparatePmeRank = haveSeparatePmeRank; // store the input flag in simulationWorkload
simulationWorkload.useGpuPmePpCommunication =
- devFlags.enableGpuPmePPComm && (pmeRunMode == PmeRunMode::GPU);
+ haveSeparatePmeRank && devFlags.enableGpuPmePPComm && (pmeRunMode == PmeRunMode::GPU); // GPU path needs all three: separate PME rank, dev flag, PME run mode GPU
+ simulationWorkload.useCpuPmePpCommunication =
+ haveSeparatePmeRank && !simulationWorkload.useGpuPmePpCommunication; // CPU path: separate PME rank exists and the GPU path was not selected
+ GMX_RELEASE_ASSERT(!(simulationWorkload.useGpuPmePpCommunication
+ && simulationWorkload.useCpuPmePpCommunication),
+ "Cannot do PME-PP communication on both CPU and GPU"); // cannot fire given the two assignments above: the CPU flag includes !GPU flag
simulationWorkload.useGpuDirectCommunication =
devFlags.enableGpuHaloExchange || devFlags.enableGpuPmePPComm;
simulationWorkload.haveEwaldSurfaceContribution = haveEwaldSurfaceContribution(inputrec);