/*! \brief Set up flags that have the lifetime of the domain indicating what type of work there is to compute.
*/
-static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec& inputrec,
- const t_forcerec& fr,
- const pull_t* pull_work,
- const gmx_edsam* ed,
- const t_mdatoms& mdatoms,
- bool havePPDomainDecomposition,
+static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec& inputrec,
+ const t_forcerec& fr,
+ const pull_t* pull_work,
+ const gmx_edsam* ed,
+ const t_mdatoms& mdatoms,
const SimulationWorkload& simulationWork,
const StepWorkload& stepWork)
{
|| domainWork.haveFreeEnergyWork || simulationWork.useCpuNonbonded || simulationWork.useCpuPme
|| simulationWork.haveEwaldSurfaceContribution || inputrec.nwall > 0;
domainWork.haveLocalForceContribInCpuBuffer =
- domainWork.haveCpuLocalForceWork || havePPDomainDecomposition;
+ domainWork.haveCpuLocalForceWork || simulationWork.havePpDomainDecomposition;
domainWork.haveNonLocalForceContribInCpuBuffer =
domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork;
flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !flags.computeVirial;
flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps && simulationWork.useGpuPme
&& (rankHasPmeDuty || simulationWork.useGpuPmePpCommunication);
- flags.useGpuXHalo = simulationWork.useGpuHaloExchange;
- flags.useGpuFHalo = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
+ flags.useGpuXHalo = simulationWork.useGpuHaloExchange;
+ flags.useGpuFHalo = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
+ // Note that rankHasPmeDuty is used confusingly due to the way cr->duty is set up (it can be true even for non-PME runs),
+ // but haveGpuPmeOnThisRank still ends up correct because simulationWork.useGpuPme == false in such cases.
+ // TODO: improve this when the reliance on duty is eliminated
flags.haveGpuPmeOnThisRank = simulationWork.useGpuPme && rankHasPmeDuty && flags.computeSlowForces;
flags.combineMtsForcesBeforeHaloExchange =
(flags.computeForces && simulationWork.useMts && flags.computeSlowForces
gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
// (re-)initialize local GPU force reduction
- const bool accumulate =
- runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr);
+ const bool accumulate = runScheduleWork->domainWork.haveCpuLocalForceWork
+ || runScheduleWork->simulationWork.havePpDomainDecomposition;
const int atomStart = 0;
fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(stateGpu->getForces(),
nbv->getNumAtoms(AtomLocality::Local),
if (runScheduleWork->domainWork.haveCpuLocalForceWork && !runScheduleWork->simulationWork.useGpuHaloExchange)
{
// in the DD case we use the same stream for H2D and reduction, hence no explicit dependency needed
- if (!havePPDomainDecomposition(cr))
+ if (!runScheduleWork->simulationWork.havePpDomainDecomposition)
{
const bool useGpuForceBufferOps = true;
fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
}
- if (havePPDomainDecomposition(cr))
+ if (runScheduleWork->simulationWork.havePpDomainDecomposition)
{
// (re-)initialize non-local GPU force reduction
const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
AtomLocality::Local, simulationWork, stepWork)
: nullptr;
+ GMX_ASSERT(simulationWork.useGpuHaloExchange
+ == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
+ "The GPU halo exchange is active, but it has not been constructed.");
+
+ bool gmx_used_in_debug haveCopiedXFromGpu = false;
// Copy coordinates from the GPU if update is on the GPU and there
// are forces to be computed on the CPU, or for the computation of
// virial, or if host-side data will be transferred from this task
// hence copy is not needed.
const bool haveHostPmePpComms =
!thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
-
- GMX_ASSERT(simulationWork.useGpuHaloExchange
- == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
- "The GPU halo exchange is active, but it has not been constructed.");
- const bool haveHostHaloExchangeComms =
- havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange;
-
- bool gmx_used_in_debug haveCopiedXFromGpu = false;
if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
&& (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
- || haveHostPmePpComms || haveHostHaloExchangeComms || simulationWork.computeMuTot))
+ || haveHostPmePpComms || simulationWork.useCpuHaloExchange || simulationWork.computeMuTot))
{
stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
haveCopiedXFromGpu = true;
{
// TODO refactor this to do_md, after partitioning.
stateGpu->reinit(mdatoms->homenr,
- getLocalAtomCount(cr->dd, *mdatoms, havePPDomainDecomposition(cr)));
+ getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition));
if (stepWork.haveGpuPmeOnThisRank)
{
// TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
// Need to run after the GPU-offload bonded interaction lists
// are set up to be able to determine whether there is bonded work.
runScheduleWork->domainWork = setupDomainLifetimeWorkload(
- inputrec, *fr, pull_work, ed, *mdatoms, havePPDomainDecomposition(cr), simulationWork, stepWork);
+ inputrec, *fr, pull_work, ed, *mdatoms, simulationWork, stepWork);
wallcycle_start_nocount(wcycle, WallCycleCounter::NS);
wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchLocal);
// bonded work not split into separate local and non-local, so with DD
// we can only launch the kernel after non-local coordinates have been received.
- if (domainWork.haveGpuBondedWork && !havePPDomainDecomposition(cr))
+ if (domainWork.haveGpuBondedWork && !simulationWork.havePpDomainDecomposition)
{
fr->listedForcesGpu->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
}
/* Communicate coordinates and sum dipole if necessary +
do non-local pair search */
- if (havePPDomainDecomposition(cr))
+ if (simulationWork.havePpDomainDecomposition)
{
if (stepWork.doNeighborSearch)
{
wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
- if (havePPDomainDecomposition(cr))
+ if (simulationWork.havePpDomainDecomposition)
{
Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::NonLocal);
}
{
const bool needCoordsOnHost = (runScheduleWork->domainWork.haveCpuLocalForceWork
|| stepWork.computeVirial || simulationWork.computeMuTot);
- const bool haveAlreadyWaited = haveHostHaloExchangeComms;
+ const bool haveAlreadyWaited = simulationWork.useCpuHaloExchange;
if (needCoordsOnHost && !haveAlreadyWaited)
{
GMX_ASSERT(haveCopiedXFromGpu,
* With multiple time-stepping the use is different for MTS fast (level0 only) and slow steps.
*/
ForceOutputs forceOutMtsLevel0 = setupForceOutputs(
- &fr->forceHelperBuffers[0], force, domainWork, stepWork, havePPDomainDecomposition(cr), wcycle);
+ &fr->forceHelperBuffers[0], force, domainWork, stepWork, simulationWork.havePpDomainDecomposition, wcycle);
// Force output for MTS combined forces, only set at level1 MTS steps
std::optional<ForceOutputs> forceOutMts =
forceView->forceMtsCombinedWithPadding(),
domainWork,
stepWork,
- havePPDomainDecomposition(cr),
+ simulationWork.havePpDomainDecomposition,
wcycle))
: std::nullopt;
stepWork,
nrnb);
- if (havePPDomainDecomposition(cr))
+ if (simulationWork.havePpDomainDecomposition)
{
nbv->dispatchFreeEnergyKernel(
InteractionLocality::NonLocal,
if (stepWork.computeNonbondedForces && !useOrEmulateGpuNb)
{
- if (havePPDomainDecomposition(cr))
+ if (simulationWork.havePpDomainDecomposition)
{
do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle);
}
ed,
stepWork.doNeighborSearch);
- if (havePPDomainDecomposition(cr) && stepWork.computeForces && stepWork.useGpuFHalo
+ if (simulationWork.havePpDomainDecomposition && stepWork.computeForces && stepWork.useGpuFHalo
&& domainWork.haveCpuLocalForceWork)
{
stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(), AtomLocality::Local);
auto& forceWithShiftForces = forceOutNonbonded->forceWithShiftForces();
/* wait for non-local forces (or calculate in emulation mode) */
- if (havePPDomainDecomposition(cr))
+ if (simulationWork.havePpDomainDecomposition)
{
if (simulationWork.useGpuNonbonded)
{
*/
if (stepWork.combineMtsForcesBeforeHaloExchange)
{
- combineMtsForces(getLocalAtomCount(cr->dd, *mdatoms, havePPDomainDecomposition(cr)),
+ combineMtsForces(getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition),
force.unpaddedArrayRef(),
forceView->forceMtsCombined(),
inputrec.mtsLevels[1].stepFactor);
}
- if (havePPDomainDecomposition(cr))
+ if (simulationWork.havePpDomainDecomposition)
{
/* We are done with the CPU compute.
* We will now communicate the non-local forces.
// If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
// TODO refactor this and unify with below default-path call to the same function
- if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces
- && simulationWork.useGpuPmePpCommunication)
+ if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && simulationWork.useGpuPmePpCommunication
+ && stepWork.computeSlowForces)
{
/* In case of node-splitting, the PP nodes receive the long-range
* forces, virial and energy from the PME nodes here.
// CPU force H2D with GPU force tasks on all streams including those in the
// local stream which would otherwise be implicit dependencies for the
// transfer and would not overlap.
- auto locality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All;
+ auto locality = simulationWork.havePpDomainDecomposition ? AtomLocality::Local
+ : AtomLocality::All;
stateGpu->copyForcesToGpu(forceWithShift, locality);
}