From 817bebaea92727ec0ca9e9308b403ff10725e7fb Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?=
Date: Thu, 12 Aug 2021 18:57:42 +0200
Subject: [PATCH] Add PP decomposition simulationWorkload flags

Add the complete set of flags that indicate whether PP domain
decomposition is in use and whether its halo communication runs on the
CPU or on the GPU. These flags allow simplifications in the force
schedule. An additional benefit is reduced reliance on passing around
the commrec for parallelization-related checks.

Refs #3913
---
 src/gromacs/mdlib/sim_util.cpp                | 78 +++++++++----------
 src/gromacs/mdrun/runner.cpp                  |  1 +
 src/gromacs/mdtypes/simulation_workload.h     |  4 +
 .../decidesimulationworkload.cpp              | 15 ++--
 .../taskassignment/decidesimulationworkload.h | 14 ++--
 5 files changed, 61 insertions(+), 51 deletions(-)

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 8c4c86a52d..bda10e7649 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -900,12 +900,11 @@ static ForceOutputs setupForceOutputs(ForceHelperBuffers* forceH
 
 /*! \brief Set up flags that have the lifetime of the domain indicating what type of work is there to compute.
  */
-static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec& inputrec,
-                                                          const t_forcerec& fr,
-                                                          const pull_t* pull_work,
-                                                          const gmx_edsam* ed,
-                                                          const t_mdatoms& mdatoms,
-                                                          bool havePPDomainDecomposition,
+static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec& inputrec,
+                                                          const t_forcerec& fr,
+                                                          const pull_t* pull_work,
+                                                          const gmx_edsam* ed,
+                                                          const t_mdatoms& mdatoms,
                                                           const SimulationWorkload& simulationWork,
                                                           const StepWorkload& stepWork)
 {
@@ -938,7 +937,7 @@ static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec& inpu
             || domainWork.haveFreeEnergyWork || simulationWork.useCpuNonbonded || simulationWork.useCpuPme
             || simulationWork.haveEwaldSurfaceContribution || inputrec.nwall > 0;
     domainWork.haveLocalForceContribInCpuBuffer =
-            domainWork.haveCpuLocalForceWork || havePPDomainDecomposition;
+            domainWork.haveCpuLocalForceWork || simulationWork.havePpDomainDecomposition;
     domainWork.haveNonLocalForceContribInCpuBuffer =
             domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork;
 
@@ -989,8 +988,11 @@ static StepWorkload setupStepWorkload(const int legacyFlags,
     flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !flags.computeVirial;
     flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps && simulationWork.useGpuPme
                                 && (rankHasPmeDuty || simulationWork.useGpuPmePpCommunication);
-    flags.useGpuXHalo = simulationWork.useGpuHaloExchange;
-    flags.useGpuFHalo = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
+    flags.useGpuXHalo          = simulationWork.useGpuHaloExchange;
+    flags.useGpuFHalo          = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
+    // Note that rankHasPmeDuty is used confusingly here: due to the way cr->duty is set up, it can be true even for non-PME runs,
+    // but haveGpuPmeOnThisRank still ends up correct because simulationWork.useGpuPme == false in such cases.
+    // TODO: improve this when duty-reliance is eliminated
     flags.haveGpuPmeOnThisRank = simulationWork.useGpuPme && rankHasPmeDuty && flags.computeSlowForces;
     flags.combineMtsForcesBeforeHaloExchange =
             (flags.computeForces && simulationWork.useMts && flags.computeSlowForces
@@ -1131,8 +1133,8 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
     gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 
     // (re-)initialize local GPU force reduction
-    const bool accumulate =
-            runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr);
+    const bool accumulate = runScheduleWork->domainWork.haveCpuLocalForceWork
+                            || runScheduleWork->simulationWork.havePpDomainDecomposition;
     const int atomStart = 0;
     fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(stateGpu->getForces(),
                                                             nbv->getNumAtoms(AtomLocality::Local),
@@ -1168,7 +1170,7 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
     if (runScheduleWork->domainWork.haveCpuLocalForceWork && !runScheduleWork->simulationWork.useGpuHaloExchange)
     {
         // in the DD case we use the same stream for H2D and reduction, hence no explicit dependency needed
-        if (!havePPDomainDecomposition(cr))
+        if (!runScheduleWork->simulationWork.havePpDomainDecomposition)
         {
             const bool useGpuForceBufferOps = true;
             fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
@@ -1182,7 +1184,7 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
                 cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
     }
 
-    if (havePPDomainDecomposition(cr))
+    if (runScheduleWork->simulationWork.havePpDomainDecomposition)
     {
         // (re-)initialize non-local GPU force reduction
         const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
@@ -1302,6 +1304,11 @@ void do_force(FILE* fplog,
                                       AtomLocality::Local, simulationWork, stepWork)
                     : nullptr;
 
+    GMX_ASSERT(simulationWork.useGpuHaloExchange
+                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
+               "The GPU halo exchange is active, but it has not been constructed.");
+
+    bool gmx_used_in_debug haveCopiedXFromGpu = false;
     // Copy coordinate from the GPU if update is on the GPU and there
     // are forces to be computed on the CPU, or for the computation of
     // virial, or if host-side data will be transferred from this task
@@ -1310,17 +1317,9 @@ void do_force(FILE* fplog,
     // hence copy is not needed.
     const bool haveHostPmePpComms =
             !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
-
-    GMX_ASSERT(simulationWork.useGpuHaloExchange
-                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
-               "The GPU halo exchange is active, but it has not been constructed.");
-    const bool haveHostHaloExchangeComms =
-            havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange;
-
-    bool gmx_used_in_debug haveCopiedXFromGpu = false;
     if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
         && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
-            || haveHostPmePpComms || haveHostHaloExchangeComms || simulationWork.computeMuTot))
+            || haveHostPmePpComms || simulationWork.useCpuHaloExchange || simulationWork.computeMuTot))
     {
         stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
         haveCopiedXFromGpu = true;
@@ -1336,7 +1335,7 @@ void do_force(FILE* fplog,
     {
         // TODO refactor this to do_md, after partitioning.
         stateGpu->reinit(mdatoms->homenr,
-                         getLocalAtomCount(cr->dd, *mdatoms, havePPDomainDecomposition(cr)));
+                         getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition));
         if (stepWork.haveGpuPmeOnThisRank)
         {
             // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
@@ -1464,7 +1463,7 @@ void do_force(FILE* fplog,
         // Need to run after the GPU-offload bonded interaction lists
         // are set up to be able to determine whether there is bonded work.
         runScheduleWork->domainWork = setupDomainLifetimeWorkload(
-                inputrec, *fr, pull_work, ed, *mdatoms, havePPDomainDecomposition(cr), simulationWork, stepWork);
+                inputrec, *fr, pull_work, ed, *mdatoms, simulationWork, stepWork);
 
         wallcycle_start_nocount(wcycle, WallCycleCounter::NS);
         wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchLocal);
@@ -1523,7 +1522,7 @@ void do_force(FILE* fplog,
 
     // bonded work not split into separate local and non-local, so with DD
     // we can only launch the kernel after non-local coordinates have been received.
-    if (domainWork.haveGpuBondedWork && !havePPDomainDecomposition(cr))
+    if (domainWork.haveGpuBondedWork && !simulationWork.havePpDomainDecomposition)
     {
         fr->listedForcesGpu->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
     }
@@ -1550,7 +1549,7 @@ void do_force(FILE* fplog,
 
     /* Communicate coordinates and sum dipole if necessary +
        do non-local pair search */
-    if (havePPDomainDecomposition(cr))
+    if (simulationWork.havePpDomainDecomposition)
     {
         if (stepWork.doNeighborSearch)
         {
@@ -1646,7 +1645,7 @@ void do_force(FILE* fplog,
         wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
         wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
 
-        if (havePPDomainDecomposition(cr))
+        if (simulationWork.havePpDomainDecomposition)
         {
             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::NonLocal);
         }
@@ -1672,7 +1671,7 @@ void do_force(FILE* fplog,
     {
         const bool needCoordsOnHost = (runScheduleWork->domainWork.haveCpuLocalForceWork
                                        || stepWork.computeVirial || simulationWork.computeMuTot);
-        const bool haveAlreadyWaited = haveHostHaloExchangeComms;
+        const bool haveAlreadyWaited = simulationWork.useCpuHaloExchange;
         if (needCoordsOnHost && !haveAlreadyWaited)
         {
             GMX_ASSERT(haveCopiedXFromGpu,
@@ -1736,7 +1735,7 @@ void do_force(FILE* fplog,
      * With multiple time-stepping the use is different for MTS fast (level0 only) and slow steps.
      */
     ForceOutputs forceOutMtsLevel0 = setupForceOutputs(
-            &fr->forceHelperBuffers[0], force, domainWork, stepWork, havePPDomainDecomposition(cr), wcycle);
+            &fr->forceHelperBuffers[0], force, domainWork, stepWork, simulationWork.havePpDomainDecomposition, wcycle);
 
     // Force output for MTS combined forces, only set at level1 MTS steps
     std::optional<ForceOutputs> forceOutMts =
@@ -1745,7 +1744,7 @@ void do_force(FILE* fplog,
                                               forceView->forceMtsCombinedWithPadding(),
                                               domainWork,
                                               stepWork,
-                                              havePPDomainDecomposition(cr),
+                                              simulationWork.havePpDomainDecomposition,
                                               wcycle))
                     : std::nullopt;
 
@@ -1807,7 +1806,7 @@ void do_force(FILE* fplog,
                     stepWork,
                     nrnb);
 
-            if (havePPDomainDecomposition(cr))
+            if (simulationWork.havePpDomainDecomposition)
             {
                 nbv->dispatchFreeEnergyKernel(
                         InteractionLocality::NonLocal,
@@ -1838,7 +1837,7 @@ void do_force(FILE* fplog,
 
     if (stepWork.computeNonbondedForces && !useOrEmulateGpuNb)
     {
-        if (havePPDomainDecomposition(cr))
+        if (simulationWork.havePpDomainDecomposition)
         {
             do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle);
         }
@@ -2008,7 +2007,7 @@ void do_force(FILE* fplog,
                         ed,
                         stepWork.doNeighborSearch);
 
-    if (havePPDomainDecomposition(cr) && stepWork.computeForces && stepWork.useGpuFHalo
+    if (simulationWork.havePpDomainDecomposition && stepWork.computeForces && stepWork.useGpuFHalo
         && domainWork.haveCpuLocalForceWork)
     {
         stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(), AtomLocality::Local);
@@ -2026,7 +2025,7 @@ void do_force(FILE* fplog,
         auto& forceWithShiftForces = forceOutNonbonded->forceWithShiftForces();
 
         /* wait for non-local forces (or calculate in emulation mode) */
-        if (havePPDomainDecomposition(cr))
+        if (simulationWork.havePpDomainDecomposition)
         {
             if (simulationWork.useGpuNonbonded)
             {
@@ -2082,13 +2081,13 @@ void do_force(FILE* fplog,
          */
         if (stepWork.combineMtsForcesBeforeHaloExchange)
         {
-            combineMtsForces(getLocalAtomCount(cr->dd, *mdatoms, havePPDomainDecomposition(cr)),
+            combineMtsForces(getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition),
                              force.unpaddedArrayRef(),
                              forceView->forceMtsCombined(),
                              inputrec.mtsLevels[1].stepFactor);
         }
 
-        if (havePPDomainDecomposition(cr))
+        if (simulationWork.havePpDomainDecomposition)
         {
             /* We are done with the CPU compute.
              * We will now communicate the non-local forces.
@@ -2215,8 +2214,8 @@ void do_force(FILE* fplog,
 
     // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
    // TODO refactor this and unify with below default-path call to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces
-        && simulationWork.useGpuPmePpCommunication)
+    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && simulationWork.useGpuPmePpCommunication
+        && stepWork.computeSlowForces)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
@@ -2255,7 +2254,8 @@ void do_force(FILE* fplog,
             // CPU force H2D with GPU force tasks on all streams including those in the
             // local stream which would otherwise be implicit dependencies for the
             // transfer and would not overlap.
-            auto locality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All;
+            auto locality = simulationWork.havePpDomainDecomposition ? AtomLocality::Local
+                                                                     : AtomLocality::All;
             stateGpu->copyForcesToGpu(forceWithShift, locality);
         }
 
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index ec71586d46..e3eb7ff8f2 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -1443,6 +1443,7 @@ int Mdrunner::mdrunner()
         runScheduleWork.simulationWork = createSimulationWorkload(*inputrec,
                                                                   disableNonbondedCalculation,
                                                                   devFlags,
+                                                                  havePPDomainDecomposition(cr),
                                                                   useGpuForNonbonded,
                                                                   pmeRunMode,
                                                                   useGpuForBonded,
diff --git a/src/gromacs/mdtypes/simulation_workload.h b/src/gromacs/mdtypes/simulation_workload.h
index 0d7ffe8225..256d8366b8 100644
--- a/src/gromacs/mdtypes/simulation_workload.h
+++ b/src/gromacs/mdtypes/simulation_workload.h
@@ -180,6 +180,10 @@ public:
     bool useGpuUpdate = false;
     //! If buffer operations are performed on GPU.
     bool useGpuBufferOps = false;
+    //! If PP domain decomposition is active.
+    bool havePpDomainDecomposition = false;
+    //! If domain decomposition halo exchange is performed on CPU (in CPU-only runs or with staged GPU communication).
+    bool useCpuHaloExchange = false;
     //! If domain decomposition halo exchange is performed on GPU.
     bool useGpuHaloExchange = false;
     //! If direct PP-PME communication between GPUs is used.
diff --git a/src/gromacs/taskassignment/decidesimulationworkload.cpp b/src/gromacs/taskassignment/decidesimulationworkload.cpp
index bd0c5a55a6..e21c8580ad 100644
--- a/src/gromacs/taskassignment/decidesimulationworkload.cpp
+++ b/src/gromacs/taskassignment/decidesimulationworkload.cpp
@@ -55,11 +55,12 @@ namespace gmx
 SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
                                             const bool disableNonbondedCalculation,
                                             const DevelopmentFeatureFlags& devFlags,
-                                            bool useGpuForNonbonded,
-                                            PmeRunMode pmeRunMode,
-                                            bool useGpuForBonded,
-                                            bool useGpuForUpdate,
-                                            bool useGpuDirectHalo)
+                                            bool havePpDomainDecomposition,
+                                            bool useGpuForNonbonded,
+                                            PmeRunMode pmeRunMode,
+                                            bool useGpuForBonded,
+                                            bool useGpuForUpdate,
+                                            bool useGpuDirectHalo)
 {
     SimulationWorkload simulationWorkload;
     simulationWorkload.computeNonbonded = !disableNonbondedCalculation;
@@ -76,7 +77,9 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
     simulationWorkload.useGpuUpdate = useGpuForUpdate;
     simulationWorkload.useGpuBufferOps = (devFlags.enableGpuBufferOps || useGpuForUpdate)
                                          && !simulationWorkload.computeNonbondedAtMtsLevel1;
-    simulationWorkload.useGpuHaloExchange = useGpuDirectHalo;
+    simulationWorkload.havePpDomainDecomposition = havePpDomainDecomposition;
+    simulationWorkload.useCpuHaloExchange = havePpDomainDecomposition && !useGpuDirectHalo;
+    simulationWorkload.useGpuHaloExchange = useGpuDirectHalo;
     simulationWorkload.useGpuPmePpCommunication =
             devFlags.enableGpuPmePPComm && (pmeRunMode == PmeRunMode::GPU);
     simulationWorkload.useGpuDirectCommunication =
diff --git a/src/gromacs/taskassignment/decidesimulationworkload.h b/src/gromacs/taskassignment/decidesimulationworkload.h
index c18f9a04d1..e56e543d61 100644
--- a/src/gromacs/taskassignment/decidesimulationworkload.h
+++ b/src/gromacs/taskassignment/decidesimulationworkload.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -61,6 +61,7 @@ struct DevelopmentFeatureFlags;
  * \param[in] inputrec The input record
  * \param[in] disableNonbondedCalculation Disable calculation of nonbonded forces
  * \param[in] devFlags The development feature flags
+ * \param[in] havePpDomainDecomposition Whether PP domain decomposition is used in this run.
  * \param[in] useGpuForNonbonded Whether we have short-range nonbonded interaction
  *                               calculations on GPU(s).
  * \param[in] pmeRunMode Run mode indicating what resource PME is executed on.
@@ -73,11 +74,12 @@ struct DevelopmentFeatureFlags;
 SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
                                             bool disableNonbondedCalculation,
                                             const DevelopmentFeatureFlags& devFlags,
-                                            bool useGpuForNonbonded,
-                                            PmeRunMode pmeRunMode,
-                                            bool useGpuForBonded,
-                                            bool useGpuForUpdate,
-                                            bool useGpuDirectHalo);
+                                            bool havePpDomainDecomposition,
+                                            bool useGpuForNonbonded,
+                                            PmeRunMode pmeRunMode,
+                                            bool useGpuForBonded,
+                                            bool useGpuForUpdate,
+                                            bool useGpuDirectHalo);
 
 } // namespace gmx
-- 
2.22.0
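
Reviewer note: the relationship among the three new flags is easiest to see in isolation. Below is a minimal standalone C++ sketch of the derivation that createSimulationWorkload() now performs; SimulationWorkloadSketch and deriveHaloFlags are illustrative stand-ins, not GROMACS code, and the sketch assumes, as the mdrunner() call site is expected to guarantee, that useGpuDirectHalo can only be true when PP domain decomposition is active.

// Minimal standalone sketch of the halo-flag derivation; names are
// illustrative stand-ins, not GROMACS code.
#include <cassert>
#include <cstdio>
#include <initializer_list>

struct SimulationWorkloadSketch
{
    bool havePpDomainDecomposition = false; // PP domain decomposition active
    bool useCpuHaloExchange        = false; // staged/CPU halo communication
    bool useGpuHaloExchange        = false; // direct GPU halo communication
};

// Mirrors the assignments the patch adds to createSimulationWorkload().
SimulationWorkloadSketch deriveHaloFlags(bool havePpDomainDecomposition, bool useGpuDirectHalo)
{
    // Assumption (expected to be guaranteed by the caller in the real code
    // path): direct GPU halo exchange is only selected with decomposition.
    assert(!useGpuDirectHalo || havePpDomainDecomposition);

    SimulationWorkloadSketch w;
    w.havePpDomainDecomposition = havePpDomainDecomposition;
    w.useCpuHaloExchange        = havePpDomainDecomposition && !useGpuDirectHalo;
    w.useGpuHaloExchange        = useGpuDirectHalo;
    return w;
}

int main()
{
    for (bool dd : { false, true })
    {
        for (bool gpuDirectHalo : { false, true })
        {
            if (!dd && gpuDirectHalo)
            {
                continue; // excluded by the assumption above
            }
            const SimulationWorkloadSketch w = deriveHaloFlags(dd, gpuDirectHalo);
            // Exactly one halo-exchange flavor is active iff decomposition is on.
            assert((w.useCpuHaloExchange != w.useGpuHaloExchange) == w.havePpDomainDecomposition);
            std::printf("dd=%d gpuDirectHalo=%d -> cpuHalo=%d gpuHalo=%d\n",
                        dd, gpuDirectHalo, w.useCpuHaloExchange, w.useGpuHaloExchange);
        }
    }
    return 0;
}

Deriving useCpuHaloExchange once at workload-setup time is what enables the simplifications above: do_force() can test a single precomputed flag (e.g. for haveAlreadyWaited) instead of re-evaluating havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange at each use, which is the reduced commrec reliance the commit message refers to.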