From ed556fe1766fa4263c0315b548886dd7d497ff40 Mon Sep 17 00:00:00 2001
From: Alan Gray <alangray3@gmail.com>
Date: Fri, 2 Oct 2020 07:02:31 +0000
Subject: [PATCH] Use workload data structures for GPU halo exchange triggers

Move GPU halo exchange trigger booleans and related conditionals into
workload data structures, and remove unnecessary assertion on GPU
buffer ops being active (since it is now automatically activated when
GPU halo exchange is active).

Partly addresses #3370
---
 src/gromacs/mdlib/sim_util.cpp                | 48 +++++++++----------
 src/gromacs/mdrun/md.cpp                      |  2 +-
 src/gromacs/mdrun/runner.cpp                  | 11 +++--
 src/gromacs/mdtypes/simulation_workload.h     |  4 ++
 src/gromacs/taskassignment/decidegpuusage.cpp | 11 +++++
 src/gromacs/taskassignment/decidegpuusage.h   | 18 +++++++
 .../decidesimulationworkload.cpp              |  5 +-
 .../taskassignment/decidesimulationworkload.h |  4 +-
 8 files changed, 70 insertions(+), 33 deletions(-)

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 2af9a25280..ef27166978 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -911,6 +911,8 @@ static StepWorkload setupStepWorkload(const int                     legacyFlags,
     flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !flags.computeVirial;
     flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps && simulationWork.useGpuPme
                                 && (rankHasPmeDuty || simulationWork.useGpuPmePpCommunication);
+    flags.useGpuXHalo = simulationWork.useGpuHaloExchange;
+    flags.useGpuFHalo = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
 
     return flags;
 }
@@ -1037,12 +1039,10 @@ static void combineMtsForces(const int      numAtoms,
  * \param [in] runScheduleWork               Schedule workload flag structure
  * \param [in] cr                            Communication record object
  * \param [in] fr                            Force record object
- * \param [in] ddUsesGpuDirectCommunication  Whether GPU direct communication is in use
  */
 static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
                                     const t_commrec*            cr,
-                                    t_forcerec*                 fr,
-                                    bool                        ddUsesGpuDirectCommunication)
+                                    t_forcerec*                 fr)
 {
 
     nonbonded_verlet_t*          nbv      = fr->nbv.get();
@@ -1075,13 +1075,13 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
     }
 
     if ((runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr))
-        && !ddUsesGpuDirectCommunication)
+        && !runScheduleWork->simulationWork.useGpuHaloExchange)
     {
         fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
                 stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::Local, true));
     }
 
-    if (ddUsesGpuDirectCommunication)
+    if (runScheduleWork->simulationWork.useGpuHaloExchange)
     {
         fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
                 cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
@@ -1246,17 +1246,6 @@ void do_force(FILE*                               fplog,
         }
     }
 
-    // TODO Update this comment when introducing SimulationWorkload
-    //
-    // The conditions for gpuHaloExchange e.g. using GPU buffer
-    // operations were checked before construction, so here we can
-    // just use it and assert upon any conditions.
-    const bool ddUsesGpuDirectCommunication =
-            ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty()));
-    GMX_ASSERT(!ddUsesGpuDirectCommunication || stepWork.useGpuXBufferOps,
-               "Must use coordinate buffer ops with GPU halo exchange");
-    const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && stepWork.useGpuFBufferOps;
-
     // Copy coordinate from the GPU if update is on the GPU and there
     // are forces to be computed on the CPU, or for the computation of
     // virial, or if host-side data will be transferred from this task
@@ -1265,7 +1254,12 @@ void do_force(FILE*                               fplog,
     // hence copy is not needed.
     const bool haveHostPmePpComms =
             !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
-    const bool haveHostHaloExchangeComms = havePPDomainDecomposition(cr) && !ddUsesGpuDirectCommunication;
+
+    GMX_ASSERT(simulationWork.useGpuHaloExchange
+                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
+               "The GPU halo exchange is active, but it has not been constructed.");
+    const bool haveHostHaloExchangeComms =
+            havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange;
 
     bool gmx_used_in_debug haveCopiedXFromGpu = false;
     if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
@@ -1383,7 +1377,7 @@ void do_force(FILE*                               fplog,
 
         if (simulationWork.useGpuBufferOps)
         {
-            setupGpuForceReductions(runScheduleWork, cr, fr, ddUsesGpuDirectCommunication);
+            setupGpuForceReductions(runScheduleWork, cr, fr);
         }
     }
     else if (!EI_TPI(inputrec->eI) && stepWork.computeNonbondedForces)
@@ -1466,14 +1460,14 @@ void do_force(FILE*                               fplog,
             // to location in do_md where GPU halo exchange is
             // constructed at partitioning, after above stateGpu
             // re-initialization has similarly been refactored
-            if (ddUsesGpuDirectCommunication)
+            if (simulationWork.useGpuHaloExchange)
             {
                 reinitGpuHaloExchange(*cr, stateGpu->getCoordinates(), stateGpu->getForces());
             }
         }
         else
         {
-            if (ddUsesGpuDirectCommunication)
+            if (stepWork.useGpuXHalo)
             {
                 // The following must be called after local setCoordinates (which records an event
                 // when the coordinate data has been copied to the device).
@@ -1496,7 +1490,7 @@ void do_force(FILE*                               fplog,
 
             if (stepWork.useGpuXBufferOps)
             {
-                if (!useGpuPmeOnThisRank && !ddUsesGpuDirectCommunication)
+                if (!useGpuPmeOnThisRank && !stepWork.useGpuXHalo)
                 {
                     stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
                 }
@@ -1706,7 +1700,7 @@ void do_force(FILE*                               fplog,
     }
 
     // TODO Force flags should include haveFreeEnergyWork for this domain
-    if (ddUsesGpuDirectCommunication && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
+    if (stepWork.useGpuXHalo && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
     {
         /* Wait for non-local coordinate data to be copied from device */
         stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
@@ -1792,7 +1786,7 @@ void do_force(FILE*                               fplog,
 
     GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFBufferOps),
                "The schedule below does not allow for nonbonded MTS with GPU buffer ops");
-    GMX_ASSERT(!(nonbondedAtMtsLevel1 && useGpuForcesHaloExchange),
+    GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFHalo),
                "The schedule below does not allow for nonbonded MTS with GPU halo exchange");
     // Will store the amount of cycles spent waiting for the GPU that
     // will be later used in the DLB accounting.
@@ -1832,9 +1826,10 @@ void do_force(FILE*                               fplog,
                                               AtomLocality::NonLocal);
                 }
 
+
                 fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->execute();
 
-                if (!useGpuForcesHaloExchange)
+                if (!stepWork.useGpuFHalo)
                 {
                     // copy from GPU input for dd_move_f()
                     stateGpu->copyForcesFromGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
@@ -1878,7 +1873,8 @@ void do_force(FILE*                               fplog,
 
         if (stepWork.computeForces)
         {
-            if (useGpuForcesHaloExchange)
+
+            if (stepWork.useGpuFHalo)
             {
                 if (domainWork.haveCpuLocalForceWork)
                 {
@@ -2004,7 +2000,7 @@ void do_force(FILE*                               fplog,
             // - copy is not perfomed if GPU force halo exchange is active, because it would overwrite the result
             //   of the halo exchange. In that case the copy is instead performed above, before the exchange.
             //   These should be unified.
-            if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange)
+            if (haveLocalForceContribInCpuBuffer && !stepWork.useGpuFHalo)
             {
                 // Note: AtomLocality::All is used for the non-DD case because, as in this
                 // case copyForcesToGpu() uses a separate stream, it allows overlap of
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp
index e2f5581f3a..bf7aa914be 100644
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -850,7 +850,7 @@ void gmx::LegacySimulator::do_md()
         }
 
         // Allocate or re-size GPU halo exchange object, if necessary
-        if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange && useGpuForNonbonded)
+        if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange)
         {
             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
                                "GPU device manager has to be initialized to use GPU "
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index d32550e182..b22881a05d 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -1259,10 +1259,15 @@ int Mdrunner::mdrunner()
     }
 
     MdrunScheduleWorkload runScheduleWork;
+
+    bool useGpuDirectHalo = decideWhetherToUseGpuForHalo(
+            devFlags, havePPDomainDecomposition(cr), useGpuForNonbonded, useModularSimulator,
+            doRerun, EI_ENERGY_MINIMIZATION(inputrec->eI));
+
     // Also populates the simulation constant workload description.
-    runScheduleWork.simulationWork =
-            createSimulationWorkload(*inputrec, disableNonbondedCalculation, devFlags,
-                                     useGpuForNonbonded, pmeRunMode, useGpuForBonded, useGpuForUpdate);
+    runScheduleWork.simulationWork = createSimulationWorkload(
+            *inputrec, disableNonbondedCalculation, devFlags, useGpuForNonbonded, pmeRunMode,
+            useGpuForBonded, useGpuForUpdate, useGpuDirectHalo);
 
     std::unique_ptr<DeviceStreamManager> deviceStreamManager = nullptr;
 
diff --git a/src/gromacs/mdtypes/simulation_workload.h b/src/gromacs/mdtypes/simulation_workload.h
index f2d08c5847..c24ba0724c 100644
--- a/src/gromacs/mdtypes/simulation_workload.h
+++ b/src/gromacs/mdtypes/simulation_workload.h
@@ -93,6 +93,10 @@ public:
     bool useGpuFBufferOps = false;
     //! Whether PME forces are reduced with other contributions on the GPU this step
     bool useGpuPmeFReduction = false; // TODO: add this flag to the internal PME GPU data structures too
+    //! Whether GPU coordinates halo exchange is active this step
+    bool useGpuXHalo = false;
+    //! Whether GPU forces halo exchange is active this step
+    bool useGpuFHalo = false;
 };
 
 /*! \libinternal
diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp
index 85dfe60e93..be1657af8e 100644
--- a/src/gromacs/taskassignment/decidegpuusage.cpp
+++ b/src/gromacs/taskassignment/decidegpuusage.cpp
@@ -714,4 +714,15 @@ bool decideWhetherToUseGpuForUpdate(const bool                     isDomainDecom
             || (updateTarget == TaskTarget::Auto && devFlags.forceGpuUpdateDefault));
 }
 
+bool decideWhetherToUseGpuForHalo(const DevelopmentFeatureFlags& devFlags,
+                                  bool                           havePPDomainDecomposition,
+                                  bool                           useGpuForNonbonded,
+                                  bool                           useModularSimulator,
+                                  bool                           doRerun,
+                                  bool                           haveEnergyMinimization)
+{
+    return havePPDomainDecomposition && devFlags.enableGpuHaloExchange && useGpuForNonbonded
+           && !useModularSimulator && !doRerun && !haveEnergyMinimization;
+}
+
 } // namespace gmx
diff --git a/src/gromacs/taskassignment/decidegpuusage.h b/src/gromacs/taskassignment/decidegpuusage.h
index 5d15244416..b151c5d6ea 100644
--- a/src/gromacs/taskassignment/decidegpuusage.h
+++ b/src/gromacs/taskassignment/decidegpuusage.h
@@ -301,6 +301,24 @@ bool decideWhetherToUseGpuForUpdate(bool                           isDomainDecom
                                     const gmx::MDLogger&           mdlog);
 
 
+/*! \brief Decide whether to use GPU for halo exchange.
+ *
+ * \param[in]  devFlags                     GPU development / experimental feature flags.
+ * \param[in]  havePPDomainDecomposition    Whether PP domain decomposition is in use.
+ * \param[in]  useGpuForNonbonded           Whether GPUs will be used for nonbonded interactions.
+ * \param[in]  useModularSimulator          Whether modularsimulator is in use.
+ * \param[in]  doRerun                      Whether this is a rerun.
+ * \param[in]  haveEnergyMinimization       Whether energy minimization is in use.
+ *
+ * \returns    Whether halo exchange can be run on GPU.
+ */
+bool decideWhetherToUseGpuForHalo(const DevelopmentFeatureFlags& devFlags,
+                                  bool                           havePPDomainDecomposition,
+                                  bool                           useGpuForNonbonded,
+                                  bool                           useModularSimulator,
+                                  bool                           doRerun,
+                                  bool                           haveEnergyMinimization);
+
 } // namespace gmx
 
 #endif
diff --git a/src/gromacs/taskassignment/decidesimulationworkload.cpp b/src/gromacs/taskassignment/decidesimulationworkload.cpp
index 06b1eeb29d..5c5cdaeb3f 100644
--- a/src/gromacs/taskassignment/decidesimulationworkload.cpp
+++ b/src/gromacs/taskassignment/decidesimulationworkload.cpp
@@ -58,7 +58,8 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
                                             bool                           useGpuForNonbonded,
                                             PmeRunMode                     pmeRunMode,
                                             bool                           useGpuForBonded,
-                                            bool                           useGpuForUpdate)
+                                            bool                           useGpuForUpdate,
+                                            bool                           useGpuDirectHalo)
 {
     SimulationWorkload simulationWorkload;
     simulationWorkload.computeNonbonded = !disableNonbondedCalculation;
@@ -75,7 +76,7 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
     simulationWorkload.useGpuUpdate    = useGpuForUpdate;
     simulationWorkload.useGpuBufferOps = (devFlags.enableGpuBufferOps || useGpuForUpdate)
                                          && !simulationWorkload.computeNonbondedAtMtsLevel1;
-    simulationWorkload.useGpuHaloExchange = devFlags.enableGpuHaloExchange;
+    simulationWorkload.useGpuHaloExchange = useGpuDirectHalo;
     simulationWorkload.useGpuPmePpCommunication =
             devFlags.enableGpuPmePPComm && (pmeRunMode == PmeRunMode::GPU);
     simulationWorkload.useGpuDirectCommunication =
diff --git a/src/gromacs/taskassignment/decidesimulationworkload.h b/src/gromacs/taskassignment/decidesimulationworkload.h
index 5cde42e10c..c18f9a04d1 100644
--- a/src/gromacs/taskassignment/decidesimulationworkload.h
+++ b/src/gromacs/taskassignment/decidesimulationworkload.h
@@ -67,6 +67,7 @@ struct DevelopmentFeatureFlags;
  * \param[in] useGpuForBonded    Whether bonded interactions are calculated on GPU(s).
  * \param[in] useGpuForUpdate    Whether coordinate update and constraint solving is performed on
  *                               GPU(s).
+ * \param[in] useGpuDirectHalo   Whether halo exchange is performed directly between GPUs.
  * \returns Simulation lifetime constant workload description.
  */
 SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
@@ -75,7 +76,8 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
                                             bool                           useGpuForNonbonded,
                                             PmeRunMode                     pmeRunMode,
                                             bool                           useGpuForBonded,
-                                            bool                           useGpuForUpdate);
+                                            bool                           useGpuForUpdate,
+                                            bool                           useGpuDirectHalo);
 
 } // namespace gmx
 
-- 
2.22.0