From d41b5ee390b9bf44a68e7d038cc38d8c5a263996 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= <pall.szilard@gmail.com>
Date: Thu, 19 Aug 2021 13:16:27 +0000
Subject: [PATCH] Add separate PME rank SimluationWorkload flags

Added the complete set of flags that indicate whether separate PME ranks
are used and what type of PP-PME communication is used.

These flags allow simplifications in the force schedule as well as reducing
the use of the confusingly defined cr->duty for now in do_force().

The additional benefit is a reduced reliance on passing around commrec
for checks related to parallelization.

Refs #3913
---
 src/gromacs/mdlib/sim_util.cpp                | 32 +++++++++----------
 src/gromacs/mdrun/runner.cpp                  |  5 +++
 src/gromacs/mdtypes/simulation_workload.h     |  4 +++
 .../decidesimulationworkload.cpp              | 13 +++++++-
 .../taskassignment/decidesimulationworkload.h |  2 ++
 5 files changed, 39 insertions(+), 17 deletions(-)

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index bda10e7649..51777faf13 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1147,19 +1147,19 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
     fr->gpuForceReduction[gmx::AtomLocality::Local]->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv));
 
     if (runScheduleWork->simulationWork.useGpuPme
-        && (thisRankHasDuty(cr, DUTY_PME) || runScheduleWork->simulationWork.useGpuPmePpCommunication))
+        && (!runScheduleWork->simulationWork.haveSeparatePmeRank
+            || runScheduleWork->simulationWork.useGpuPmePpCommunication))
     {
         DeviceBuffer<gmx::RVec> forcePtr =
-                thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata)
-                                              :                    // PME force buffer on same GPU
-                        fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU
+                runScheduleWork->simulationWork.haveSeparatePmeRank
+                        ? fr->pmePpCommGpu->getGpuForceStagingPtr() // buffer received from other GPU
+                        : pme_gpu_get_device_f(fr->pmedata);        // PME force buffer on same GPU
         fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr);
 
         GpuEventSynchronizer* const pmeSynchronizer =
-                (thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_f_ready_synchronizer(fr->pmedata)
-                                               : // PME force buffer on same GPU
-                         fr->pmePpCommGpu->getForcesReadySynchronizer()); // buffer received from other GPU
-
+                (runScheduleWork->simulationWork.haveSeparatePmeRank
+                         ? fr->pmePpCommGpu->getForcesReadySynchronizer() // buffer received from other GPU
+                         : pme_gpu_get_f_ready_synchronizer(fr->pmedata)); // PME force buffer on same GPU
         if (GMX_THREAD_MPI)
         {
             GMX_ASSERT(pmeSynchronizer != nullptr, "PME force ready cuda event should not be NULL");
@@ -1315,11 +1315,10 @@ void do_force(FILE*                               fplog,
     // to a remote task for halo exchange or PME-PP communication. At
     // search steps the current coordinates are already on the host,
     // hence copy is not needed.
-    const bool haveHostPmePpComms =
-            !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
     if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
         && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
-            || haveHostPmePpComms || simulationWork.useCpuHaloExchange || simulationWork.computeMuTot))
+            || simulationWork.useCpuPmePpCommunication || simulationWork.useCpuHaloExchange
+            || simulationWork.computeMuTot))
     {
         stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
         haveCopiedXFromGpu = true;
@@ -1352,7 +1351,7 @@ void do_force(FILE*                               fplog,
         }
     }
 
-    if (!thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces)
+    if (simulationWork.haveSeparatePmeRank && stepWork.computeSlowForces)
     {
         /* Send particle coordinates to the pme nodes */
         if (!pmeSendCoordinatesFromGpu && !stepWork.doNeighborSearch && simulationWork.useGpuUpdate)
@@ -1709,7 +1708,7 @@ void do_force(FILE*                               fplog,
     /* Reset energies */
     reset_enerdata(enerd);
 
-    if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
+    if (DOMAINDECOMP(cr) && simulationWork.haveSeparatePmeRank)
     {
         wallcycle_start(wcycle, WallCycleCounter::PpDuringPme);
         dd_force_flop_start(cr->dd, nrnb);
@@ -2214,7 +2213,7 @@ void do_force(FILE*                               fplog,
 
     // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
     // TODO refactor this and unify with below default-path call to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && simulationWork.useGpuPmePpCommunication
+    if (PAR(cr) && simulationWork.haveSeparatePmeRank && simulationWork.useGpuPmePpCommunication
         && stepWork.computeSlowForces)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
@@ -2272,7 +2271,8 @@ void do_force(FILE*                               fplog,
             // NOTE: If there are virtual sites, the forces are modified on host after this D2H copy. Hence,
             //       they should not be copied in do_md(...) for the output.
             if (!simulationWork.useGpuUpdate
-                || (simulationWork.useGpuUpdate && DOMAINDECOMP(cr) && haveHostPmePpComms) || vsite)
+                || (simulationWork.useGpuUpdate && DOMAINDECOMP(cr) && simulationWork.useCpuPmePpCommunication)
+                || vsite)
             {
                 stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
                 stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
@@ -2308,7 +2308,7 @@ void do_force(FILE*                               fplog,
     }
 
     // TODO refactor this and unify with above GPU PME-PP / GPU update path call to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication
+    if (PAR(cr) && simulationWork.haveSeparatePmeRank && simulationWork.useCpuPmePpCommunication
         && stepWork.computeSlowForces)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index 2462285bef..64ada7d8d8 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -1453,10 +1453,15 @@ int Mdrunner::mdrunner()
                                                          EI_ENERGY_MINIMIZATION(inputrec->eI));
 
     // Also populates the simulation constant workload description.
+    // Note: currently the default duty is DUTY_PP | DUTY_PME for all simulations, including those without PME,
+    // so this boolean is sufficient on all ranks to determine whether separate PME ranks are used,
+    // but this will no longer be the case if cr->duty is changed for !EEL_PME(fr->ic->eeltype).
+    const bool haveSeparatePmeRank = (!thisRankHasDuty(cr, DUTY_PP) || !thisRankHasDuty(cr, DUTY_PME));
     runScheduleWork.simulationWork = createSimulationWorkload(*inputrec,
                                                               disableNonbondedCalculation,
                                                               devFlags,
                                                               havePPDomainDecomposition(cr),
+                                                              haveSeparatePmeRank,
                                                               useGpuForNonbonded,
                                                               pmeRunMode,
                                                               useGpuForBonded,
diff --git a/src/gromacs/mdtypes/simulation_workload.h b/src/gromacs/mdtypes/simulation_workload.h
index a00197dc18..1a2106e2b2 100644
--- a/src/gromacs/mdtypes/simulation_workload.h
+++ b/src/gromacs/mdtypes/simulation_workload.h
@@ -187,6 +187,10 @@ public:
     bool useCpuHaloExchange = false;
     //! If domain decomposition halo exchange is performed on GPU.
     bool useGpuHaloExchange = false;
+    //! If separate PME rank(s) are used.
+    bool haveSeparatePmeRank = false;
+    //! If PP-PME communication is done purely on CPU (in CPU-only runs or with staged GPU communication).
+    bool useCpuPmePpCommunication = false;
     //! If direct PP-PME communication between GPU is used.
     bool useGpuPmePpCommunication = false;
     //! If direct GPU-GPU communication is enabled.
diff --git a/src/gromacs/taskassignment/decidesimulationworkload.cpp b/src/gromacs/taskassignment/decidesimulationworkload.cpp
index e21c8580ad..f3a3a5cdec 100644
--- a/src/gromacs/taskassignment/decidesimulationworkload.cpp
+++ b/src/gromacs/taskassignment/decidesimulationworkload.cpp
@@ -56,6 +56,7 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
                                             const bool        disableNonbondedCalculation,
                                             const DevelopmentFeatureFlags& devFlags,
                                             bool       havePpDomainDecomposition,
+                                            bool       haveSeparatePmeRank,
                                             bool       useGpuForNonbonded,
                                             PmeRunMode pmeRunMode,
                                             bool       useGpuForBonded,
@@ -80,8 +81,18 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
     simulationWorkload.havePpDomainDecomposition = havePpDomainDecomposition;
     simulationWorkload.useCpuHaloExchange        = havePpDomainDecomposition && !useGpuDirectHalo;
     simulationWorkload.useGpuHaloExchange        = useGpuDirectHalo;
+    if (pmeRunMode == PmeRunMode::None)
+    {
+        GMX_RELEASE_ASSERT(!haveSeparatePmeRank, "Can not have separate PME rank(s) without PME.");
+    }
+    simulationWorkload.haveSeparatePmeRank = haveSeparatePmeRank;
     simulationWorkload.useGpuPmePpCommunication =
-            devFlags.enableGpuPmePPComm && (pmeRunMode == PmeRunMode::GPU);
+            haveSeparatePmeRank && devFlags.enableGpuPmePPComm && (pmeRunMode == PmeRunMode::GPU);
+    simulationWorkload.useCpuPmePpCommunication =
+            haveSeparatePmeRank && !simulationWorkload.useGpuPmePpCommunication;
+    GMX_RELEASE_ASSERT(!(simulationWorkload.useGpuPmePpCommunication
+                         && simulationWorkload.useCpuPmePpCommunication),
+                       "Cannot do PME-PP communication on both CPU and GPU");
     simulationWorkload.useGpuDirectCommunication =
             devFlags.enableGpuHaloExchange || devFlags.enableGpuPmePPComm;
     simulationWorkload.haveEwaldSurfaceContribution = haveEwaldSurfaceContribution(inputrec);
diff --git a/src/gromacs/taskassignment/decidesimulationworkload.h b/src/gromacs/taskassignment/decidesimulationworkload.h
index e56e543d61..3ea543e562 100644
--- a/src/gromacs/taskassignment/decidesimulationworkload.h
+++ b/src/gromacs/taskassignment/decidesimulationworkload.h
@@ -62,6 +62,7 @@ struct DevelopmentFeatureFlags;
  * \param[in] disableNonbondedCalculation  Disable calculation of nonbonded forces
  * \param[in] devFlags           The development feature flags
  * \param[in] havePpDomainDecomposition Whether PP domain decomposition is used in this run.
+ * \param[in] haveSeparatePmeRank Whether separate PME rank(s) are used in this run.
  * \param[in] useGpuForNonbonded Whether we have short-range nonbonded interactions
  *                               calculations on GPU(s).
  * \param[in] pmeRunMode         Run mode indicating what resource is PME execured on.
@@ -75,6 +76,7 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
                                             bool              disableNonbondedCalculation,
                                             const DevelopmentFeatureFlags& devFlags,
                                             bool       havePpDomainDecomposition,
+                                            bool       haveSeparatePmeRank,
                                             bool       useGpuForNonbonded,
                                             PmeRunMode pmeRunMode,
                                             bool       useGpuForBonded,
-- 
2.22.0