From 2e4259ae7c3f011b30fb03f14b3b9426ce39e088 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= <pall.szilard@gmail.com>
Date: Wed, 23 Oct 2019 19:17:46 +0200
Subject: [PATCH] Extend SimulationWorkload with CPU flags

Added flags for PME and Nonbondeds to indicate whether there is CPU
workload; this is useful as the lack of GPU work does not imply the
existence of CPU work.

Also made createSimulationWorkload() take the PmeRunMode enum class
instead of two separate PME-related bools.

Made some naming consistency improvements.

Refs #3181

Change-Id: I66233f1c790fc5092fb1babaed2ec3ebf16416de
---
 src/gromacs/mdlib/sim_util.cpp                | 20 +++++-----
 src/gromacs/mdrun/md.cpp                      |  2 +-
 src/gromacs/mdrun/runner.cpp                  |  3 +-
 src/gromacs/mdtypes/simulation_workload.h     | 10 +++--
 .../decidesimulationworkload.cpp              | 39 ++++++++++---------
 .../taskassignment/decidesimulationworkload.h | 20 +++++-----
 6 files changed, 49 insertions(+), 45 deletions(-)

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index f9fd797164..249af42f5f 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -832,8 +832,8 @@ setupStepWorkload(const int                 legacyFlags,
     // on virial steps the CPU reduction path is taken
     // TODO: remove flags.computeEnergy, ref #3128
     flags.useGpuFBufferOps    = simulationWork.useGpuBufferOps && !(flags.computeVirial || flags.computeEnergy);
-    flags.useGpuPmeFReduction = flags.useGpuFBufferOps && (simulationWork.usePmeGpu &&
-                                                           (rankHasPmeDuty || simulationWork.useGpuPmePPCommunication));
+    flags.useGpuPmeFReduction = flags.useGpuFBufferOps && (simulationWork.useGpuPme &&
+                                                           (rankHasPmeDuty || simulationWork.useGpuPmePpCommunication));
 
     return flags;
 }
@@ -943,7 +943,7 @@ void do_force(FILE                                     *fplog,
     const StepWorkload &stepWork = runScheduleWork->stepWork;
 
 
-    const bool useGpuPmeOnThisRank = simulationWork.usePmeGpu && thisRankHasDuty(cr, DUTY_PME);
+    const bool useGpuPmeOnThisRank = simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME);
     const int  pmeFlags            = makePmeFlags(stepWork);
 
     // Switches on whether to use GPU for position and force buffer operations
@@ -1037,12 +1037,12 @@ void do_force(FILE                                     *fplog,
          * and domain decomposition does not use the graph,
          * we do not need to worry about shifting.
          */
-        bool reinitGpuPmePpComms    = simulationWork.useGpuPmePPCommunication && (stepWork.doNeighborSearch);
-        bool sendCoordinatesFromGpu = simulationWork.useGpuPmePPCommunication && !(stepWork.doNeighborSearch);
+        bool reinitGpuPmePpComms    = simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
+        bool sendCoordinatesFromGpu = simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
         gmx_pme_send_coordinates(fr, cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
                                  lambda[efptCOUL], lambda[efptVDW],
                                  (stepWork.computeVirial || stepWork.computeEnergy),
-                                 step, simulationWork.useGpuPmePPCommunication, reinitGpuPmePpComms,
+                                 step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
                                  sendCoordinatesFromGpu, wcycle);
     }
 #endif /* GMX_MPI */
@@ -1673,12 +1673,12 @@ void do_force(FILE                                     *fplog,
 
     // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
     // TODO refoactor this and unify with below default-path call to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && simulationWork.useGpuPmePPCommunication)
+    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && simulationWork.useGpuPmePpCommunication)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
          */
-        pme_receive_force_ener(fr, cr, &forceOut.forceWithVirial(), enerd, simulationWork.useGpuPmePPCommunication, stepWork.useGpuPmeFReduction, wcycle);
+        pme_receive_force_ener(fr, cr, &forceOut.forceWithVirial(), enerd, simulationWork.useGpuPmePpCommunication, stepWork.useGpuPmeFReduction, wcycle);
     }
 
 
@@ -1791,13 +1791,13 @@ void do_force(FILE                                     *fplog,
     }
 
     // TODO refoactor this and unify with above PME-PP GPU communication path call to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePPCommunication)
+    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
          */
         pme_receive_force_ener(fr, cr, &forceOut.forceWithVirial(), enerd,
-                               simulationWork.useGpuPmePPCommunication, false, wcycle);
+                               simulationWork.useGpuPmePpCommunication, false, wcycle);
     }
 
     if (stepWork.computeForces)
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp
index 803dfb1103..db4548c946 100644
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -331,7 +331,7 @@ void gmx::LegacySimulator::do_md()
 //       2. The proper GPU syncronization is introduced, so that the H2D and D2H data copies can be performed in the separate
 //          stream owned by the StatePropagatorDataGpu
     const auto &simulationWork     = runScheduleWork->simulationWork;
-    const bool  useGpuForPme       = simulationWork.usePmeGpu;
+    const bool  useGpuForPme       = simulationWork.useGpuPme;
     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
     // Temporary solution to make sure that the buffer ops are offloaded when update is offloaded
     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index 5b7788ee67..bf6532b3e0 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -1593,8 +1593,7 @@ int Mdrunner::mdrunner()
         MdrunScheduleWorkload runScheduleWork;
         // Also populates the simulation constant workload description.
         runScheduleWork.simulationWork = createSimulationWorkload(useGpuForNonbonded,
-                                                                  useGpuForPme,
-                                                                  (pmeRunMode == PmeRunMode::GPU),
+                                                                  pmeRunMode,
                                                                   useGpuForBonded,
                                                                   useGpuForUpdate,
                                                                   devFlags.enableGpuBufferOps,
diff --git a/src/gromacs/mdtypes/simulation_workload.h b/src/gromacs/mdtypes/simulation_workload.h
index 147199771c..a95ae9591b 100644
--- a/src/gromacs/mdtypes/simulation_workload.h
+++ b/src/gromacs/mdtypes/simulation_workload.h
@@ -134,12 +134,16 @@ class DomainLifetimeWorkload
 class SimulationWorkload
 {
     public:
+        //! If we have calculation of short range nonbondeds on CPU
+        bool useCpuNonbonded           = false;
         //! If we have calculation of short range nonbondeds on GPU
         bool useGpuNonbonded           = false;
+        //! If we have calculation of long range PME on CPU
+        bool useCpuPme                 = false;
         //! If we have calculation of long range PME in GPU
-        bool usePmeGpu                 = false;
+        bool useGpuPme                 = false;
         //! If PME FFT solving is done on GPU.
-        bool usePmeFftGpu              = false;
+        bool useGpuPmeFft              = false;
         //! If bonded interactions are calculated on GPU.
         bool useGpuBonded              = false;
         //! If update and constraint solving is performed on GPU.
@@ -149,7 +153,7 @@ class SimulationWorkload
         //! If domain decomposition halo exchange is performed on GPU.
         bool useGpuHaloExchange        = false;
         //! If direct PP-PME communication between GPU is used.
-        bool useGpuPmePPCommunication  = false;
+        bool useGpuPmePpCommunication  = false;
         //! If direct GPU-GPU communication is enabled.
         bool useGpuDirectCommunication = false;
 };
diff --git a/src/gromacs/taskassignment/decidesimulationworkload.cpp b/src/gromacs/taskassignment/decidesimulationworkload.cpp
index 17c3dc06af..597cf1a815 100644
--- a/src/gromacs/taskassignment/decidesimulationworkload.cpp
+++ b/src/gromacs/taskassignment/decidesimulationworkload.cpp
@@ -43,32 +43,33 @@
 
 #include "decidesimulationworkload.h"
 
+#include "gromacs/ewald/pme.h"
 #include "gromacs/taskassignment/taskassignment.h"
 #include "gromacs/utility/arrayref.h"
 
 namespace gmx
 {
 
-SimulationWorkload createSimulationWorkload(bool useGpuForNonbonded,
-                                            bool useGpuForPme,
-                                            bool useGpuForPmeFft,
-                                            bool useGpuForBonded,
-                                            bool useGpuForUpdateConstraints,
-                                            bool useGpuForBufferOps,
-                                            bool useGpuHaloExchange,
-                                            bool useGpuPmePpComm)
+SimulationWorkload createSimulationWorkload(bool       useGpuForNonbonded,
+                                            PmeRunMode pmeRunMode,
+                                            bool       useGpuForBonded,
+                                            bool       useGpuForUpdateConstraints,
+                                            bool       useGpuForBufferOps,
+                                            bool       useGpuHaloExchange,
+                                            bool       useGpuPmePpComm)
 {
-    SimulationWorkload simulationWorkload {
-        useGpuForNonbonded,
-        useGpuForPme,
-        useGpuForPmeFft,
-        useGpuForBonded,
-        useGpuForUpdateConstraints,
-        useGpuForBufferOps,
-        useGpuHaloExchange,
-        useGpuPmePpComm,
-        useGpuHaloExchange || useGpuPmePpComm
-    };
+    SimulationWorkload simulationWorkload;
+    simulationWorkload.useCpuNonbonded           = !useGpuForNonbonded;
+    simulationWorkload.useGpuNonbonded           = useGpuForNonbonded;
+    simulationWorkload.useCpuPme                 = (pmeRunMode == PmeRunMode::CPU);
+    simulationWorkload.useGpuPme                 = (pmeRunMode == PmeRunMode::GPU || pmeRunMode == PmeRunMode::Mixed);
+    simulationWorkload.useGpuPmeFft              = (pmeRunMode == PmeRunMode::GPU); // GPU FFT only in full-GPU PME mode (the value the caller passed before)
+    simulationWorkload.useGpuBonded              = useGpuForBonded;
+    simulationWorkload.useGpuUpdate              = useGpuForUpdateConstraints;
+    simulationWorkload.useGpuBufferOps           = useGpuForBufferOps;
+    simulationWorkload.useGpuHaloExchange        = useGpuHaloExchange;
+    simulationWorkload.useGpuPmePpCommunication  = useGpuPmePpComm;
+    simulationWorkload.useGpuDirectCommunication = useGpuHaloExchange || useGpuPmePpComm;
 
     return simulationWorkload;
 }
diff --git a/src/gromacs/taskassignment/decidesimulationworkload.h b/src/gromacs/taskassignment/decidesimulationworkload.h
index 664a307b93..0e87da68c5 100644
--- a/src/gromacs/taskassignment/decidesimulationworkload.h
+++ b/src/gromacs/taskassignment/decidesimulationworkload.h
@@ -46,6 +46,8 @@
 
 #include "gromacs/mdtypes/simulation_workload.h"
 
+enum class PmeRunMode;
+
 namespace gmx
 {
 
@@ -55,8 +57,7 @@ namespace gmx
  *
  * \param[in] useGpuForNonbonded If we have short-range nonbonded interactions
  *                               calculations on GPU(s).
- * \param[in] useGpuForPme       If long range PME interactions are calculated on GPU(s).
- * \param[in] useGpuForPmeFft    If FFT solving for PME is done on the GPU.
+ * \param[in] pmeRunMode         Run mode indicating which resource PME is executed on.
  * \param[in] useGpuForBonded    If bonded interactions are calculated on GPU(s).
  * \param[in] useGpuForUpdateConstraints If coordinate update and constraint solving is performed on
  *                                       GPU(s).
@@ -65,14 +66,13 @@ namespace gmx
  * \param[in] useGpuPmePpComm    If GPu direct communication is used in PME-PP communication.
  * \returns Simulation lifetime constant workload description.
  */
-SimulationWorkload createSimulationWorkload(bool useGpuForNonbonded,
-                                            bool useGpuForPme,
-                                            bool useGpuForPmeFft,
-                                            bool useGpuForBonded,
-                                            bool useGpuForUpdateConstraints,
-                                            bool useGpuForBufferOps,
-                                            bool useGpuHaloExchange,
-                                            bool useGpuPmePpComm);
+SimulationWorkload createSimulationWorkload(bool       useGpuForNonbonded,
+                                            PmeRunMode pmeRunMode,
+                                            bool       useGpuForBonded,
+                                            bool       useGpuForUpdateConstraints,
+                                            bool       useGpuForBufferOps,
+                                            bool       useGpuHaloExchange,
+                                            bool       useGpuPmePpComm);
 
 
 }  // namespace gmx
-- 
2.22.0