From 78956ab8cc33987eacc995cf578e8869cbda5e6a Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?=
Date: Wed, 9 Oct 2019 02:53:24 +0200
Subject: [PATCH] Move buffer ops / PME F reduction flags into StepWorkload

Also moved the override conditions for when buffer ops cannot be
offloaded into the DevelopmentFeatureFlags data structure
initialization, which had to be shifted so this code can be passed
the task assignment decision on nonbonded offload.

Change-Id: Ib6850bcf306a70bbd9557cf2d5c2b1e39159e566
---
 src/gromacs/mdlib/sim_util.cpp            | 55 ++++++++++++++---
 src/gromacs/mdrun/runner.cpp              | 17 ++++---
 src/gromacs/mdtypes/simulation_workload.h | 10 +++++
 3 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 8386e34139..32c30aab8f 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -613,17 +613,15 @@ static int makePmeFlags(const StepWorkload &stepWork)
  * \param[in]  box                   The box matrix
  * \param[in]  stepWork              Step schedule flags
  * \param[in]  pmeFlags              PME flags
- * \param[in]  useGpuForceReduction  True if GPU-based force reduction is active this step
  * \param[in]  wcycle                The wallcycle structure
  */
 static inline void launchPmeGpuSpread(gmx_pme_t          *pmedata,
                                       const matrix        box,
                                       const StepWorkload &stepWork,
                                       int                 pmeFlags,
-                                      bool                useGpuForceReduction,
                                       gmx_wallcycle_t     wcycle)
 {
-    pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, useGpuForceReduction);
+    pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, stepWork.useGpuPmeFReduction);
     pme_gpu_launch_spread(pmedata, wcycle);
 }
 
@@ -795,11 +793,16 @@ setupDomainLifetimeWorkload(const t_inputrec &inputrec,
  *
  * \param[in] legacyFlags   Force bitmask flags used to construct the new flags
  * \param[in] isNonbondedOn Global override, if false forces to turn off all nonbonded calculation.
+ * \param[in] simulationWork Simulation workload description.
+ * \param[in] rankHasPmeDuty If this rank computes PME.
+ *
  * \returns New StepWorkload description.
  */
 static StepWorkload
-setupStepWorkload(const int  legacyFlags,
-                  const bool isNonbondedOn)
+setupStepWorkload(const int                 legacyFlags,
+                  const bool                isNonbondedOn,
+                  const SimulationWorkload &simulationWork,
+                  const bool                rankHasPmeDuty)
 {
     StepWorkload flags;
     flags.stateChanged = ((legacyFlags & GMX_FORCE_STATECHANGED) != 0);
@@ -811,6 +814,17 @@ setupStepWorkload(const int legacyFlags,
     flags.computeListedForces    = ((legacyFlags & GMX_FORCE_LISTED) != 0);
     flags.computeNonbondedForces = ((legacyFlags & GMX_FORCE_NONBONDED) != 0) && isNonbondedOn;
     flags.computeDhdl            = ((legacyFlags & GMX_FORCE_DHDL) != 0);
+
+    if (simulationWork.useGpuBufferOps)
+    {
+        GMX_ASSERT(simulationWork.useGpuNonbonded, "Can only offload buffer ops if nonbonded computation is also offloaded");
+    }
+    flags.useGpuXBufferOps = simulationWork.useGpuBufferOps;
+    // on virial steps the CPU reduction path is taken
+    // TODO: remove flags.computeEnergy, ref #3128
+    flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !(flags.computeVirial || flags.computeEnergy);
+    flags.useGpuPmeFReduction = flags.useGpuFBufferOps && (simulationWork.usePmeGpu && rankHasPmeDuty);
+
     return flags;
 }
 
@@ -911,25 +925,22 @@ void do_force(FILE *fplog,
         legacyFlags &= ~GMX_FORCE_NONBONDED;
     }
 
-    runScheduleWork->stepWork = setupStepWorkload(legacyFlags, fr->bNonbonded);
-    const StepWorkload &stepWork = runScheduleWork->stepWork;
-    const SimulationWorkload &simulationWork = runScheduleWork->simulationWork;
-    const bool useGpuPmeOnThisRank = simulationWork.usePmeGpu && thisRankHasDuty(cr, DUTY_PME);
-    const int  pmeFlags            = makePmeFlags(stepWork);
+
+    runScheduleWork->stepWork = setupStepWorkload(legacyFlags, fr->bNonbonded,
+                                                  simulationWork, thisRankHasDuty(cr, DUTY_PME));
+    const StepWorkload &stepWork = runScheduleWork->stepWork;
+
+
+    const bool useGpuPmeOnThisRank = simulationWork.usePmeGpu && thisRankHasDuty(cr, DUTY_PME);
+    const int  pmeFlags            = makePmeFlags(stepWork);
 
     // Switches on whether to use GPU for position and force buffer operations
     // TODO consider all possible combinations of triggers, and how to combine optimally in each case.
-    const BufferOpsUseGpu useGpuXBufOps = (simulationWork.useGpuBufferOps &&
-                                           simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA)) ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;;
+    const BufferOpsUseGpu useGpuXBufOps = stepWork.useGpuXBufferOps ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
     // GPU Force buffer ops are disabled on virial steps, because the virial calc is not yet ported to GPU
-    const BufferOpsUseGpu useGpuFBufOps = ((simulationWork.useGpuBufferOps &&
-                                            simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA)) &&
-                                           !(stepWork.computeVirial || stepWork.computeEnergy)) ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
-    // TODO: move / add this flag to the internal PME GPU data structures
-    const bool useGpuPmeFReduction = (useGpuFBufOps == BufferOpsUseGpu::True) &&
-                                     useGpuPmeOnThisRank; // only supported if this rank is perfoming PME on the GPU
+    const BufferOpsUseGpu useGpuFBufOps = stepWork.useGpuFBufferOps ?
+                                          BufferOpsUseGpu::True : BufferOpsUseGpu::False;
 
     /* At a search step we need to start the first balancing region
      * somewhere early inside the step after communication during domain
@@ -1019,7 +1030,7 @@ void do_force(FILE *fplog,
 
     if (useGpuPmeOnThisRank)
     {
-        launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, useGpuPmeFReduction, wcycle);
+        launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, wcycle);
     }
 
     /* do gridding for pair search */
@@ -1533,7 +1544,7 @@ void do_force(FILE *fplog,
     }
     const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True);
-    const bool useCpuPmeFReduction      = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
+    const bool useCpuPmeFReduction      = thisRankHasDuty(cr, DUTY_PME) && !stepWork.useGpuPmeFReduction;
     // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
     const bool haveCpuLocalForces       = (domainWork.haveSpecialForces || domainWork.haveCpuListedForceWork || useCpuPmeFReduction ||
                                            (fr->efep != efepNO));
@@ -1636,7 +1647,7 @@ void do_force(FILE *fplog,
     {
         gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> dependencyList;
 
-        if (useGpuPmeFReduction)
+        if (stepWork.useGpuPmeFReduction)
         {
             dependencyList.push_back(pme_gpu_get_f_ready_synchronizer(fr->pmedata));
         }
@@ -1679,7 +1690,7 @@ void do_force(FILE *fplog,
                                                   stateGpu->getForces(),
                                                   pme_gpu_get_device_f(fr->pmedata),
                                                   dependencyList,
-                                                  useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
+                                                  stepWork.useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
             // This function call synchronizes the local stream
             nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
             stateGpu->copyForcesFromGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local);
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index fdc93ac07e..75dc6d6fea 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -194,14 +194,16 @@ struct DevelopmentFeatureFlags
  * Note that some development features overrides are applied already here:
  * the GPU communication flags are set to false in non-tMPI and non-CUDA builds.
  *
- * \param[in]  mdlog              Logger object.
- * \returns                       The object populated with development feature flags.
+ * \param[in]  mdlog              Logger object.
+ * \param[in]  useGpuForNonbonded True if the nonbonded task is offloaded in this run.
+ * \returns                       The object populated with development feature flags.
  */
-static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger &mdlog)
+static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger &mdlog,
+                                                         const bool           useGpuForNonbonded)
 {
     DevelopmentFeatureFlags devFlags;
 
-    devFlags.enableGpuBufferOps    = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
+    devFlags.enableGpuBufferOps    = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr) && (GMX_GPU == GMX_GPU_CUDA) && useGpuForNonbonded;
     devFlags.useGpuUpdateConstrain = (getenv("GMX_UPDATE_CONSTRAIN_GPU") != nullptr);
     devFlags.enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
     devFlags.enableGpuPmePPComm    = (getenv("GMX_GPU_PME_PP_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
@@ -683,9 +685,6 @@ int Mdrunner::mdrunner()
     gmx::LoggerOwner logOwner(buildLogger(fplog, isSimulationMasterRank));
     gmx::MDLogger    mdlog(logOwner.logger());
 
-    // report any development features that may be enabled by environment variables
-    const DevelopmentFeatureFlags devFlags = manageDevelopmentFeatures(mdlog);
-
     // TODO The thread-MPI master rank makes a working
     // PhysicalNodeCommunicator here, but it gets rebuilt by all ranks
     // after the threads have been launched. This works because no use
@@ -845,6 +844,10 @@
     }
     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 
+    // Initialize the development feature flags that are enabled by environment
+    // variables and report those features that are enabled.
+    const DevelopmentFeatureFlags devFlags = manageDevelopmentFeatures(mdlog, useGpuForNonbonded);
+
     // Build restraints.
     // TODO: hide restraint implementation details from Mdrunner.
     // There is nothing unique about restraints at this point as far as the
diff --git a/src/gromacs/mdtypes/simulation_workload.h b/src/gromacs/mdtypes/simulation_workload.h
index e20bcc9e7c..147199771c 100644
--- a/src/gromacs/mdtypes/simulation_workload.h
+++ b/src/gromacs/mdtypes/simulation_workload.h
@@ -77,6 +77,16 @@ class StepWorkload
     bool computeListedForces = false;
     //! Whether this step DHDL needs to be computed
     bool computeDhdl = false;
+    /*! \brief Whether coordinate buffer ops are done on the GPU this step
+     * \note This technically belongs to DomainLifetimeWorkload, but because
+     * the flag is needed before DomainLifetimeWorkload is built, we keep
+     * it here for now.
+     */
+    bool useGpuXBufferOps = false;
+    //! Whether force buffer ops are done on the GPU this step
+    bool useGpuFBufferOps = false;
+    //! Whether PME forces are reduced with other contributions on the GPU this step
+    bool useGpuPmeFReduction = false; // TODO: add this flag to the internal PME GPU data structures too
 };
 
 /*! \libinternal
-- 
2.22.0
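
For readers outside the GROMACS tree, the per-step logic that this patch centralizes in setupStepWorkload() can be read in isolation. Below is a minimal standalone sketch, not the GROMACS implementation: the two workload structs are reduced to the fields used here, GMX_ASSERT is replaced by assert(), and deriveBufferOpsFlags() is a hypothetical name standing in for the relevant part of setupStepWorkload().

// Minimal sketch of the flag derivation introduced in setupStepWorkload()
// above; GROMACS types are reduced to plain structs so this compiles alone.
#include <cassert>

struct SimulationWorkload
{
    bool useGpuNonbonded = false; // nonbonded task offloaded for the whole run
    bool useGpuBufferOps = false; // buffer ops offload requested and permitted
    bool usePmeGpu       = false; // PME runs on a GPU in this run
};

struct StepWorkload
{
    bool computeVirial       = false;
    bool computeEnergy       = false;
    bool useGpuXBufferOps    = false;
    bool useGpuFBufferOps    = false;
    bool useGpuPmeFReduction = false;
};

StepWorkload deriveBufferOpsFlags(const SimulationWorkload &simulationWork,
                                  bool computeVirial, bool computeEnergy,
                                  bool rankHasPmeDuty)
{
    // Buffer ops can only be offloaded together with the nonbonded task.
    assert(!simulationWork.useGpuBufferOps || simulationWork.useGpuNonbonded);

    StepWorkload flags;
    flags.computeVirial    = computeVirial;
    flags.computeEnergy    = computeEnergy;
    flags.useGpuXBufferOps = simulationWork.useGpuBufferOps;
    // On virial/energy steps the CPU force-reduction path is taken.
    flags.useGpuFBufferOps = simulationWork.useGpuBufferOps
                             && !(computeVirial || computeEnergy);
    // GPU PME force reduction additionally requires that this rank runs PME
    // on a GPU; otherwise the PME forces are reduced on the CPU.
    flags.useGpuPmeFReduction = flags.useGpuFBufferOps
                                && simulationWork.usePmeGpu && rankHasPmeDuty;
    return flags;
}

int main()
{
    SimulationWorkload simulationWork;
    simulationWork.useGpuNonbonded = true;
    simulationWork.useGpuBufferOps = true;
    simulationWork.usePmeGpu       = true;

    // A virial step: X buffer ops stay on the GPU, but the force reduction
    // (and with it the GPU PME force reduction) falls back to the CPU.
    const StepWorkload step = deriveBufferOpsFlags(simulationWork,
                                                   /*computeVirial=*/true,
                                                   /*computeEnergy=*/false,
                                                   /*rankHasPmeDuty=*/true);
    assert(step.useGpuXBufferOps);
    assert(!step.useGpuFBufferOps && !step.useGpuPmeFReduction);
    return 0;
}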
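
The runner.cpp side follows the same pattern: the GMX_USE_GPU_BUFFER_OPS environment variable alone no longer enables the feature. The sketch below mirrors that gating under stated assumptions: the two c_build* constants are stand-ins for the GMX_GPU == GMX_GPU_CUDA and GMX_THREAD_MPI build-time values, and the function signature is simplified from the one in the patch.

// Sketch of the relocated development-feature gating: the buffer-ops feature
// now also requires a CUDA build and nonbonded offload, which is why the
// initialization had to move to after task assignment in Mdrunner::mdrunner().
#include <cstdlib>

constexpr bool c_buildIsCuda      = true; // stand-in for GMX_GPU == GMX_GPU_CUDA
constexpr bool c_buildIsThreadMpi = true; // stand-in for GMX_THREAD_MPI

struct DevelopmentFeatureFlags
{
    bool enableGpuBufferOps    = false;
    bool enableGpuHaloExchange = false;
    bool enableGpuPmePPComm    = false;
};

DevelopmentFeatureFlags manageDevelopmentFeatures(bool useGpuForNonbonded)
{
    DevelopmentFeatureFlags devFlags;
    // The env var only takes effect when the build and the task-assignment
    // decision allow the offload at all.
    devFlags.enableGpuBufferOps = (std::getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr)
                                  && c_buildIsCuda && useGpuForNonbonded;
    devFlags.enableGpuHaloExchange = (std::getenv("GMX_GPU_DD_COMMS") != nullptr)
                                     && c_buildIsThreadMpi && c_buildIsCuda;
    devFlags.enableGpuPmePPComm = (std::getenv("GMX_GPU_PME_PP_COMMS") != nullptr)
                                  && c_buildIsThreadMpi && c_buildIsCuda;
    return devFlags;
}

int main()
{
    // Without nonbonded offload the env var is ignored entirely.
    const DevelopmentFeatureFlags devFlags =
            manageDevelopmentFeatures(/*useGpuForNonbonded=*/false);
    return devFlags.enableGpuBufferOps ? 1 : 0;
}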