Move buffer ops / PME F reduction flags into StepWorkload
authorSzilárd Páll <pall.szilard@gmail.com>
Wed, 9 Oct 2019 00:53:24 +0000 (02:53 +0200)
committerArtem Zhmurov <zhmurov@gmail.com>
Sun, 13 Oct 2019 11:10:20 +0000 (13:10 +0200)
Also moved the override conditions for when buffer ops cannot be
offloaded into the DevelopmentFeatureFlags data structure initialization,
which had to be shifted so this code can be passed the task assignment
decision on nonbonded offload.

Change-Id: Ib6850bcf306a70bbd9557cf2d5c2b1e39159e566

src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/runner.cpp
src/gromacs/mdtypes/simulation_workload.h
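
For orientation, here is a minimal, self-contained sketch of the per-step
flag derivation this change introduces, condensed from the
setupStepWorkload() hunk below. The stripped-down structs and the helper
name deriveStepFlags are illustrative stand-ins, not the actual GROMACS
types or API:

#include <cassert>

// Illustrative stand-ins for the GROMACS workload types touched here.
struct SimulationWorkload
{
    bool useGpuBufferOps = false; // per-run decision, see the runner.cpp hunk
    bool useGpuNonbonded = false;
    bool usePmeGpu       = false;
};

struct StepWorkload
{
    bool computeVirial       = false;
    bool computeEnergy       = false;
    bool useGpuXBufferOps    = false;
    bool useGpuFBufferOps    = false;
    bool useGpuPmeFReduction = false;
};

// Mirrors the flag logic added to setupStepWorkload() in sim_util.cpp.
StepWorkload deriveStepFlags(const SimulationWorkload &simulationWork,
                             const bool                rankHasPmeDuty,
                             const bool                computeVirial,
                             const bool                computeEnergy)
{
    StepWorkload flags;
    flags.computeVirial = computeVirial;
    flags.computeEnergy = computeEnergy;
    // Buffer ops can only be offloaded together with the nonbonded task.
    assert(!simulationWork.useGpuBufferOps || simulationWork.useGpuNonbonded);
    flags.useGpuXBufferOps = simulationWork.useGpuBufferOps;
    // Virial (and, for now, energy) steps fall back to the CPU reduction path.
    flags.useGpuFBufferOps = simulationWork.useGpuBufferOps
                             && !(flags.computeVirial || flags.computeEnergy);
    // GPU PME force reduction additionally requires PME on this rank's GPU.
    flags.useGpuPmeFReduction = flags.useGpuFBufferOps
                                && (simulationWork.usePmeGpu && rankHasPmeDuty);
    return flags;
}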

index 8386e341393cb5b6050b380fd9836c92636bc4b9..32c30aab8f86de8c2221f5ab43f3e31ed3637b2e 100644 (file)
@@ -613,17 +613,15 @@ static int makePmeFlags(const StepWorkload &stepWork)
  * \param[in]  box                  The box matrix
  * \param[in]  stepWork             Step schedule flags
  * \param[in]  pmeFlags             PME flags
- * \param[in]  useGpuForceReduction True if GPU-based force reduction is active this step
  * \param[in]  wcycle               The wallcycle structure
  */
 static inline void launchPmeGpuSpread(gmx_pme_t          *pmedata,
                                       const matrix        box,
                                       const StepWorkload &stepWork,
                                       int                 pmeFlags,
-                                      bool                useGpuForceReduction,
                                       gmx_wallcycle_t     wcycle)
 {
-    pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, useGpuForceReduction);
+    pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, stepWork.useGpuPmeFReduction);
     pme_gpu_launch_spread(pmedata, wcycle);
 }
 
@@ -795,11 +793,16 @@ setupDomainLifetimeWorkload(const t_inputrec       &inputrec,
  *
  * \param[in]      legacyFlags          Force bitmask flags used to construct the new flags
  * \param[in]      isNonbondedOn        Global override; if false, all nonbonded calculation is turned off.
+ * \param[in]      simulationWork       Simulation workload description.
+ * \param[in]      rankHasPmeDuty       If this rank computes PME.
+ *
  * \returns New StepWorkload description.
  */
 static StepWorkload
-setupStepWorkload(const int     legacyFlags,
-                  const bool    isNonbondedOn)
+setupStepWorkload(const int                 legacyFlags,
+                  const bool                isNonbondedOn,
+                  const SimulationWorkload &simulationWork,
+                  const bool                rankHasPmeDuty)
 {
     StepWorkload flags;
     flags.stateChanged           = ((legacyFlags & GMX_FORCE_STATECHANGED) != 0);
@@ -811,6 +814,17 @@ setupStepWorkload(const int     legacyFlags,
     flags.computeListedForces    = ((legacyFlags & GMX_FORCE_LISTED) != 0);
     flags.computeNonbondedForces = ((legacyFlags & GMX_FORCE_NONBONDED) != 0) && isNonbondedOn;
     flags.computeDhdl            = ((legacyFlags & GMX_FORCE_DHDL) != 0);
+
+    if (simulationWork.useGpuBufferOps)
+    {
+        GMX_ASSERT(simulationWork.useGpuNonbonded, "Can only offload buffer ops if nonbonded computation is also offloaded");
+    }
+    flags.useGpuXBufferOps = simulationWork.useGpuBufferOps;
+    // on virial steps (and, for now, on energy steps) the CPU reduction path is taken
+    // TODO: remove flags.computeEnergy, ref #3128
+    flags.useGpuFBufferOps    = simulationWork.useGpuBufferOps && !(flags.computeVirial || flags.computeEnergy);
+    flags.useGpuPmeFReduction = flags.useGpuFBufferOps && (simulationWork.usePmeGpu && rankHasPmeDuty);
+
     return flags;
 }
 
@@ -911,25 +925,22 @@ void do_force(FILE                                     *fplog,
         legacyFlags &= ~GMX_FORCE_NONBONDED;
     }
 
-    runScheduleWork->stepWork = setupStepWorkload(legacyFlags, fr->bNonbonded);
-    const StepWorkload       &stepWork = runScheduleWork->stepWork;
-
     const SimulationWorkload &simulationWork = runScheduleWork->simulationWork;
 
-    const bool                useGpuPmeOnThisRank = simulationWork.usePmeGpu && thisRankHasDuty(cr, DUTY_PME);
-    const int                 pmeFlags            = makePmeFlags(stepWork);
+
+    runScheduleWork->stepWork = setupStepWorkload(legacyFlags, fr->bNonbonded,
+                                                  simulationWork, thisRankHasDuty(cr, DUTY_PME));
+    const StepWorkload &stepWork = runScheduleWork->stepWork;
+
+
+    const bool useGpuPmeOnThisRank = simulationWork.usePmeGpu && thisRankHasDuty(cr, DUTY_PME);
+    const int  pmeFlags            = makePmeFlags(stepWork);
 
     // Switches on whether to use GPU for position and force buffer operations
     // TODO consider all possible combinations of triggers, and how to combine optimally in each case.
-    const BufferOpsUseGpu useGpuXBufOps = (simulationWork.useGpuBufferOps &&
-                                           simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA)) ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;;
+    const BufferOpsUseGpu useGpuXBufOps = stepWork.useGpuXBufferOps ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
     // GPU Force buffer ops are disabled on virial steps, because the virial calc is not yet ported to GPU
-    const BufferOpsUseGpu useGpuFBufOps = ((simulationWork.useGpuBufferOps &&
-                                            simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA)) &&
-                                           !(stepWork.computeVirial || stepWork.computeEnergy)) ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
-    // TODO: move / add this flag to the internal PME GPU data structures
-    const bool useGpuPmeFReduction = (useGpuFBufOps == BufferOpsUseGpu::True) &&
-        useGpuPmeOnThisRank; // only supported if this rank is perfoming PME on the GPU
+    const BufferOpsUseGpu useGpuFBufOps = stepWork.useGpuFBufferOps ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
 
     /* At a search step we need to start the first balancing region
      * somewhere early inside the step after communication during domain
@@ -1019,7 +1030,7 @@ void do_force(FILE                                     *fplog,
 
     if (useGpuPmeOnThisRank)
     {
-        launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, useGpuPmeFReduction, wcycle);
+        launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, wcycle);
     }
 
     /* do gridding for pair search */
@@ -1533,7 +1544,7 @@ void do_force(FILE                                     *fplog,
     }
 
     const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True);
-    const bool useCpuPmeFReduction      = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
+    const bool useCpuPmeFReduction      = thisRankHasDuty(cr, DUTY_PME) && !stepWork.useGpuPmeFReduction;
     // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
     const bool haveCpuLocalForces     = (domainWork.haveSpecialForces || domainWork.haveCpuListedForceWork || useCpuPmeFReduction ||
                                          (fr->efep != efepNO));
@@ -1636,7 +1647,7 @@ void do_force(FILE                                     *fplog,
     {
         gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> dependencyList;
 
-        if (useGpuPmeFReduction)
+        if (stepWork.useGpuPmeFReduction)
         {
             dependencyList.push_back(pme_gpu_get_f_ready_synchronizer(fr->pmedata));
         }
@@ -1679,7 +1690,7 @@ void do_force(FILE                                     *fplog,
                                               stateGpu->getForces(),
                                               pme_gpu_get_device_f(fr->pmedata),
                                               dependencyList,
-                                              useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
+                                              stepWork.useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
             // This function call synchronizes the local stream
             nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
             stateGpu->copyForcesFromGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local);
index fdc93ac07e2827483542a22c9371b5977d4f2c8d..75dc6d6fea58e620db30f7957ee0545f6eb4f895 100644 (file)
@@ -194,14 +194,16 @@ struct DevelopmentFeatureFlags
  * Note that some development feature overrides are already applied here:
  * the GPU communication flags are set to false in non-tMPI and non-CUDA builds.
  *
- * \param[in]  mdlog        Logger object.
- * \returns                 The object populated with development feature flags.
+ * \param[in]  mdlog                Logger object.
+ * \param[in]  useGpuForNonbonded   True if the nonbonded task is offloaded in this run.
+ * \returns                         The object populated with development feature flags.
  */
-static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger &mdlog)
+static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger &mdlog,
+                                                         const bool           useGpuForNonbonded)
 {
     DevelopmentFeatureFlags devFlags;
 
-    devFlags.enableGpuBufferOps    = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
+    devFlags.enableGpuBufferOps    = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr) && (GMX_GPU == GMX_GPU_CUDA) && useGpuForNonbonded;
     devFlags.useGpuUpdateConstrain = (getenv("GMX_UPDATE_CONSTRAIN_GPU") != nullptr);
     devFlags.enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
     devFlags.enableGpuPmePPComm    = (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
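
The net effect of the hunk above is that setting GMX_USE_GPU_BUFFER_OPS is
no longer sufficient on its own: the build must be a CUDA build and the
nonbonded task must actually be offloaded. A stand-alone sketch of that
gating, with the GMX_GPU build macro replaced by an illustrative constant
and the struct reduced to the one relevant field:

#include <cstdlib>

// Stand-in for the GMX_GPU == GMX_GPU_CUDA compile-time check.
constexpr bool c_isCudaBuild = true;

struct DevelopmentFeatureFlags // reduced to the field discussed here
{
    bool enableGpuBufferOps = false;
};

DevelopmentFeatureFlags makeDevFlags(const bool useGpuForNonbonded)
{
    DevelopmentFeatureFlags devFlags;
    // The env var only takes effect in CUDA builds with nonbonded offload;
    // this is the override condition moved into the initializer.
    devFlags.enableGpuBufferOps = (std::getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr)
                                  && c_isCudaBuild && useGpuForNonbonded;
    return devFlags;
}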
@@ -683,9 +685,6 @@ int Mdrunner::mdrunner()
     gmx::LoggerOwner logOwner(buildLogger(fplog, isSimulationMasterRank));
     gmx::MDLogger    mdlog(logOwner.logger());
 
-    // report any development features that may be enabled by environment variables
-    const DevelopmentFeatureFlags devFlags = manageDevelopmentFeatures(mdlog);
-
     // TODO The thread-MPI master rank makes a working
     // PhysicalNodeCommunicator here, but it gets rebuilt by all ranks
     // after the threads have been launched. This works because no use
@@ -845,6 +844,10 @@ int Mdrunner::mdrunner()
     }
     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 
+    // Initialize development feature flags that are enabled by environment variables
+    // and report those features that are enabled.
+    const DevelopmentFeatureFlags devFlags = manageDevelopmentFeatures(mdlog, useGpuForNonbonded);
+
     // Build restraints.
     // TODO: hide restraint implementation details from Mdrunner.
     // There is nothing unique about restraints at this point as far as the
index e20bcc9e7cfedfb457ab646e025fe2f4973420d9..147199771cb7ed5d5ff396b7e90fb4c945a9d2c6 100644 (file)
@@ -77,6 +77,16 @@ class StepWorkload
         bool computeListedForces = false;
         //! Whether this step DHDL needs to be computed
         bool computeDhdl = false;
+        /*! \brief Whether coordinate buffer ops are done on the GPU this step
+         * \note This conceptually belongs to DomainLifetimeWorkload, but is
+         * kept here for now because the flag is needed before
+         * DomainLifetimeWorkload is built.
+         */
+        bool useGpuXBufferOps  = false;
+        //! Whether force buffer ops are done on the GPU this step
+        bool useGpuFBufferOps  = false;
+        //! Whether PME forces are reduced with other contributions on the GPU this step
+        bool useGpuPmeFReduction = false; // TODO: add this flag to the internal PME GPU data structures too
 };
 
 /*! \libinternal
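
With the flags precomputed in StepWorkload, the per-step switches in
do_force() reduce to plain reads of the structure; for example, the
translation to the legacy BufferOpsUseGpu enum seen in the sim_util.cpp
hunks collapses to a one-line conversion (sketch; the helper name is
illustrative):

enum class BufferOpsUseGpu { True, False };

// The former multi-condition ternaries in do_force() become simple reads.
BufferOpsUseGpu toBufferOpsUseGpu(const bool useGpuBufferOpsThisStep)
{
    return useGpuBufferOpsThisStep ? BufferOpsUseGpu::True
                                   : BufferOpsUseGpu::False;
}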