Use workload data structures for GPU halo exchange triggers
author Alan Gray <alangray3@gmail.com>
Fri, 2 Oct 2020 07:02:31 +0000 (07:02 +0000)
committer Mark Abraham <mark.j.abraham@gmail.com>
Fri, 2 Oct 2020 07:02:31 +0000 (07:02 +0000)
Move GPU halo exchange trigger booleans and related conditionals into
workload data structures, and remove the unnecessary assertion on GPU
buffer ops being active (since GPU buffer ops are now automatically
activated when GPU halo exchange is active).

Partly addresses #3370

src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/md.cpp
src/gromacs/mdrun/runner.cpp
src/gromacs/mdtypes/simulation_workload.h
src/gromacs/taskassignment/decidegpuusage.cpp
src/gromacs/taskassignment/decidegpuusage.h
src/gromacs/taskassignment/decidesimulationworkload.cpp
src/gromacs/taskassignment/decidesimulationworkload.h

index 2af9a25280dafb8e2baa4b3bb74ace3919039c2e..ef27166978b2d7ebeb9f5a781e211fa7d18ebaa2 100644 (file)
@@ -911,6 +911,8 @@ static StepWorkload setupStepWorkload(const int                     legacyFlags,
     flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !flags.computeVirial;
     flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps && simulationWork.useGpuPme
                                 && (rankHasPmeDuty || simulationWork.useGpuPmePpCommunication);
+    flags.useGpuXHalo = simulationWork.useGpuHaloExchange;
+    flags.useGpuFHalo = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
 
     return flags;
 }
@@ -1037,12 +1039,10 @@ static void combineMtsForces(const int      numAtoms,
  * \param [in] runScheduleWork               Schedule workload flag structure
  * \param [in] cr                            Communication record object
  * \param [in] fr                            Force record object
- * \param [in] ddUsesGpuDirectCommunication  Whether GPU direct communication is in use
  */
 static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
                                     const t_commrec*            cr,
-                                    t_forcerec*                 fr,
-                                    bool                        ddUsesGpuDirectCommunication)
+                                    t_forcerec*                 fr)
 {
 
     nonbonded_verlet_t*          nbv      = fr->nbv.get();
@@ -1075,13 +1075,13 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
     }
 
     if ((runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr))
-        && !ddUsesGpuDirectCommunication)
+        && !runScheduleWork->simulationWork.useGpuHaloExchange)
     {
         fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
                 stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::Local, true));
     }
 
-    if (ddUsesGpuDirectCommunication)
+    if (runScheduleWork->simulationWork.useGpuHaloExchange)
     {
         fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
                 cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
@@ -1246,17 +1246,6 @@ void do_force(FILE*                               fplog,
         }
     }
 
-    // TODO Update this comment when introducing SimulationWorkload
-    //
-    // The conditions for gpuHaloExchange e.g. using GPU buffer
-    // operations were checked before construction, so here we can
-    // just use it and assert upon any conditions.
-    const bool ddUsesGpuDirectCommunication =
-            ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty()));
-    GMX_ASSERT(!ddUsesGpuDirectCommunication || stepWork.useGpuXBufferOps,
-               "Must use coordinate buffer ops with GPU halo exchange");
-    const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && stepWork.useGpuFBufferOps;
-
     // Copy coordinate from the GPU if update is on the GPU and there
     // are forces to be computed on the CPU, or for the computation of
     // virial, or if host-side data will be transferred from this task
@@ -1265,7 +1254,12 @@ void do_force(FILE*                               fplog,
     // hence copy is not needed.
     const bool haveHostPmePpComms =
             !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
-    const bool haveHostHaloExchangeComms = havePPDomainDecomposition(cr) && !ddUsesGpuDirectCommunication;
+
+    GMX_ASSERT(simulationWork.useGpuHaloExchange
+                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
+               "The GPU halo exchange is active, but it has not been constructed.");
+    const bool haveHostHaloExchangeComms =
+            havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange;
 
     bool gmx_used_in_debug haveCopiedXFromGpu = false;
     if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
@@ -1383,7 +1377,7 @@ void do_force(FILE*                               fplog,
 
         if (simulationWork.useGpuBufferOps)
         {
-            setupGpuForceReductions(runScheduleWork, cr, fr, ddUsesGpuDirectCommunication);
+            setupGpuForceReductions(runScheduleWork, cr, fr);
         }
     }
     else if (!EI_TPI(inputrec->eI) && stepWork.computeNonbondedForces)
@@ -1466,14 +1460,14 @@ void do_force(FILE*                               fplog,
             // to location in do_md where GPU halo exchange is
             // constructed at partitioning, after above stateGpu
             // re-initialization has similarly been refactored
-            if (ddUsesGpuDirectCommunication)
+            if (simulationWork.useGpuHaloExchange)
             {
                 reinitGpuHaloExchange(*cr, stateGpu->getCoordinates(), stateGpu->getForces());
             }
         }
         else
         {
-            if (ddUsesGpuDirectCommunication)
+            if (stepWork.useGpuXHalo)
             {
                 // The following must be called after local setCoordinates (which records an event
                 // when the coordinate data has been copied to the device).
@@ -1496,7 +1490,7 @@ void do_force(FILE*                               fplog,
 
             if (stepWork.useGpuXBufferOps)
             {
-                if (!useGpuPmeOnThisRank && !ddUsesGpuDirectCommunication)
+                if (!useGpuPmeOnThisRank && !stepWork.useGpuXHalo)
                 {
                     stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
                 }
@@ -1706,7 +1700,7 @@ void do_force(FILE*                               fplog,
     }
 
     // TODO Force flags should include haveFreeEnergyWork for this domain
-    if (ddUsesGpuDirectCommunication && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
+    if (stepWork.useGpuXHalo && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
     {
         /* Wait for non-local coordinate data to be copied from device */
         stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
@@ -1792,7 +1786,7 @@ void do_force(FILE*                               fplog,
 
     GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFBufferOps),
                "The schedule below does not allow for nonbonded MTS with GPU buffer ops");
-    GMX_ASSERT(!(nonbondedAtMtsLevel1 && useGpuForcesHaloExchange),
+    GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFHalo),
                "The schedule below does not allow for nonbonded MTS with GPU halo exchange");
     // Will store the amount of cycles spent waiting for the GPU that
     // will be later used in the DLB accounting.
@@ -1832,9 +1826,10 @@ void do_force(FILE*                               fplog,
                                               AtomLocality::NonLocal);
                 }
 
+
                 fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->execute();
 
-                if (!useGpuForcesHaloExchange)
+                if (!stepWork.useGpuFHalo)
                 {
                     // copy from GPU input for dd_move_f()
                     stateGpu->copyForcesFromGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
@@ -1878,7 +1873,8 @@ void do_force(FILE*                               fplog,
 
         if (stepWork.computeForces)
         {
-            if (useGpuForcesHaloExchange)
+
+            if (stepWork.useGpuFHalo)
             {
                 if (domainWork.haveCpuLocalForceWork)
                 {
@@ -2004,7 +2000,7 @@ void do_force(FILE*                               fplog,
             // - copy is not perfomed if GPU force halo exchange is active, because it would overwrite the result
             //   of the halo exchange. In that case the copy is instead performed above, before the exchange.
             //   These should be unified.
-            if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange)
+            if (haveLocalForceContribInCpuBuffer && !stepWork.useGpuFHalo)
             {
                 // Note: AtomLocality::All is used for the non-DD case because, as in this
                 // case copyForcesToGpu() uses a separate stream, it allows overlap of
index e2f5581f3a7f74f7258c797e2b4194051a5f666f..bf7aa914beecb3a6b20348dcadd6c8c84070a851 100644 (file)
@@ -850,7 +850,7 @@ void gmx::LegacySimulator::do_md()
         }
 
         // Allocate or re-size GPU halo exchange object, if necessary
-        if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange && useGpuForNonbonded)
+        if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange)
         {
             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
                                "GPU device manager has to be initialized to use GPU "
index d32550e1824d140e3face616f8364fdeeb77db8a..b22881a05d42ca286dce9f4f53cdbdd040a42386 100644 (file)
@@ -1259,10 +1259,15 @@ int Mdrunner::mdrunner()
     }
 
     MdrunScheduleWorkload runScheduleWork;
+
+    bool useGpuDirectHalo = decideWhetherToUseGpuForHalo(
+            devFlags, havePPDomainDecomposition(cr), useGpuForNonbonded, useModularSimulator,
+            doRerun, EI_ENERGY_MINIMIZATION(inputrec->eI));
+
     // Also populates the simulation constant workload description.
-    runScheduleWork.simulationWork =
-            createSimulationWorkload(*inputrec, disableNonbondedCalculation, devFlags,
-                                     useGpuForNonbonded, pmeRunMode, useGpuForBonded, useGpuForUpdate);
+    runScheduleWork.simulationWork = createSimulationWorkload(
+            *inputrec, disableNonbondedCalculation, devFlags, useGpuForNonbonded, pmeRunMode,
+            useGpuForBonded, useGpuForUpdate, useGpuDirectHalo);
 
     std::unique_ptr<DeviceStreamManager> deviceStreamManager = nullptr;
 
index f2d08c584758676ee3c72c0ea29b94e7bc5c650b..c24ba0724c5ade8e4b5d04594d49246086d0526a 100644 (file)
@@ -93,6 +93,10 @@ public:
     bool useGpuFBufferOps = false;
     //! Whether PME forces are reduced with other contributions on the GPU this step
     bool useGpuPmeFReduction = false; // TODO: add this flag to the internal PME GPU data structures too
+    //! Whether GPU coordinates halo exchange is active this step
+    bool useGpuXHalo = false;
+    //! Whether GPU forces halo exchange is active this step
+    bool useGpuFHalo = false;
 };
 
 /*! \libinternal
index 85dfe60e93f3be1aa472c0cc722ce71f8af9d937..be1657af8eaa54714c9b90240c3f6d3d3c241d90 100644 (file)
@@ -714,4 +714,15 @@ bool decideWhetherToUseGpuForUpdate(const bool                     isDomainDecom
             || (updateTarget == TaskTarget::Auto && devFlags.forceGpuUpdateDefault));
 }
 
+bool decideWhetherToUseGpuForHalo(const DevelopmentFeatureFlags& devFlags,
+                                  bool                           havePPDomainDecomposition,
+                                  bool                           useGpuForNonbonded,
+                                  bool                           useModularSimulator,
+                                  bool                           doRerun,
+                                  bool                           haveEnergyMinimization)
+{
+    return havePPDomainDecomposition && devFlags.enableGpuHaloExchange && useGpuForNonbonded
+           && !useModularSimulator && !doRerun && !haveEnergyMinimization;
+}
+
 } // namespace gmx
index 5d1524441662b39f6829519c8dfd11bce768408c..b151c5d6eae0775db7756930d4b99e160beaedce 100644 (file)
@@ -301,6 +301,24 @@ bool decideWhetherToUseGpuForUpdate(bool                           isDomainDecom
                                     const gmx::MDLogger&           mdlog);
 
 
+/*! \brief Decide whether to use GPU for halo exchange.
+ *
+ * \param[in]  devFlags                     GPU development / experimental feature flags.
+ * \param[in]  havePPDomainDecomposition    Whether PP domain decomposition is in use.
+ * \param[in]  useGpuForNonbonded           Whether GPUs will be used for nonbonded interactions.
+ * \param[in]  useModularSimulator          Whether modularsimulator is in use.
+ * \param[in]  doRerun                      Whether this is a rerun.
+ * \param[in]  haveEnergyMinimization       Whether energy minimization is in use.
+ *
+ * \returns    Whether halo exchange can be run on GPU.
+ */
+bool decideWhetherToUseGpuForHalo(const DevelopmentFeatureFlags& devFlags,
+                                  bool                           havePPDomainDecomposition,
+                                  bool                           useGpuForNonbonded,
+                                  bool                           useModularSimulator,
+                                  bool                           doRerun,
+                                  bool                           haveEnergyMinimization);
+
 } // namespace gmx
 
 #endif
index 06b1eeb29d2405228c5091997f013546adb1f9c2..5c5cdaeb3f606e453b68bdf9833152407f4d921f 100644 (file)
@@ -58,7 +58,8 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
                                             bool                           useGpuForNonbonded,
                                             PmeRunMode                     pmeRunMode,
                                             bool                           useGpuForBonded,
-                                            bool                           useGpuForUpdate)
+                                            bool                           useGpuForUpdate,
+                                            bool                           useGpuDirectHalo)
 {
     SimulationWorkload simulationWorkload;
     simulationWorkload.computeNonbonded = !disableNonbondedCalculation;
@@ -75,7 +76,7 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
     simulationWorkload.useGpuUpdate    = useGpuForUpdate;
     simulationWorkload.useGpuBufferOps = (devFlags.enableGpuBufferOps || useGpuForUpdate)
                                          && !simulationWorkload.computeNonbondedAtMtsLevel1;
-    simulationWorkload.useGpuHaloExchange = devFlags.enableGpuHaloExchange;
+    simulationWorkload.useGpuHaloExchange = useGpuDirectHalo;
     simulationWorkload.useGpuPmePpCommunication =
             devFlags.enableGpuPmePPComm && (pmeRunMode == PmeRunMode::GPU);
     simulationWorkload.useGpuDirectCommunication =
index 5cde42e10cbd150bf9910ae41a40830f1e9b64a1..c18f9a04d184b852fa84c579855e227aa111fd05 100644 (file)
@@ -67,6 +67,7 @@ struct DevelopmentFeatureFlags;
  * \param[in] useGpuForBonded    Whether bonded interactions are calculated on GPU(s).
  * \param[in] useGpuForUpdate    Whether coordinate update and constraint solving is performed on
  *                               GPU(s).
+ * \param[in] useGpuDirectHalo   Whether halo exchange is performed directly between GPUs.
  * \returns Simulation lifetime constant workload description.
  */
 SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
@@ -75,7 +76,8 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
                                             bool                           useGpuForNonbonded,
                                             PmeRunMode                     pmeRunMode,
                                             bool                           useGpuForBonded,
-                                            bool                           useGpuForUpdate);
+                                            bool                           useGpuForUpdate,
+                                            bool                           useGpuDirectHalo);
 
 } // namespace gmx