Use workload data structures for GPU halo exchange triggers

author Alan Gray <alangray3@gmail.com>

Fri, 2 Oct 2020 07:02:31 +0000 (07:02 +0000)

committer Mark Abraham <mark.j.abraham@gmail.com>

Fri, 2 Oct 2020 07:02:31 +0000 (07:02 +0000)
author Alan Gray <alangray3@gmail.com>
Fri, 2 Oct 2020 07:02:31 +0000 (07:02 +0000)
committer Mark Abraham <mark.j.abraham@gmail.com>
Fri, 2 Oct 2020 07:02:31 +0000 (07:02 +0000)
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index 2af9a25280dafb8e2baa4b3bb74ace3919039c2e..ef27166978b2d7ebeb9f5a781e211fa7d18ebaa2 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -911,6 +911,8 @@ static StepWorkload setupStepWorkload(const int                     legacyFlags,
      flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !flags.computeVirial;
      flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps && simulationWork.useGpuPme
                                  && (rankHasPmeDuty || simulationWork.useGpuPmePpCommunication);
+    flags.useGpuXHalo = simulationWork.useGpuHaloExchange;
+    flags.useGpuFHalo = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
  
      return flags;
  }
@@ -1037,12 +1039,10 @@ static void combineMtsForces(const int      numAtoms,
   * \param [in] runScheduleWork               Schedule workload flag structure
   * \param [in] cr                            Communication record object
   * \param [in] fr                            Force record object
- * \param [in] ddUsesGpuDirectCommunication  Whether GPU direct communication is in use
   */
  static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
                                      const t_commrec*            cr,
-                                    t_forcerec*                 fr,
-                                    bool                        ddUsesGpuDirectCommunication)
+                                    t_forcerec*                 fr)
  {
  
      nonbonded_verlet_t*          nbv      = fr->nbv.get();
@@ -1075,13 +1075,13 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
      }
  
      if ((runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr))
-        && !ddUsesGpuDirectCommunication)
+        && !runScheduleWork->simulationWork.useGpuHaloExchange)
      {
          fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
                  stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::Local, true));
      }
  
-    if (ddUsesGpuDirectCommunication)
+    if (runScheduleWork->simulationWork.useGpuHaloExchange)
      {
          fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
                  cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
@@ -1246,17 +1246,6 @@ void do_force(FILE*                               fplog,
          }
      }
  
-    // TODO Update this comment when introducing SimulationWorkload
-    //
-    // The conditions for gpuHaloExchange e.g. using GPU buffer
-    // operations were checked before construction, so here we can
-    // just use it and assert upon any conditions.
-    const bool ddUsesGpuDirectCommunication =
-            ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty()));
-    GMX_ASSERT(!ddUsesGpuDirectCommunication || stepWork.useGpuXBufferOps,
-               "Must use coordinate buffer ops with GPU halo exchange");
-    const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && stepWork.useGpuFBufferOps;
-
      // Copy coordinate from the GPU if update is on the GPU and there
      // are forces to be computed on the CPU, or for the computation of
      // virial, or if host-side data will be transferred from this task
@@ -1265,7 +1254,12 @@ void do_force(FILE*                               fplog,
      // hence copy is not needed.
      const bool haveHostPmePpComms =
              !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
-    const bool haveHostHaloExchangeComms = havePPDomainDecomposition(cr) && !ddUsesGpuDirectCommunication;
+
+    GMX_ASSERT(simulationWork.useGpuHaloExchange
+                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
+               "The GPU halo exchange is active, but it has not been constructed.");
+    const bool haveHostHaloExchangeComms =
+            havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange;
  
      bool gmx_used_in_debug haveCopiedXFromGpu = false;
      if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
@@ -1383,7 +1377,7 @@ void do_force(FILE*                               fplog,
  
          if (simulationWork.useGpuBufferOps)
          {
-            setupGpuForceReductions(runScheduleWork, cr, fr, ddUsesGpuDirectCommunication);
+            setupGpuForceReductions(runScheduleWork, cr, fr);
          }
      }
      else if (!EI_TPI(inputrec->eI) && stepWork.computeNonbondedForces)
@@ -1466,14 +1460,14 @@ void do_force(FILE*                               fplog,
              // to location in do_md where GPU halo exchange is
              // constructed at partitioning, after above stateGpu
              // re-initialization has similarly been refactored
-            if (ddUsesGpuDirectCommunication)
+            if (simulationWork.useGpuHaloExchange)
              {
                  reinitGpuHaloExchange(*cr, stateGpu->getCoordinates(), stateGpu->getForces());
              }
          }
          else
          {
-            if (ddUsesGpuDirectCommunication)
+            if (stepWork.useGpuXHalo)
              {
                  // The following must be called after local setCoordinates (which records an event
                  // when the coordinate data has been copied to the device).
@@ -1496,7 +1490,7 @@ void do_force(FILE*                               fplog,
  
              if (stepWork.useGpuXBufferOps)
              {
-                if (!useGpuPmeOnThisRank && !ddUsesGpuDirectCommunication)
+                if (!useGpuPmeOnThisRank && !stepWork.useGpuXHalo)
                  {
                      stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
                  }
@@ -1706,7 +1700,7 @@ void do_force(FILE*                               fplog,
      }
  
      // TODO Force flags should include haveFreeEnergyWork for this domain
-    if (ddUsesGpuDirectCommunication && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
+    if (stepWork.useGpuXHalo && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
      {
          /* Wait for non-local coordinate data to be copied from device */
          stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
@@ -1792,7 +1786,7 @@ void do_force(FILE*                               fplog,
  
      GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFBufferOps),
                 "The schedule below does not allow for nonbonded MTS with GPU buffer ops");
-    GMX_ASSERT(!(nonbondedAtMtsLevel1 && useGpuForcesHaloExchange),
+    GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFHalo),
                 "The schedule below does not allow for nonbonded MTS with GPU halo exchange");
      // Will store the amount of cycles spent waiting for the GPU that
      // will be later used in the DLB accounting.
@@ -1832,9 +1826,10 @@ void do_force(FILE*                               fplog,
                                                AtomLocality::NonLocal);
                  }
  
+
                  fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->execute();
  
-                if (!useGpuForcesHaloExchange)
+                if (!stepWork.useGpuFHalo)
                  {
                      // copy from GPU input for dd_move_f()
                      stateGpu->copyForcesFromGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
@@ -1878,7 +1873,8 @@ void do_force(FILE*                               fplog,
  
          if (stepWork.computeForces)
          {
-            if (useGpuForcesHaloExchange)
+
+            if (stepWork.useGpuFHalo)
              {
                  if (domainWork.haveCpuLocalForceWork)
                  {
@@ -2004,7 +2000,7 @@ void do_force(FILE*                               fplog,
              // - copy is not perfomed if GPU force halo exchange is active, because it would overwrite the result
              //   of the halo exchange. In that case the copy is instead performed above, before the exchange.
              //   These should be unified.
-            if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange)
+            if (haveLocalForceContribInCpuBuffer && !stepWork.useGpuFHalo)
              {
                  // Note: AtomLocality::All is used for the non-DD case because, as in this
                  // case copyForcesToGpu() uses a separate stream, it allows overlap of
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp

index e2f5581f3a7f74f7258c797e2b4194051a5f666f..bf7aa914beecb3a6b20348dcadd6c8c84070a851 100644 (file)
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -850,7 +850,7 @@ void gmx::LegacySimulator::do_md()
          }
  
          // Allocate or re-size GPU halo exchange object, if necessary
-        if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange && useGpuForNonbonded)
+        if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange)
          {
              GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
                                 "GPU device manager has to be initialized to use GPU "
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp

index d32550e1824d140e3face616f8364fdeeb77db8a..b22881a05d42ca286dce9f4f53cdbdd040a42386 100644 (file)
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -1259,10 +1259,15 @@ int Mdrunner::mdrunner()
      }
  
      MdrunScheduleWorkload runScheduleWork;
+
+    bool useGpuDirectHalo = decideWhetherToUseGpuForHalo(
+            devFlags, havePPDomainDecomposition(cr), useGpuForNonbonded, useModularSimulator,
+            doRerun, EI_ENERGY_MINIMIZATION(inputrec->eI));
+
      // Also populates the simulation constant workload description.
-    runScheduleWork.simulationWork =
-            createSimulationWorkload(*inputrec, disableNonbondedCalculation, devFlags,
-                                     useGpuForNonbonded, pmeRunMode, useGpuForBonded, useGpuForUpdate);
+    runScheduleWork.simulationWork = createSimulationWorkload(
+            *inputrec, disableNonbondedCalculation, devFlags, useGpuForNonbonded, pmeRunMode,
+            useGpuForBonded, useGpuForUpdate, useGpuDirectHalo);
  
      std::unique_ptr<DeviceStreamManager> deviceStreamManager = nullptr;
  
diff --git a/src/gromacs/mdtypes/simulation_workload.h b/src/gromacs/mdtypes/simulation_workload.h

index f2d08c584758676ee3c72c0ea29b94e7bc5c650b..c24ba0724c5ade8e4b5d04594d49246086d0526a 100644 (file)
--- a/src/gromacs/mdtypes/simulation_workload.h
+++ b/src/gromacs/mdtypes/simulation_workload.h
@@ -93,6 +93,10 @@ public:
      bool useGpuFBufferOps = false;
      //! Whether PME forces are reduced with other contributions on the GPU this step
      bool useGpuPmeFReduction = false; // TODO: add this flag to the internal PME GPU data structures too
+    //! Whether GPU coordinates halo exchange is active this step
+    bool useGpuXHalo = false;
+    //! Whether GPU forces halo exchange is active this step
+    bool useGpuFHalo = false;
  };
  
  /*! \libinternal
diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp

index 85dfe60e93f3be1aa472c0cc722ce71f8af9d937..be1657af8eaa54714c9b90240c3f6d3d3c241d90 100644 (file)
--- a/src/gromacs/taskassignment/decidegpuusage.cpp
+++ b/src/gromacs/taskassignment/decidegpuusage.cpp
@@ -714,4 +714,15 @@ bool decideWhetherToUseGpuForUpdate(const bool                     isDomainDecom
              || (updateTarget == TaskTarget::Auto && devFlags.forceGpuUpdateDefault));
  }
  
+bool decideWhetherToUseGpuForHalo(const DevelopmentFeatureFlags& devFlags,
+                                  bool                           havePPDomainDecomposition,
+                                  bool                           useGpuForNonbonded,
+                                  bool                           useModularSimulator,
+                                  bool                           doRerun,
+                                  bool                           haveEnergyMinimization)
+{
+    return devFlags.enableGpuHaloExchange && havePPDomainDecomposition && useGpuForNonbonded
+           && !(useModularSimulator || doRerun || haveEnergyMinimization);
+}
+
  } // namespace gmx
diff --git a/src/gromacs/taskassignment/decidegpuusage.h b/src/gromacs/taskassignment/decidegpuusage.h

index 5d1524441662b39f6829519c8dfd11bce768408c..b151c5d6eae0775db7756930d4b99e160beaedce 100644 (file)
--- a/src/gromacs/taskassignment/decidegpuusage.h
+++ b/src/gromacs/taskassignment/decidegpuusage.h
@@ -301,6 +301,24 @@ bool decideWhetherToUseGpuForUpdate(bool                           isDomainDecom
                                      const gmx::MDLogger&           mdlog);
  
  
+/*! \brief Decide whether to use GPU for halo exchange.
+ *
+ * \param[in]  devFlags                     GPU development / experimental feature flags.
+ * \param[in]  havePPDomainDecomposition    Whether PP domain decomposition is in use.
+ * \param[in]  useGpuForNonbonded           Whether GPUs will be used for nonbonded interactions.
+ * \param[in]  useModularSimulator          Whether the modular simulator is in use.
+ * \param[in]  doRerun                      Whether this is a rerun.
+ * \param[in]  haveEnergyMinimization       Whether energy minimization is in use.
+ * \returns    Whether to use GPU halo exchange: requires PP domain decomposition, GPU nonbondeds
+ *             and the development flag; never with modular simulator, rerun or minimization.
+ */
+bool decideWhetherToUseGpuForHalo(const DevelopmentFeatureFlags& devFlags,
+                                  bool                           havePPDomainDecomposition,
+                                  bool                           useGpuForNonbonded,
+                                  bool                           useModularSimulator,
+                                  bool                           doRerun,
+                                  bool                           haveEnergyMinimization);
+
  } // namespace gmx
  
  #endif
diff --git a/src/gromacs/taskassignment/decidesimulationworkload.cpp b/src/gromacs/taskassignment/decidesimulationworkload.cpp

index 06b1eeb29d2405228c5091997f013546adb1f9c2..5c5cdaeb3f606e453b68bdf9833152407f4d921f 100644 (file)
--- a/src/gromacs/taskassignment/decidesimulationworkload.cpp
+++ b/src/gromacs/taskassignment/decidesimulationworkload.cpp
@@ -58,7 +58,8 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
                                              bool                           useGpuForNonbonded,
                                              PmeRunMode                     pmeRunMode,
                                              bool                           useGpuForBonded,
-                                            bool                           useGpuForUpdate)
+                                            bool                           useGpuForUpdate,
+                                            bool                           useGpuDirectHalo)
  {
      SimulationWorkload simulationWorkload;
      simulationWorkload.computeNonbonded = !disableNonbondedCalculation;
@@ -75,7 +76,7 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
      simulationWorkload.useGpuUpdate    = useGpuForUpdate;
      simulationWorkload.useGpuBufferOps = (devFlags.enableGpuBufferOps || useGpuForUpdate)
                                           && !simulationWorkload.computeNonbondedAtMtsLevel1;
-    simulationWorkload.useGpuHaloExchange = devFlags.enableGpuHaloExchange;
+    simulationWorkload.useGpuHaloExchange = useGpuDirectHalo;
      simulationWorkload.useGpuPmePpCommunication =
              devFlags.enableGpuPmePPComm && (pmeRunMode == PmeRunMode::GPU);
      simulationWorkload.useGpuDirectCommunication =
diff --git a/src/gromacs/taskassignment/decidesimulationworkload.h b/src/gromacs/taskassignment/decidesimulationworkload.h

index 5cde42e10cbd150bf9910ae41a40830f1e9b64a1..c18f9a04d184b852fa84c579855e227aa111fd05 100644 (file)
--- a/src/gromacs/taskassignment/decidesimulationworkload.h
+++ b/src/gromacs/taskassignment/decidesimulationworkload.h
@@ -67,6 +67,7 @@ struct DevelopmentFeatureFlags;
   * \param[in] useGpuForBonded    Whether bonded interactions are calculated on GPU(s).
   * \param[in] useGpuForUpdate    Whether coordinate update and constraint solving is performed on
   *                               GPU(s).
+ * \param[in] useGpuDirectHalo   Whether halo exchange is performed directly between GPUs.
   * \returns Simulation lifetime constant workload description.
   */
  SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
@@ -75,7 +76,8 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
                                              bool                           useGpuForNonbonded,
                                              PmeRunMode                     pmeRunMode,
                                              bool                           useGpuForBonded,
-                                            bool                           useGpuForUpdate);
+                                            bool                           useGpuForUpdate,
+                                            bool                           useGpuDirectHalo);
  
  } // namespace gmx
author	Alan Gray <alangray3@gmail.com>
	Fri, 2 Oct 2020 07:02:31 +0000 (07:02 +0000)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Fri, 2 Oct 2020 07:02:31 +0000 (07:02 +0000)
src/gromacs/mdlib/sim_util.cpp		patch \| blob \| history
src/gromacs/mdrun/md.cpp		patch \| blob \| history
src/gromacs/mdrun/runner.cpp		patch \| blob \| history
src/gromacs/mdtypes/simulation_workload.h		patch \| blob \| history
src/gromacs/taskassignment/decidegpuusage.cpp		patch \| blob \| history
src/gromacs/taskassignment/decidegpuusage.h		patch \| blob \| history
src/gromacs/taskassignment/decidesimulationworkload.cpp		patch \| blob \| history
src/gromacs/taskassignment/decidesimulationworkload.h		patch \| blob \| history