Add PP decomposition SimulationWorkload flags
author     Szilárd Páll <pall.szilard@gmail.com>
           Thu, 12 Aug 2021 16:57:42 +0000 (18:57 +0200)
committer  Szilárd Páll <pall.szilard@gmail.com>
           Wed, 18 Aug 2021 18:13:16 +0000 (20:13 +0200)
Add the complete set of flags that indicate whether there is PP domain
decomposition and whether it uses CPU or GPU communication.

These flags allow simplifications in the force schedule.

An additional benefit is reduced reliance on passing around the commrec
for parallelization-related checks.
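
The new flags are computed once in createSimulationWorkload(). A minimal sketch of how
they relate (mirroring the change in decidesimulationworkload.cpp below; nothing here goes
beyond what the patch itself does):

    simulationWorkload.havePpDomainDecomposition = havePpDomainDecomposition;
    simulationWorkload.useCpuHaloExchange        = havePpDomainDecomposition && !useGpuDirectHalo;
    simulationWorkload.useGpuHaloExchange        = useGpuDirectHalo;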

Refs #3913

src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/runner.cpp
src/gromacs/mdtypes/simulation_workload.h
src/gromacs/taskassignment/decidesimulationworkload.cpp
src/gromacs/taskassignment/decidesimulationworkload.h

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 8c4c86a52d750ca609fff405f8aed5f552daa5c8..bda10e7649fe2ccb76e040c05473478e1a853c55 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -900,12 +900,11 @@ static ForceOutputs setupForceOutputs(ForceHelperBuffers*                 forceH
 
 /*! \brief Set up flags that have the lifetime of the domain indicating what type of work is there to compute.
  */
-static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec& inputrec,
-                                                          const t_forcerec& fr,
-                                                          const pull_t*     pull_work,
-                                                          const gmx_edsam*  ed,
-                                                          const t_mdatoms&  mdatoms,
-                                                          bool havePPDomainDecomposition,
+static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec&         inputrec,
+                                                          const t_forcerec&         fr,
+                                                          const pull_t*             pull_work,
+                                                          const gmx_edsam*          ed,
+                                                          const t_mdatoms&          mdatoms,
                                                           const SimulationWorkload& simulationWork,
                                                           const StepWorkload&       stepWork)
 {
@@ -938,7 +937,7 @@ static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec& inpu
             || domainWork.haveFreeEnergyWork || simulationWork.useCpuNonbonded || simulationWork.useCpuPme
             || simulationWork.haveEwaldSurfaceContribution || inputrec.nwall > 0;
     domainWork.haveLocalForceContribInCpuBuffer =
-            domainWork.haveCpuLocalForceWork || havePPDomainDecomposition;
+            domainWork.haveCpuLocalForceWork || simulationWork.havePpDomainDecomposition;
     domainWork.haveNonLocalForceContribInCpuBuffer =
             domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork;
 
@@ -989,8 +988,11 @@ static StepWorkload setupStepWorkload(const int                     legacyFlags,
     flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !flags.computeVirial;
     flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps && simulationWork.useGpuPme
                                 && (rankHasPmeDuty || simulationWork.useGpuPmePpCommunication);
-    flags.useGpuXHalo          = simulationWork.useGpuHaloExchange;
-    flags.useGpuFHalo          = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
+    flags.useGpuXHalo = simulationWork.useGpuHaloExchange;
+    flags.useGpuFHalo = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
+    // Note that rankHasPmeDuty is used confusingly due to the way cr->duty is set up (can be true even for non-PME runs),
+    // but the haveGpuPmeOnThisRank still ends up correct as simulationWork.useGpuPme == false in such cases.
+    // TODO: improve this when duty-reliance is eliminated
     flags.haveGpuPmeOnThisRank = simulationWork.useGpuPme && rankHasPmeDuty && flags.computeSlowForces;
     flags.combineMtsForcesBeforeHaloExchange =
             (flags.computeForces && simulationWork.useMts && flags.computeSlowForces
@@ -1131,8 +1133,8 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
     gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 
     // (re-)initialize local GPU force reduction
-    const bool accumulate =
-            runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr);
+    const bool accumulate = runScheduleWork->domainWork.haveCpuLocalForceWork
+                            || runScheduleWork->simulationWork.havePpDomainDecomposition;
     const int atomStart = 0;
     fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(stateGpu->getForces(),
                                                             nbv->getNumAtoms(AtomLocality::Local),
@@ -1168,7 +1170,7 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
     if (runScheduleWork->domainWork.haveCpuLocalForceWork && !runScheduleWork->simulationWork.useGpuHaloExchange)
     {
         // in the DD case we use the same stream for H2D and reduction, hence no explicit dependency needed
-        if (!havePPDomainDecomposition(cr))
+        if (!runScheduleWork->simulationWork.havePpDomainDecomposition)
         {
             const bool useGpuForceBufferOps = true;
             fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
@@ -1182,7 +1184,7 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
                 cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
     }
 
-    if (havePPDomainDecomposition(cr))
+    if (runScheduleWork->simulationWork.havePpDomainDecomposition)
     {
         // (re-)initialize non-local GPU force reduction
         const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
@@ -1302,6 +1304,11 @@ void do_force(FILE*                               fplog,
                                                 AtomLocality::Local, simulationWork, stepWork)
                                         : nullptr;
 
+    GMX_ASSERT(simulationWork.useGpuHaloExchange
+                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
+               "The GPU halo exchange is active, but it has not been constructed.");
+
+    bool gmx_used_in_debug haveCopiedXFromGpu = false;
     // Copy coordinate from the GPU if update is on the GPU and there
     // are forces to be computed on the CPU, or for the computation of
     // virial, or if host-side data will be transferred from this task
@@ -1310,17 +1317,9 @@ void do_force(FILE*                               fplog,
     // hence copy is not needed.
     const bool haveHostPmePpComms =
             !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
-
-    GMX_ASSERT(simulationWork.useGpuHaloExchange
-                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
-               "The GPU halo exchange is active, but it has not been constructed.");
-    const bool haveHostHaloExchangeComms =
-            havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange;
-
-    bool gmx_used_in_debug haveCopiedXFromGpu = false;
     if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
         && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
-            || haveHostPmePpComms || haveHostHaloExchangeComms || simulationWork.computeMuTot))
+            || haveHostPmePpComms || simulationWork.useCpuHaloExchange || simulationWork.computeMuTot))
     {
         stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
         haveCopiedXFromGpu = true;
@@ -1336,7 +1335,7 @@ void do_force(FILE*                               fplog,
         {
             // TODO refactor this to do_md, after partitioning.
             stateGpu->reinit(mdatoms->homenr,
-                             getLocalAtomCount(cr->dd, *mdatoms, havePPDomainDecomposition(cr)));
+                             getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition));
             if (stepWork.haveGpuPmeOnThisRank)
             {
                 // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
@@ -1464,7 +1463,7 @@ void do_force(FILE*                               fplog,
         // Need to run after the GPU-offload bonded interaction lists
         // are set up to be able to determine whether there is bonded work.
         runScheduleWork->domainWork = setupDomainLifetimeWorkload(
-                inputrec, *fr, pull_work, ed, *mdatoms, havePPDomainDecomposition(cr), simulationWork, stepWork);
+                inputrec, *fr, pull_work, ed, *mdatoms, simulationWork, stepWork);
 
         wallcycle_start_nocount(wcycle, WallCycleCounter::NS);
         wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchLocal);
@@ -1523,7 +1522,7 @@ void do_force(FILE*                               fplog,
 
         // bonded work not split into separate local and non-local, so with DD
         // we can only launch the kernel after non-local coordinates have been received.
-        if (domainWork.haveGpuBondedWork && !havePPDomainDecomposition(cr))
+        if (domainWork.haveGpuBondedWork && !simulationWork.havePpDomainDecomposition)
         {
             fr->listedForcesGpu->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
         }
@@ -1550,7 +1549,7 @@ void do_force(FILE*                               fplog,
 
     /* Communicate coordinates and sum dipole if necessary +
        do non-local pair search */
-    if (havePPDomainDecomposition(cr))
+    if (simulationWork.havePpDomainDecomposition)
     {
         if (stepWork.doNeighborSearch)
         {
@@ -1646,7 +1645,7 @@ void do_force(FILE*                               fplog,
         wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
         wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
 
-        if (havePPDomainDecomposition(cr))
+        if (simulationWork.havePpDomainDecomposition)
         {
             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::NonLocal);
         }
@@ -1672,7 +1671,7 @@ void do_force(FILE*                               fplog,
     {
         const bool needCoordsOnHost  = (runScheduleWork->domainWork.haveCpuLocalForceWork
                                        || stepWork.computeVirial || simulationWork.computeMuTot);
-        const bool haveAlreadyWaited = haveHostHaloExchangeComms;
+        const bool haveAlreadyWaited = simulationWork.useCpuHaloExchange;
         if (needCoordsOnHost && !haveAlreadyWaited)
         {
             GMX_ASSERT(haveCopiedXFromGpu,
@@ -1736,7 +1735,7 @@ void do_force(FILE*                               fplog,
      * With multiple time-stepping the use is different for MTS fast (level0 only) and slow steps.
      */
     ForceOutputs forceOutMtsLevel0 = setupForceOutputs(
-            &fr->forceHelperBuffers[0], force, domainWork, stepWork, havePPDomainDecomposition(cr), wcycle);
+            &fr->forceHelperBuffers[0], force, domainWork, stepWork, simulationWork.havePpDomainDecomposition, wcycle);
 
     // Force output for MTS combined forces, only set at level1 MTS steps
     std::optional<ForceOutputs> forceOutMts =
@@ -1745,7 +1744,7 @@ void do_force(FILE*                               fplog,
                                                       forceView->forceMtsCombinedWithPadding(),
                                                       domainWork,
                                                       stepWork,
-                                                      havePPDomainDecomposition(cr),
+                                                      simulationWork.havePpDomainDecomposition,
                                                       wcycle))
                     : std::nullopt;
 
@@ -1807,7 +1806,7 @@ void do_force(FILE*                               fplog,
                 stepWork,
                 nrnb);
 
-        if (havePPDomainDecomposition(cr))
+        if (simulationWork.havePpDomainDecomposition)
         {
             nbv->dispatchFreeEnergyKernel(
                     InteractionLocality::NonLocal,
@@ -1838,7 +1837,7 @@ void do_force(FILE*                               fplog,
 
     if (stepWork.computeNonbondedForces && !useOrEmulateGpuNb)
     {
-        if (havePPDomainDecomposition(cr))
+        if (simulationWork.havePpDomainDecomposition)
         {
             do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle);
         }
@@ -2008,7 +2007,7 @@ void do_force(FILE*                               fplog,
                          ed,
                          stepWork.doNeighborSearch);
 
-    if (havePPDomainDecomposition(cr) && stepWork.computeForces && stepWork.useGpuFHalo
+    if (simulationWork.havePpDomainDecomposition && stepWork.computeForces && stepWork.useGpuFHalo
         && domainWork.haveCpuLocalForceWork)
     {
         stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(), AtomLocality::Local);
@@ -2026,7 +2025,7 @@ void do_force(FILE*                               fplog,
         auto& forceWithShiftForces = forceOutNonbonded->forceWithShiftForces();
 
         /* wait for non-local forces (or calculate in emulation mode) */
-        if (havePPDomainDecomposition(cr))
+        if (simulationWork.havePpDomainDecomposition)
         {
             if (simulationWork.useGpuNonbonded)
             {
@@ -2082,13 +2081,13 @@ void do_force(FILE*                               fplog,
      */
     if (stepWork.combineMtsForcesBeforeHaloExchange)
     {
-        combineMtsForces(getLocalAtomCount(cr->dd, *mdatoms, havePPDomainDecomposition(cr)),
+        combineMtsForces(getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition),
                          force.unpaddedArrayRef(),
                          forceView->forceMtsCombined(),
                          inputrec.mtsLevels[1].stepFactor);
     }
 
-    if (havePPDomainDecomposition(cr))
+    if (simulationWork.havePpDomainDecomposition)
     {
         /* We are done with the CPU compute.
          * We will now communicate the non-local forces.
@@ -2215,8 +2214,8 @@ void do_force(FILE*                               fplog,
 
     // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
     // TODO refactor this and unify with below default-path call to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces
-        && simulationWork.useGpuPmePpCommunication)
+    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && simulationWork.useGpuPmePpCommunication
+        && stepWork.computeSlowForces)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
@@ -2255,7 +2254,8 @@ void do_force(FILE*                               fplog,
                 // CPU force H2D with GPU force tasks on all streams including those in the
                 // local stream which would otherwise be implicit dependencies for the
                 // transfer and would not overlap.
-                auto locality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All;
+                auto locality = simulationWork.havePpDomainDecomposition ? AtomLocality::Local
+                                                                         : AtomLocality::All;
 
                 stateGpu->copyForcesToGpu(forceWithShift, locality);
             }
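
The sim_util.cpp hunks above all follow the same pattern: per-step checks that needed the
commrec are replaced by the SimulationWorkload flags computed once at setup. A
representative before/after sketch (illustrative only, condensed from the hunks above):

    // before: recomputed from the communication record at each call site
    const bool haveHostHaloExchangeComms =
            havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange;

    // after: read the precomputed flags
    if (simulationWork.havePpDomainDecomposition) { /* ... non-local work ... */ }
    const bool haveAlreadyWaited = simulationWork.useCpuHaloExchange;
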
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index ec71586d46c8f53936fceaf9d0d5b35a32d836f1..e3eb7ff8f227766936c457fa2636eca9d310aa18 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -1443,6 +1443,7 @@ int Mdrunner::mdrunner()
     runScheduleWork.simulationWork = createSimulationWorkload(*inputrec,
                                                               disableNonbondedCalculation,
                                                               devFlags,
+                                                              havePPDomainDecomposition(cr),
                                                               useGpuForNonbonded,
                                                               pmeRunMode,
                                                               useGpuForBonded,
diff --git a/src/gromacs/mdtypes/simulation_workload.h b/src/gromacs/mdtypes/simulation_workload.h
index 0d7ffe82255d5cf03a7187a92ce9adaf7999a73b..256d8366b88d23b96f6a9c14c657fa1736708d67 100644
--- a/src/gromacs/mdtypes/simulation_workload.h
+++ b/src/gromacs/mdtypes/simulation_workload.h
@@ -180,6 +180,10 @@ public:
     bool useGpuUpdate = false;
     //! If buffer operations are performed on GPU.
     bool useGpuBufferOps = false;
+    //! If PP domain decomposition is active.
+    bool havePpDomainDecomposition = false;
+    //! If domain decomposition halo exchange is performed on CPU (in CPU-only runs or with staged GPU communication).
+    bool useCpuHaloExchange = false;
     //! If domain decomposition halo exchange is performed on GPU.
     bool useGpuHaloExchange = false;
     //! If direct PP-PME communication between GPU is used.
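
With the header additions above, the three DD-related flags are mutually consistent. A
hedged sketch of the implied invariants (assuming, as elsewhere in the patch, that GPU
halo exchange is only enabled together with PP domain decomposition):

    GMX_ASSERT(!simulationWork.useGpuHaloExchange || simulationWork.havePpDomainDecomposition,
               "GPU halo exchange requires PP domain decomposition");
    GMX_ASSERT(simulationWork.useCpuHaloExchange
                       == (simulationWork.havePpDomainDecomposition && !simulationWork.useGpuHaloExchange),
               "The CPU halo exchange flag must be consistent with the DD and GPU halo exchange flags");
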
diff --git a/src/gromacs/taskassignment/decidesimulationworkload.cpp b/src/gromacs/taskassignment/decidesimulationworkload.cpp
index bd0c5a55a6c555b648f12ec9df4490f08d3d90c8..e21c8580ade5f573e58a79e3cb0fa08732ce6de4 100644
--- a/src/gromacs/taskassignment/decidesimulationworkload.cpp
+++ b/src/gromacs/taskassignment/decidesimulationworkload.cpp
@@ -55,11 +55,12 @@ namespace gmx
 SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
                                             const bool        disableNonbondedCalculation,
                                             const DevelopmentFeatureFlags& devFlags,
-                                            bool                           useGpuForNonbonded,
-                                            PmeRunMode                     pmeRunMode,
-                                            bool                           useGpuForBonded,
-                                            bool                           useGpuForUpdate,
-                                            bool                           useGpuDirectHalo)
+                                            bool       havePpDomainDecomposition,
+                                            bool       useGpuForNonbonded,
+                                            PmeRunMode pmeRunMode,
+                                            bool       useGpuForBonded,
+                                            bool       useGpuForUpdate,
+                                            bool       useGpuDirectHalo)
 {
     SimulationWorkload simulationWorkload;
     simulationWorkload.computeNonbonded = !disableNonbondedCalculation;
@@ -76,7 +77,9 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
     simulationWorkload.useGpuUpdate    = useGpuForUpdate;
     simulationWorkload.useGpuBufferOps = (devFlags.enableGpuBufferOps || useGpuForUpdate)
                                          && !simulationWorkload.computeNonbondedAtMtsLevel1;
-    simulationWorkload.useGpuHaloExchange = useGpuDirectHalo;
+    simulationWorkload.havePpDomainDecomposition = havePpDomainDecomposition;
+    simulationWorkload.useCpuHaloExchange        = havePpDomainDecomposition && !useGpuDirectHalo;
+    simulationWorkload.useGpuHaloExchange        = useGpuDirectHalo;
     simulationWorkload.useGpuPmePpCommunication =
             devFlags.enableGpuPmePPComm && (pmeRunMode == PmeRunMode::GPU);
     simulationWorkload.useGpuDirectCommunication =
diff --git a/src/gromacs/taskassignment/decidesimulationworkload.h b/src/gromacs/taskassignment/decidesimulationworkload.h
index c18f9a04d184b852fa84c579855e227aa111fd05..e56e543d61776c0acc94200584eaaf9a08c3df03 100644
--- a/src/gromacs/taskassignment/decidesimulationworkload.h
+++ b/src/gromacs/taskassignment/decidesimulationworkload.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -61,6 +61,7 @@ struct DevelopmentFeatureFlags;
  * \param[in] inputrec           The input record
  * \param[in] disableNonbondedCalculation  Disable calculation of nonbonded forces
  * \param[in] devFlags           The development feature flags
+ * \param[in] havePpDomainDecomposition Whether PP domain decomposition is used in this run.
  * \param[in] useGpuForNonbonded Whether we have short-range nonbonded interactions
  *                               calculations on GPU(s).
  * \param[in] pmeRunMode         Run mode indicating what resource is PME execured on.
@@ -73,11 +74,12 @@ struct DevelopmentFeatureFlags;
 SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
                                             bool              disableNonbondedCalculation,
                                             const DevelopmentFeatureFlags& devFlags,
-                                            bool                           useGpuForNonbonded,
-                                            PmeRunMode                     pmeRunMode,
-                                            bool                           useGpuForBonded,
-                                            bool                           useGpuForUpdate,
-                                            bool                           useGpuDirectHalo);
+                                            bool       havePpDomainDecomposition,
+                                            bool       useGpuForNonbonded,
+                                            PmeRunMode pmeRunMode,
+                                            bool       useGpuForBonded,
+                                            bool       useGpuForUpdate,
+                                            bool       useGpuDirectHalo);
 
 } // namespace gmx