Add PP decomposition SimulationWorkload flags
[alexxy/gromacs.git] / src / gromacs / mdlib / sim_util.cpp
index 8c4c86a52d750ca609fff405f8aed5f552daa5c8..bda10e7649fe2ccb76e040c05473478e1a853c55 100644
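For context: this change replaces per-step queries of havePPDomainDecomposition(cr), and the local haveHostHaloExchangeComms variable, with flags cached on SimulationWorkload. The snippet below is a minimal sketch of those two fields and how they could be populated, using names inferred from their usage in this diff (havePpDomainDecomposition, useCpuHaloExchange); it is not the actual GROMACS setup code.

struct SimulationWorkloadSketch
{
    // True when the run uses particle-particle (PP) domain decomposition.
    bool havePpDomainDecomposition = false;
    // True when halo exchange runs on the CPU, i.e. PP DD is active but GPU halo exchange is not.
    bool useCpuHaloExchange = false;
};

// Hypothetical one-time setup mirroring the per-step conditions removed below:
//   havePpDomainDecomposition <- havePPDomainDecomposition(cr)
//   useCpuHaloExchange        <- havePPDomainDecomposition(cr) && !useGpuHaloExchange
static SimulationWorkloadSketch makeSimulationWorkloadSketch(bool havePpDd, bool useGpuHaloExchange)
{
    SimulationWorkloadSketch work;
    work.havePpDomainDecomposition = havePpDd;
    work.useCpuHaloExchange        = havePpDd && !useGpuHaloExchange;
    return work;
}

With these flags computed once per simulation, the per-step code in do_force() no longer needs access to cr->dd to decide on halo-exchange, buffer-ops, and force-reduction work, which is what the hunks below implement.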
@@ -900,12 +900,11 @@ static ForceOutputs setupForceOutputs(ForceHelperBuffers*                 forceH
 
 /*! \brief Set up flags that have the lifetime of the domain indicating what type of work is there to compute.
  */
-static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec& inputrec,
-                                                          const t_forcerec& fr,
-                                                          const pull_t*     pull_work,
-                                                          const gmx_edsam*  ed,
-                                                          const t_mdatoms&  mdatoms,
-                                                          bool havePPDomainDecomposition,
+static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec&         inputrec,
+                                                          const t_forcerec&         fr,
+                                                          const pull_t*             pull_work,
+                                                          const gmx_edsam*          ed,
+                                                          const t_mdatoms&          mdatoms,
                                                           const SimulationWorkload& simulationWork,
                                                           const StepWorkload&       stepWork)
 {
@@ -938,7 +937,7 @@ static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec& inpu
             || domainWork.haveFreeEnergyWork || simulationWork.useCpuNonbonded || simulationWork.useCpuPme
             || simulationWork.haveEwaldSurfaceContribution || inputrec.nwall > 0;
     domainWork.haveLocalForceContribInCpuBuffer =
-            domainWork.haveCpuLocalForceWork || havePPDomainDecomposition;
+            domainWork.haveCpuLocalForceWork || simulationWork.havePpDomainDecomposition;
     domainWork.haveNonLocalForceContribInCpuBuffer =
             domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork;
 
@@ -989,8 +988,11 @@ static StepWorkload setupStepWorkload(const int                     legacyFlags,
     flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !flags.computeVirial;
     flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps && simulationWork.useGpuPme
                                 && (rankHasPmeDuty || simulationWork.useGpuPmePpCommunication);
-    flags.useGpuXHalo          = simulationWork.useGpuHaloExchange;
-    flags.useGpuFHalo          = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
+    flags.useGpuXHalo = simulationWork.useGpuHaloExchange;
+    flags.useGpuFHalo = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
+    // Note that rankHasPmeDuty is used confusingly here because of the way cr->duty is set up (it can be true even for non-PME runs),
+    // but haveGpuPmeOnThisRank still ends up correct, since simulationWork.useGpuPme == false in such cases.
+    // TODO: improve this when the reliance on duty is eliminated
     flags.haveGpuPmeOnThisRank = simulationWork.useGpuPme && rankHasPmeDuty && flags.computeSlowForces;
     flags.combineMtsForcesBeforeHaloExchange =
             (flags.computeForces && simulationWork.useMts && flags.computeSlowForces
@@ -1131,8 +1133,8 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
     gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 
     // (re-)initialize local GPU force reduction
-    const bool accumulate =
-            runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr);
+    const bool accumulate = runScheduleWork->domainWork.haveCpuLocalForceWork
+                            || runScheduleWork->simulationWork.havePpDomainDecomposition;
     const int atomStart = 0;
     fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(stateGpu->getForces(),
                                                             nbv->getNumAtoms(AtomLocality::Local),
@@ -1168,7 +1170,7 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
     if (runScheduleWork->domainWork.haveCpuLocalForceWork && !runScheduleWork->simulationWork.useGpuHaloExchange)
     {
         // in the DD case we use the same stream for H2D and reduction, hence no explicit dependency needed
-        if (!havePPDomainDecomposition(cr))
+        if (!runScheduleWork->simulationWork.havePpDomainDecomposition)
         {
             const bool useGpuForceBufferOps = true;
             fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
@@ -1182,7 +1184,7 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
                 cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
     }
 
-    if (havePPDomainDecomposition(cr))
+    if (runScheduleWork->simulationWork.havePpDomainDecomposition)
     {
         // (re-)initialize non-local GPU force reduction
         const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
@@ -1302,6 +1304,11 @@ void do_force(FILE*                               fplog,
                                                 AtomLocality::Local, simulationWork, stepWork)
                                         : nullptr;
 
+    GMX_ASSERT(simulationWork.useGpuHaloExchange
+                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
+               "The GPU halo exchange is active, but it has not been constructed.");
+
+    bool gmx_used_in_debug haveCopiedXFromGpu = false;
     // Copy coordinate from the GPU if update is on the GPU and there
     // are forces to be computed on the CPU, or for the computation of
     // virial, or if host-side data will be transferred from this task
@@ -1310,17 +1317,9 @@ void do_force(FILE*                               fplog,
     // hence copy is not needed.
     const bool haveHostPmePpComms =
             !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
-
-    GMX_ASSERT(simulationWork.useGpuHaloExchange
-                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
-               "The GPU halo exchange is active, but it has not been constructed.");
-    const bool haveHostHaloExchangeComms =
-            havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange;
-
-    bool gmx_used_in_debug haveCopiedXFromGpu = false;
     if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
         && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
-            || haveHostPmePpComms || haveHostHaloExchangeComms || simulationWork.computeMuTot))
+            || haveHostPmePpComms || simulationWork.useCpuHaloExchange || simulationWork.computeMuTot))
     {
         stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
         haveCopiedXFromGpu = true;
@@ -1336,7 +1335,7 @@ void do_force(FILE*                               fplog,
         {
             // TODO refactor this to do_md, after partitioning.
             stateGpu->reinit(mdatoms->homenr,
-                             getLocalAtomCount(cr->dd, *mdatoms, havePPDomainDecomposition(cr)));
+                             getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition));
             if (stepWork.haveGpuPmeOnThisRank)
             {
                 // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
@@ -1464,7 +1463,7 @@ void do_force(FILE*                               fplog,
         // Need to run after the GPU-offload bonded interaction lists
         // are set up to be able to determine whether there is bonded work.
         runScheduleWork->domainWork = setupDomainLifetimeWorkload(
-                inputrec, *fr, pull_work, ed, *mdatoms, havePPDomainDecomposition(cr), simulationWork, stepWork);
+                inputrec, *fr, pull_work, ed, *mdatoms, simulationWork, stepWork);
 
         wallcycle_start_nocount(wcycle, WallCycleCounter::NS);
         wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchLocal);
@@ -1523,7 +1522,7 @@ void do_force(FILE*                               fplog,
 
         // bonded work not split into separate local and non-local, so with DD
         // we can only launch the kernel after non-local coordinates have been received.
-        if (domainWork.haveGpuBondedWork && !havePPDomainDecomposition(cr))
+        if (domainWork.haveGpuBondedWork && !simulationWork.havePpDomainDecomposition)
         {
             fr->listedForcesGpu->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
         }
@@ -1550,7 +1549,7 @@ void do_force(FILE*                               fplog,
 
     /* Communicate coordinates and sum dipole if necessary +
        do non-local pair search */
-    if (havePPDomainDecomposition(cr))
+    if (simulationWork.havePpDomainDecomposition)
     {
         if (stepWork.doNeighborSearch)
         {
@@ -1646,7 +1645,7 @@ void do_force(FILE*                               fplog,
         wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
         wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
 
-        if (havePPDomainDecomposition(cr))
+        if (simulationWork.havePpDomainDecomposition)
         {
             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::NonLocal);
         }
@@ -1672,7 +1671,7 @@ void do_force(FILE*                               fplog,
     {
         const bool needCoordsOnHost  = (runScheduleWork->domainWork.haveCpuLocalForceWork
                                        || stepWork.computeVirial || simulationWork.computeMuTot);
-        const bool haveAlreadyWaited = haveHostHaloExchangeComms;
+        const bool haveAlreadyWaited = simulationWork.useCpuHaloExchange;
         if (needCoordsOnHost && !haveAlreadyWaited)
         {
             GMX_ASSERT(haveCopiedXFromGpu,
@@ -1736,7 +1735,7 @@ void do_force(FILE*                               fplog,
      * With multiple time-stepping the use is different for MTS fast (level0 only) and slow steps.
      */
     ForceOutputs forceOutMtsLevel0 = setupForceOutputs(
-            &fr->forceHelperBuffers[0], force, domainWork, stepWork, havePPDomainDecomposition(cr), wcycle);
+            &fr->forceHelperBuffers[0], force, domainWork, stepWork, simulationWork.havePpDomainDecomposition, wcycle);
 
     // Force output for MTS combined forces, only set at level1 MTS steps
     std::optional<ForceOutputs> forceOutMts =
@@ -1745,7 +1744,7 @@ void do_force(FILE*                               fplog,
                                                       forceView->forceMtsCombinedWithPadding(),
                                                       domainWork,
                                                       stepWork,
-                                                      havePPDomainDecomposition(cr),
+                                                      simulationWork.havePpDomainDecomposition,
                                                       wcycle))
                     : std::nullopt;
 
@@ -1807,7 +1806,7 @@ void do_force(FILE*                               fplog,
                 stepWork,
                 nrnb);
 
-        if (havePPDomainDecomposition(cr))
+        if (simulationWork.havePpDomainDecomposition)
         {
             nbv->dispatchFreeEnergyKernel(
                     InteractionLocality::NonLocal,
@@ -1838,7 +1837,7 @@ void do_force(FILE*                               fplog,
 
     if (stepWork.computeNonbondedForces && !useOrEmulateGpuNb)
     {
-        if (havePPDomainDecomposition(cr))
+        if (simulationWork.havePpDomainDecomposition)
         {
             do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle);
         }
@@ -2008,7 +2007,7 @@ void do_force(FILE*                               fplog,
                          ed,
                          stepWork.doNeighborSearch);
 
-    if (havePPDomainDecomposition(cr) && stepWork.computeForces && stepWork.useGpuFHalo
+    if (simulationWork.havePpDomainDecomposition && stepWork.computeForces && stepWork.useGpuFHalo
         && domainWork.haveCpuLocalForceWork)
     {
         stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(), AtomLocality::Local);
@@ -2026,7 +2025,7 @@ void do_force(FILE*                               fplog,
         auto& forceWithShiftForces = forceOutNonbonded->forceWithShiftForces();
 
         /* wait for non-local forces (or calculate in emulation mode) */
-        if (havePPDomainDecomposition(cr))
+        if (simulationWork.havePpDomainDecomposition)
         {
             if (simulationWork.useGpuNonbonded)
             {
@@ -2082,13 +2081,13 @@ void do_force(FILE*                               fplog,
      */
     if (stepWork.combineMtsForcesBeforeHaloExchange)
     {
-        combineMtsForces(getLocalAtomCount(cr->dd, *mdatoms, havePPDomainDecomposition(cr)),
+        combineMtsForces(getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition),
                          force.unpaddedArrayRef(),
                          forceView->forceMtsCombined(),
                          inputrec.mtsLevels[1].stepFactor);
     }
 
-    if (havePPDomainDecomposition(cr))
+    if (simulationWork.havePpDomainDecomposition)
     {
         /* We are done with the CPU compute.
          * We will now communicate the non-local forces.
@@ -2215,8 +2214,8 @@ void do_force(FILE*                               fplog,
 
     // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
     // TODO refactor this and unify with below default-path call to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces
-        && simulationWork.useGpuPmePpCommunication)
+    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && simulationWork.useGpuPmePpCommunication
+        && stepWork.computeSlowForces)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
@@ -2255,7 +2254,8 @@ void do_force(FILE*                               fplog,
                 // CPU force H2D with GPU force tasks on all streams including those in the
                 // local stream which would otherwise be implicit dependencies for the
                 // transfer and would not overlap.
-                auto locality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All;
+                auto locality = simulationWork.havePpDomainDecomposition ? AtomLocality::Local
+                                                                         : AtomLocality::All;
 
                 stateGpu->copyForcesToGpu(forceWithShift, locality);
             }