Move PME-GPU do_force() local bool into StepWorkload
author    Szilárd Páll <pall.szilard@gmail.com>
Mon, 10 May 2021 08:51:17 +0000 (08:51 +0000)
committer Paul Bauer <paul.bauer.q@gmail.com>
Mon, 10 May 2021 08:51:17 +0000 (08:51 +0000)
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdtypes/simulation_workload.h

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 2f82ddc6d347f618b519630d6ca72236cbe5596c..2515cc148ca384d759b2dbc24b7267311c509353 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -982,8 +982,9 @@ static StepWorkload setupStepWorkload(const int                     legacyFlags,
     flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !flags.computeVirial;
     flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps && simulationWork.useGpuPme
                                 && (rankHasPmeDuty || simulationWork.useGpuPmePpCommunication);
-    flags.useGpuXHalo = simulationWork.useGpuHaloExchange;
-    flags.useGpuFHalo = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
+    flags.useGpuXHalo          = simulationWork.useGpuHaloExchange;
+    flags.useGpuFHalo          = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
+    flags.haveGpuPmeOnThisRank = simulationWork.useGpuPme && rankHasPmeDuty && flags.computeSlowForces;
 
     return flags;
 }
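
To make the new flag derivation concrete, here is a minimal, self-contained C++ sketch of the logic this hunk adds. The types are heavily reduced stand-ins for the real GROMACS classes (the real StepWorkload and SimulationWorkload live in src/gromacs/mdtypes/simulation_workload.h); only the three inputs that feed the new flag are kept.

    #include <cstdio>

    // Reduced stand-ins for the real GROMACS types (illustration only).
    struct SimulationWorkload
    {
        bool useGpuPme = false; // PME offloaded to a GPU in this run
    };

    struct StepWorkload
    {
        bool computeSlowForces    = false; // slow MTS forces evaluated this step
        bool haveGpuPmeOnThisRank = false; // the flag this commit adds
    };

    // Mirrors the logic added to setupStepWorkload() above: GPU PME work is
    // present on this rank only when PME runs on a GPU, this rank has PME
    // duty, and the current step computes the slow forces.
    static StepWorkload setupStepWorkload(const SimulationWorkload& simulationWork,
                                          bool                      rankHasPmeDuty,
                                          bool                      computeSlowForces)
    {
        StepWorkload flags;
        flags.computeSlowForces    = computeSlowForces;
        flags.haveGpuPmeOnThisRank = simulationWork.useGpuPme && rankHasPmeDuty && computeSlowForces;
        return flags;
    }

    int main()
    {
        SimulationWorkload simulationWork;
        simulationWork.useGpuPme = true;

        // With multiple time stepping (MTS), fast steps skip the slow forces,
        // so the flag is false there even though PME is GPU-resident.
        const StepWorkload fastStep = setupStepWorkload(simulationWork, true, false);
        const StepWorkload slowStep = setupStepWorkload(simulationWork, true, true);
        std::printf("fast MTS step: %d, slow step: %d\n",
                    fastStep.haveGpuPmeOnThisRank ? 1 : 0,
                    slowStep.haveGpuPmeOnThisRank ? 1 : 0);
        return 0;
    }
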
@@ -991,15 +992,12 @@ static StepWorkload setupStepWorkload(const int                     legacyFlags,
 
 /* \brief Launch end-of-step GPU tasks: buffer clearing and rolling pruning.
  *
- * TODO: eliminate \p useGpuPmeOnThisRank when this is
- * incorporated in DomainLifetimeWorkload.
  */
 static void launchGpuEndOfStepTasks(nonbonded_verlet_t*               nbv,
                                     gmx::GpuBonded*                   gpuBonded,
                                     gmx_pme_t*                        pmedata,
                                     gmx_enerdata_t*                   enerd,
                                     const gmx::MdrunScheduleWorkload& runScheduleWork,
-                                    bool                              useGpuPmeOnThisRank,
                                     int64_t                           step,
                                     gmx_wallcycle*                    wcycle)
 {
@@ -1022,7 +1020,7 @@ static void launchGpuEndOfStepTasks(nonbonded_verlet_t*               nbv,
         wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
     }
 
-    if (useGpuPmeOnThisRank)
+    if (runScheduleWork.stepWork.haveGpuPmeOnThisRank)
     {
         pme_gpu_reinit_computation(pmedata, wcycle);
     }
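
The interface effect of the hunk above, sketched with the same kind of reduced stand-in types (not the real GROMACS headers): the helper no longer needs the flag threaded through as an extra bool parameter, because the MdrunScheduleWorkload it already receives carries it in stepWork.

    // Stand-ins for illustration; the real types live in
    // src/gromacs/mdtypes/simulation_workload.h.
    struct StepWorkload
    {
        bool haveGpuPmeOnThisRank = false;
    };
    struct MdrunScheduleWorkload
    {
        StepWorkload stepWork; // per-step flags, set up once per step
    };

    // Before this commit the call site passed an extra argument:
    //   launchGpuEndOfStepTasks(..., *runScheduleWork, useGpuPmeOnThisRank, step, wcycle);
    // After it, the flag is read where it is consumed:
    static void launchGpuEndOfStepTasks(const MdrunScheduleWorkload& runScheduleWork)
    {
        if (runScheduleWork.stepWork.haveGpuPmeOnThisRank)
        {
            // pme_gpu_reinit_computation(pmedata, wcycle) runs here in the real code.
        }
    }

    int main()
    {
        MdrunScheduleWorkload runScheduleWork;
        runScheduleWork.stepWork.haveGpuPmeOnThisRank = true;
        launchGpuEndOfStepTasks(runScheduleWork);
        return 0;
    }
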
@@ -1239,9 +1237,6 @@ void do_force(FILE*                               fplog,
             legacyFlags, inputrec.mtsLevels, step, simulationWork, thisRankHasDuty(cr, DUTY_PME));
     const StepWorkload& stepWork = runScheduleWork->stepWork;
 
-    const bool useGpuPmeOnThisRank =
-            simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces;
-
     /* At a search step we need to start the first balancing region
      * somewhere early inside the step after communication during domain
      * decomposition (and not during the previous step as usual).
@@ -1282,9 +1277,9 @@ void do_force(FILE*                               fplog,
     const bool reinitGpuPmePpComms =
             GMX_MPI && simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
 
-    auto* localXReadyOnDevice = (useGpuPmeOnThisRank || simulationWork.useGpuBufferOps)
+    auto* localXReadyOnDevice = (stepWork.haveGpuPmeOnThisRank || simulationWork.useGpuBufferOps)
                                         ? stateGpu->getCoordinatesReadyOnDeviceEvent(
-                                                  AtomLocality::Local, simulationWork, stepWork)
+                                                AtomLocality::Local, simulationWork, stepWork)
                                         : nullptr;
 
     // Copy coordinate from the GPU if update is on the GPU and there
@@ -1315,14 +1310,14 @@ void do_force(FILE*                               fplog,
     // The local coordinates can be copied right away.
     // NOTE: Consider moving this copy to right after they are updated and constrained,
     //       if the latter is not offloaded.
-    if (useGpuPmeOnThisRank || stepWork.useGpuXBufferOps)
+    if (stepWork.haveGpuPmeOnThisRank || stepWork.useGpuXBufferOps)
     {
         if (stepWork.doNeighborSearch)
         {
             // TODO refactor this to do_md, after partitioning.
             stateGpu->reinit(mdatoms->homenr,
                              cr->dd != nullptr ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr);
-            if (useGpuPmeOnThisRank)
+            if (stepWork.haveGpuPmeOnThisRank)
             {
                 // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
                 pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
@@ -1363,7 +1358,7 @@ void do_force(FILE*                               fplog,
                                  wcycle);
     }
 
-    if (useGpuPmeOnThisRank)
+    if (stepWork.haveGpuPmeOnThisRank)
     {
         launchPmeGpuSpread(fr->pmedata,
                            box,
@@ -1519,7 +1514,7 @@ void do_force(FILE*                               fplog,
         wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
     }
 
-    if (useGpuPmeOnThisRank)
+    if (stepWork.haveGpuPmeOnThisRank)
     {
         // In PME GPU and mixed mode we launch FFT / gather after the
         // X copy/transform to allow overlap as well as after the GPU NB
@@ -1582,7 +1577,7 @@ void do_force(FILE*                               fplog,
 
             if (stepWork.useGpuXBufferOps)
             {
-                if (!useGpuPmeOnThisRank && !stepWork.useGpuXHalo)
+                if (!stepWork.haveGpuPmeOnThisRank && !stepWork.useGpuXHalo)
                 {
                     stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
                 }
@@ -2059,7 +2054,7 @@ void do_force(FILE*                               fplog,
     const bool combineMtsForcesBeforeHaloExchange =
             (stepWork.computeForces && fr->useMts && stepWork.computeSlowForces
              && (legacyFlags & GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE) != 0
-             && !(stepWork.computeVirial || simulationWork.useGpuNonbonded || useGpuPmeOnThisRank));
+             && !(stepWork.computeVirial || simulationWork.useGpuNonbonded || stepWork.haveGpuPmeOnThisRank));
     if (combineMtsForcesBeforeHaloExchange)
     {
         const int numAtoms = havePPDomainDecomposition(cr) ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr;
@@ -2117,8 +2112,9 @@ void do_force(FILE*                               fplog,
 
     // With both nonbonded and PME offloaded to a GPU on the same rank, we use
     // an alternating wait/reduction scheme.
-    bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPmeOnThisRank && simulationWork.useGpuNonbonded
-                             && !DOMAINDECOMP(cr) && !stepWork.useGpuFBufferOps);
+    bool alternateGpuWait =
+            (!c_disableAlternatingWait && stepWork.haveGpuPmeOnThisRank
+             && simulationWork.useGpuNonbonded && !DOMAINDECOMP(cr) && !stepWork.useGpuFBufferOps);
     if (alternateGpuWait)
     {
         alternatePmeNbGpuWaitReduce(fr->nbv.get(),
@@ -2131,7 +2127,7 @@ void do_force(FILE*                               fplog,
                                     wcycle);
     }
 
-    if (!alternateGpuWait && useGpuPmeOnThisRank)
+    if (!alternateGpuWait && stepWork.haveGpuPmeOnThisRank)
     {
         pme_gpu_wait_and_reduce(fr->pmedata,
                                 stepWork,
@@ -2272,8 +2268,7 @@ void do_force(FILE*                               fplog,
         }
     }
 
-    launchGpuEndOfStepTasks(
-            nbv, fr->gpuBonded, fr->pmedata, enerd, *runScheduleWork, useGpuPmeOnThisRank, step, wcycle);
+    launchGpuEndOfStepTasks(nbv, fr->gpuBonded, fr->pmedata, enerd, *runScheduleWork, step, wcycle);
 
     if (DOMAINDECOMP(cr))
     {
diff --git a/src/gromacs/mdtypes/simulation_workload.h b/src/gromacs/mdtypes/simulation_workload.h
index c24ba0724c5ade8e4b5d04594d49246086d0526a..281c1fc6eb7cd4f23cbdce5b8c39f75a7d5fdafd 100644
--- a/src/gromacs/mdtypes/simulation_workload.h
+++ b/src/gromacs/mdtypes/simulation_workload.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -97,6 +97,8 @@ public:
     bool useGpuXHalo = false;
     //! Whether GPU forces halo exchange is active this step
     bool useGpuFHalo = false;
+    //! Whether GPU PME work is computed on this rank this step (can be false on fast MTS steps even with GPU PME)
+    bool haveGpuPmeOnThisRank = false;
 };
 
 /*! \libinternal
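
A note on the design choice: the TODO removed in sim_util.cpp suggested folding the old local bool into DomainLifetimeWorkload, but its value can change from step to step under MTS (computeSlowForces toggles between fast and slow steps), while its other inputs (useGpuPme, PME duty) are constant for the run. A per-step flag therefore belongs in StepWorkload, which setupStepWorkload() recomputes every step.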