Add separate PME rank SimulationWorkload flags
authorSzilárd Páll <pall.szilard@gmail.com>
Thu, 19 Aug 2021 13:16:27 +0000 (13:16 +0000)
committerAndrey Alekseenko <al42and@gmail.com>
Thu, 19 Aug 2021 13:16:27 +0000 (13:16 +0000)
Added the complete set of flags that indicate whether separate PME ranks
are in use and which type of PP-PME communication is employed.

These flags allow simplifications in the force schedule and reduce the use
of the confusingly defined cr->duty, for now only in do_force().

An additional benefit is reduced reliance on passing commrec around for
parallelization-related checks.
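Illustrative sketch (not part of the patch itself) of how the new flags
relate and how a PP rank consults them instead of cr->duty; all names
below are taken from the changes further down:

    // Set up once per simulation in createSimulationWorkload():
    simulationWorkload.haveSeparatePmeRank = haveSeparatePmeRank;
    simulationWorkload.useGpuPmePpCommunication =
            haveSeparatePmeRank && devFlags.enableGpuPmePPComm && (pmeRunMode == PmeRunMode::GPU);
    simulationWorkload.useCpuPmePpCommunication =
            haveSeparatePmeRank && !simulationWorkload.useGpuPmePpCommunication;

    // Typical consumer in do_force(), replacing !thisRankHasDuty(cr, DUTY_PME):
    if (simulationWork.haveSeparatePmeRank && stepWork.computeSlowForces)
    {
        // Send coordinates to / receive forces from the separate PME rank(s),
        // via the GPU-direct or staged CPU path depending on
        // useGpuPmePpCommunication / useCpuPmePpCommunication.
    }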

Refs #3913

src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/runner.cpp
src/gromacs/mdtypes/simulation_workload.h
src/gromacs/taskassignment/decidesimulationworkload.cpp
src/gromacs/taskassignment/decidesimulationworkload.h

index bda10e7649fe2ccb76e040c05473478e1a853c55..51777faf13d2795c4405df87ccc0e81dd24140ef 100644 (file)
@@ -1147,19 +1147,19 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
     fr->gpuForceReduction[gmx::AtomLocality::Local]->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv));
 
     if (runScheduleWork->simulationWork.useGpuPme
-        && (thisRankHasDuty(cr, DUTY_PME) || runScheduleWork->simulationWork.useGpuPmePpCommunication))
+        && (!runScheduleWork->simulationWork.haveSeparatePmeRank
+            || runScheduleWork->simulationWork.useGpuPmePpCommunication))
     {
         DeviceBuffer<gmx::RVec> forcePtr =
-                thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata)
-                                              :                    // PME force buffer on same GPU
-                        fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU
+                runScheduleWork->simulationWork.haveSeparatePmeRank
+                        ? fr->pmePpCommGpu->getGpuForceStagingPtr() // buffer received from other GPU
+                        : pme_gpu_get_device_f(fr->pmedata);        // PME force buffer on same GPU
         fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr);
 
         GpuEventSynchronizer* const pmeSynchronizer =
-                (thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_f_ready_synchronizer(fr->pmedata)
-                                               : // PME force buffer on same GPU
-                         fr->pmePpCommGpu->getForcesReadySynchronizer()); // buffer received from other GPU
-
+                (runScheduleWork->simulationWork.haveSeparatePmeRank
+                         ? fr->pmePpCommGpu->getForcesReadySynchronizer() // buffer received from other GPU
+                         : pme_gpu_get_f_ready_synchronizer(fr->pmedata)); // PME force buffer on same GPU
         if (GMX_THREAD_MPI)
         {
             GMX_ASSERT(pmeSynchronizer != nullptr, "PME force ready cuda event should not be NULL");
@@ -1315,11 +1315,10 @@ void do_force(FILE*                               fplog,
     // to a remote task for halo exchange or PME-PP communication. At
     // search steps the current coordinates are already on the host,
     // hence copy is not needed.
-    const bool haveHostPmePpComms =
-            !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
     if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
         && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
-            || haveHostPmePpComms || simulationWork.useCpuHaloExchange || simulationWork.computeMuTot))
+            || simulationWork.useCpuPmePpCommunication || simulationWork.useCpuHaloExchange
+            || simulationWork.computeMuTot))
     {
         stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
         haveCopiedXFromGpu = true;
@@ -1352,7 +1351,7 @@ void do_force(FILE*                               fplog,
         }
     }
 
-    if (!thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces)
+    if (simulationWork.haveSeparatePmeRank && stepWork.computeSlowForces)
     {
         /* Send particle coordinates to the pme nodes */
         if (!pmeSendCoordinatesFromGpu && !stepWork.doNeighborSearch && simulationWork.useGpuUpdate)
@@ -1709,7 +1708,7 @@ void do_force(FILE*                               fplog,
     /* Reset energies */
     reset_enerdata(enerd);
 
-    if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
+    if (DOMAINDECOMP(cr) && simulationWork.haveSeparatePmeRank)
     {
         wallcycle_start(wcycle, WallCycleCounter::PpDuringPme);
         dd_force_flop_start(cr->dd, nrnb);
@@ -2214,7 +2213,7 @@ void do_force(FILE*                               fplog,
 
     // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
     // TODO refactor this and unify with below default-path call to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && simulationWork.useGpuPmePpCommunication
+    if (PAR(cr) && simulationWork.haveSeparatePmeRank && simulationWork.useGpuPmePpCommunication
         && stepWork.computeSlowForces)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
@@ -2272,7 +2271,8 @@ void do_force(FILE*                               fplog,
             // NOTE: If there are virtual sites, the forces are modified on host after this D2H copy. Hence,
             //       they should not be copied in do_md(...) for the output.
             if (!simulationWork.useGpuUpdate
-                || (simulationWork.useGpuUpdate && DOMAINDECOMP(cr) && haveHostPmePpComms) || vsite)
+                || (simulationWork.useGpuUpdate && DOMAINDECOMP(cr) && simulationWork.useCpuPmePpCommunication)
+                || vsite)
             {
                 stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
                 stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
@@ -2308,7 +2308,7 @@ void do_force(FILE*                               fplog,
     }
 
     // TODO refactor this and unify with above GPU PME-PP / GPU update path call to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication
+    if (PAR(cr) && simulationWork.haveSeparatePmeRank && simulationWork.useCpuPmePpCommunication
         && stepWork.computeSlowForces)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
index 2462285bef078b77394308accdff5101000d2387..64ada7d8d8a681ca4794da9735a091ce03a2b9c6 100644 (file)
@@ -1453,10 +1453,15 @@ int Mdrunner::mdrunner()
                                                          EI_ENERGY_MINIMIZATION(inputrec->eI));
 
     // Also populates the simulation constant workload description.
+    // Note: currently the default duty is DUTY_PP | DUTY_PME for all simulations, including those without PME,
+    // so this boolean is sufficient on all ranks to determine whether separate PME ranks are used,
+    // but this will no longer be the case if cr->duty is changed for !EEL_PME(fr->ic->eeltype).
+    const bool haveSeparatePmeRank = (!thisRankHasDuty(cr, DUTY_PP) || !thisRankHasDuty(cr, DUTY_PME));
     runScheduleWork.simulationWork = createSimulationWorkload(*inputrec,
                                                               disableNonbondedCalculation,
                                                               devFlags,
                                                               havePPDomainDecomposition(cr),
+                                                              haveSeparatePmeRank,
                                                               useGpuForNonbonded,
                                                               pmeRunMode,
                                                               useGpuForBonded,
index a00197dc184462ac9ea58b62df4f8cdb2f087c89..1a2106e2b2e5057f188d81a949660c793b00d356 100644 (file)
@@ -187,6 +187,10 @@ public:
     bool useCpuHaloExchange = false;
     //! If domain decomposition halo exchange is performed on GPU.
     bool useGpuHaloExchange = false;
+    //! If separate PME rank(s) are used.
+    bool haveSeparatePmeRank = false;
+    //! If PP-PME communication is done purely on CPU (in CPU-only runs or with staged GPU communication).
+    bool useCpuPmePpCommunication = false;
     //! If direct PP-PME communication between GPU is used.
     bool useGpuPmePpCommunication = false;
     //! If direct GPU-GPU communication is enabled.
index e21c8580ade5f573e58a79e3cb0fa08732ce6de4..f3a3a5cdec19e4a36ee699cfc6bd1323de9804c8 100644 (file)
@@ -56,6 +56,7 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
                                             const bool        disableNonbondedCalculation,
                                             const DevelopmentFeatureFlags& devFlags,
                                             bool       havePpDomainDecomposition,
+                                            bool       haveSeparatePmeRank,
                                             bool       useGpuForNonbonded,
                                             PmeRunMode pmeRunMode,
                                             bool       useGpuForBonded,
@@ -80,8 +81,18 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
     simulationWorkload.havePpDomainDecomposition = havePpDomainDecomposition;
     simulationWorkload.useCpuHaloExchange        = havePpDomainDecomposition && !useGpuDirectHalo;
     simulationWorkload.useGpuHaloExchange        = useGpuDirectHalo;
+    if (pmeRunMode == PmeRunMode::None)
+    {
+        GMX_RELEASE_ASSERT(!haveSeparatePmeRank, "Can not have separate PME rank(s) without PME.");
+    }
+    simulationWorkload.haveSeparatePmeRank = haveSeparatePmeRank;
     simulationWorkload.useGpuPmePpCommunication =
-            devFlags.enableGpuPmePPComm && (pmeRunMode == PmeRunMode::GPU);
+            haveSeparatePmeRank && devFlags.enableGpuPmePPComm && (pmeRunMode == PmeRunMode::GPU);
+    simulationWorkload.useCpuPmePpCommunication =
+            haveSeparatePmeRank && !simulationWorkload.useGpuPmePpCommunication;
+    GMX_RELEASE_ASSERT(!(simulationWorkload.useGpuPmePpCommunication
+                         && simulationWorkload.useCpuPmePpCommunication),
+                       "Cannot do PME-PP communication on both CPU and GPU");
     simulationWorkload.useGpuDirectCommunication =
             devFlags.enableGpuHaloExchange || devFlags.enableGpuPmePPComm;
     simulationWorkload.haveEwaldSurfaceContribution = haveEwaldSurfaceContribution(inputrec);
index e56e543d61776c0acc94200584eaaf9a08c3df03..3ea543e562cd061d906d975a591e66c4bf21de83 100644 (file)
@@ -62,6 +62,7 @@ struct DevelopmentFeatureFlags;
  * \param[in] disableNonbondedCalculation  Disable calculation of nonbonded forces
  * \param[in] devFlags           The development feature flags
  * \param[in] havePpDomainDecomposition Whether PP domain decomposition is used in this run.
+ * \param[in] haveSeparatePmeRank Whether separate PME rank(s) are used in this run.
  * \param[in] useGpuForNonbonded Whether we have short-range nonbonded interactions
  *                               calculations on GPU(s).
  * \param[in] pmeRunMode         Run mode indicating what resource is PME execured on.
@@ -75,6 +76,7 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
                                             bool              disableNonbondedCalculation,
                                             const DevelopmentFeatureFlags& devFlags,
                                             bool       havePpDomainDecomposition,
+                                            bool       haveSeparatePmeRank,
                                             bool       useGpuForNonbonded,
                                             PmeRunMode pmeRunMode,
                                             bool       useGpuForBonded,