From d41b5ee390b9bf44a68e7d038cc38d8c5a263996 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= Date: Thu, 19 Aug 2021 13:16:27 +0000 Subject: [PATCH] Add separate PME rank SimulationWorkload flags Added the complete set of flags that indicate whether separate PME ranks are used and what type of PP-PME communication is used. These flags allow simplifications in the force schedule as well as reducing the use of the confusingly defined cr->duty for now in do_force(). The additional benefit is a reduced reliance on passing around commrec for checks related to parallelization. Refs #3913 --- src/gromacs/mdlib/sim_util.cpp | 32 +++++++++---------- src/gromacs/mdrun/runner.cpp | 5 +++ src/gromacs/mdtypes/simulation_workload.h | 4 +++ .../decidesimulationworkload.cpp | 13 +++++++- .../taskassignment/decidesimulationworkload.h | 2 ++ 5 files changed, 39 insertions(+), 17 deletions(-) diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index bda10e7649..51777faf13 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -1147,19 +1147,19 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork, fr->gpuForceReduction[gmx::AtomLocality::Local]->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv)); if (runScheduleWork->simulationWork.useGpuPme - && (thisRankHasDuty(cr, DUTY_PME) || runScheduleWork->simulationWork.useGpuPmePpCommunication)) + && (!runScheduleWork->simulationWork.haveSeparatePmeRank + || runScheduleWork->simulationWork.useGpuPmePpCommunication)) { DeviceBuffer forcePtr = - thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata) - : // PME force buffer on same GPU - fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU + runScheduleWork->simulationWork.haveSeparatePmeRank + ? 
fr->pmePpCommGpu->getGpuForceStagingPtr() // buffer received from other GPU + : pme_gpu_get_device_f(fr->pmedata); // PME force buffer on same GPU fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr); GpuEventSynchronizer* const pmeSynchronizer = - (thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_f_ready_synchronizer(fr->pmedata) - : // PME force buffer on same GPU - fr->pmePpCommGpu->getForcesReadySynchronizer()); // buffer received from other GPU - + (runScheduleWork->simulationWork.haveSeparatePmeRank + ? fr->pmePpCommGpu->getForcesReadySynchronizer() // buffer received from other GPU + : pme_gpu_get_f_ready_synchronizer(fr->pmedata)); // PME force buffer on same GPU if (GMX_THREAD_MPI) { GMX_ASSERT(pmeSynchronizer != nullptr, "PME force ready cuda event should not be NULL"); @@ -1315,11 +1315,10 @@ void do_force(FILE* fplog, // to a remote task for halo exchange or PME-PP communication. At // search steps the current coordinates are already on the host, // hence copy is not needed. 
- const bool haveHostPmePpComms = - !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication; if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial - || haveHostPmePpComms || simulationWork.useCpuHaloExchange || simulationWork.computeMuTot)) + || simulationWork.useCpuPmePpCommunication || simulationWork.useCpuHaloExchange + || simulationWork.computeMuTot)) { stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local); haveCopiedXFromGpu = true; @@ -1352,7 +1351,7 @@ void do_force(FILE* fplog, } } - if (!thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces) + if (simulationWork.haveSeparatePmeRank && stepWork.computeSlowForces) { /* Send particle coordinates to the pme nodes */ if (!pmeSendCoordinatesFromGpu && !stepWork.doNeighborSearch && simulationWork.useGpuUpdate) @@ -1709,7 +1708,7 @@ void do_force(FILE* fplog, /* Reset energies */ reset_enerdata(enerd); - if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME)) + if (DOMAINDECOMP(cr) && simulationWork.haveSeparatePmeRank) { wallcycle_start(wcycle, WallCycleCounter::PpDuringPme); dd_force_flop_start(cr->dd, nrnb); @@ -2214,7 +2213,7 @@ void do_force(FILE* fplog, // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops // TODO refactor this and unify with below default-path call to the same function - if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && simulationWork.useGpuPmePpCommunication + if (PAR(cr) && simulationWork.haveSeparatePmeRank && simulationWork.useGpuPmePpCommunication && stepWork.computeSlowForces) { /* In case of node-splitting, the PP nodes receive the long-range @@ -2272,7 +2271,8 @@ void do_force(FILE* fplog, // NOTE: If there are virtual sites, the forces are modified on host after this D2H copy. Hence, // they should not be copied in do_md(...) for the output. 
if (!simulationWork.useGpuUpdate - || (simulationWork.useGpuUpdate && DOMAINDECOMP(cr) && haveHostPmePpComms) || vsite) + || (simulationWork.useGpuUpdate && DOMAINDECOMP(cr) && simulationWork.useCpuPmePpCommunication) + || vsite) { stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local); stateGpu->waitForcesReadyOnHost(AtomLocality::Local); @@ -2308,7 +2308,7 @@ void do_force(FILE* fplog, } // TODO refactor this and unify with above GPU PME-PP / GPU update path call to the same function - if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication + if (PAR(cr) && simulationWork.haveSeparatePmeRank && simulationWork.useCpuPmePpCommunication && stepWork.computeSlowForces) { /* In case of node-splitting, the PP nodes receive the long-range diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index 2462285bef..64ada7d8d8 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -1453,10 +1453,15 @@ int Mdrunner::mdrunner() EI_ENERGY_MINIMIZATION(inputrec->eI)); // Also populates the simulation constant workload description. + // Note: currently the default duty is DUTY_PP | DUTY_PME for all simulations, including those without PME, + // so this boolean is sufficient on all ranks to determine whether separate PME ranks are used, + // but this will no longer be the case if cr->duty is changed for !EEL_PME(fr->ic->eeltype). 
+ const bool haveSeparatePmeRank = (!thisRankHasDuty(cr, DUTY_PP) || !thisRankHasDuty(cr, DUTY_PME)); runScheduleWork.simulationWork = createSimulationWorkload(*inputrec, disableNonbondedCalculation, devFlags, havePPDomainDecomposition(cr), + haveSeparatePmeRank, useGpuForNonbonded, pmeRunMode, useGpuForBonded, diff --git a/src/gromacs/mdtypes/simulation_workload.h b/src/gromacs/mdtypes/simulation_workload.h index a00197dc18..1a2106e2b2 100644 --- a/src/gromacs/mdtypes/simulation_workload.h +++ b/src/gromacs/mdtypes/simulation_workload.h @@ -187,6 +187,10 @@ public: bool useCpuHaloExchange = false; //! If domain decomposition halo exchange is performed on GPU. bool useGpuHaloExchange = false; + //! If separate PME rank(s) are used. + bool haveSeparatePmeRank = false; + //! If PP-PME communication is done purely on CPU (in CPU-only runs or with staged GPU communication). + bool useCpuPmePpCommunication = false; //! If direct PP-PME communication between GPU is used. bool useGpuPmePpCommunication = false; //! If direct GPU-GPU communication is enabled. 
diff --git a/src/gromacs/taskassignment/decidesimulationworkload.cpp b/src/gromacs/taskassignment/decidesimulationworkload.cpp index e21c8580ad..f3a3a5cdec 100644 --- a/src/gromacs/taskassignment/decidesimulationworkload.cpp +++ b/src/gromacs/taskassignment/decidesimulationworkload.cpp @@ -56,6 +56,7 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec, const bool disableNonbondedCalculation, const DevelopmentFeatureFlags& devFlags, bool havePpDomainDecomposition, + bool haveSeparatePmeRank, bool useGpuForNonbonded, PmeRunMode pmeRunMode, bool useGpuForBonded, @@ -80,8 +81,18 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec, simulationWorkload.havePpDomainDecomposition = havePpDomainDecomposition; simulationWorkload.useCpuHaloExchange = havePpDomainDecomposition && !useGpuDirectHalo; simulationWorkload.useGpuHaloExchange = useGpuDirectHalo; + if (pmeRunMode == PmeRunMode::None) + { + GMX_RELEASE_ASSERT(!haveSeparatePmeRank, "Can not have separate PME rank(s) without PME."); + } + simulationWorkload.haveSeparatePmeRank = haveSeparatePmeRank; simulationWorkload.useGpuPmePpCommunication = - devFlags.enableGpuPmePPComm && (pmeRunMode == PmeRunMode::GPU); + haveSeparatePmeRank && devFlags.enableGpuPmePPComm && (pmeRunMode == PmeRunMode::GPU); + simulationWorkload.useCpuPmePpCommunication = + haveSeparatePmeRank && !simulationWorkload.useGpuPmePpCommunication; + GMX_RELEASE_ASSERT(!(simulationWorkload.useGpuPmePpCommunication + && simulationWorkload.useCpuPmePpCommunication), + "Cannot do PME-PP communication on both CPU and GPU"); simulationWorkload.useGpuDirectCommunication = devFlags.enableGpuHaloExchange || devFlags.enableGpuPmePPComm; simulationWorkload.haveEwaldSurfaceContribution = haveEwaldSurfaceContribution(inputrec); diff --git a/src/gromacs/taskassignment/decidesimulationworkload.h b/src/gromacs/taskassignment/decidesimulationworkload.h index e56e543d61..3ea543e562 100644 --- 
a/src/gromacs/taskassignment/decidesimulationworkload.h +++ b/src/gromacs/taskassignment/decidesimulationworkload.h @@ -62,6 +62,7 @@ struct DevelopmentFeatureFlags; * \param[in] disableNonbondedCalculation Disable calculation of nonbonded forces * \param[in] devFlags The development feature flags * \param[in] havePpDomainDecomposition Whether PP domain decomposition is used in this run. + * \param[in] haveSeparatePmeRank Whether separate PME rank(s) are used in this run. * \param[in] useGpuForNonbonded Whether we have short-range nonbonded interactions * calculations on GPU(s). * \param[in] pmeRunMode Run mode indicating what resource is PME execured on. @@ -75,6 +76,7 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec, bool disableNonbondedCalculation, const DevelopmentFeatureFlags& devFlags, bool havePpDomainDecomposition, + bool haveSeparatePmeRank, bool useGpuForNonbonded, PmeRunMode pmeRunMode, bool useGpuForBonded, -- 2.22.0