From 817bebaea92727ec0ca9e9308b403ff10725e7fb Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?=
Date: Thu, 12 Aug 2021 18:57:42 +0200
Subject: [PATCH] Add PP decomposition simulationWorkload flags

Add the complete set of flags that indicate whether PP domain
decomposition is in use and whether its halo communication runs on the
CPU or on the GPU. These flags allow simplifications in the force
schedule. An additional benefit is reduced reliance on passing around
the commrec for parallelization-related checks.

Refs #3913
---
 src/gromacs/mdlib/sim_util.cpp                | 78 +++++++++----------
 src/gromacs/mdrun/runner.cpp                  |  1 +
 src/gromacs/mdtypes/simulation_workload.h     |  4 +
 .../decidesimulationworkload.cpp              | 15 ++--
 .../taskassignment/decidesimulationworkload.h | 14 ++--
 5 files changed, 61 insertions(+), 51 deletions(-)

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 8c4c86a52d..bda10e7649 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -900,12 +900,11 @@ static ForceOutputs setupForceOutputs(ForceHelperBuffers* forceH
 
 /*! \brief Set up flags that have the lifetime of the domain indicating what type of work is there to compute.
  */
-static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec& inputrec,
-                                                          const t_forcerec& fr,
-                                                          const pull_t* pull_work,
-                                                          const gmx_edsam* ed,
-                                                          const t_mdatoms& mdatoms,
-                                                          bool havePPDomainDecomposition,
+static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec& inputrec,
+                                                          const t_forcerec& fr,
+                                                          const pull_t* pull_work,
+                                                          const gmx_edsam* ed,
+                                                          const t_mdatoms& mdatoms,
                                                           const SimulationWorkload& simulationWork,
                                                           const StepWorkload& stepWork)
 {
@@ -938,7 +937,7 @@ static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec& inpu
             || domainWork.haveFreeEnergyWork || simulationWork.useCpuNonbonded || simulationWork.useCpuPme
             || simulationWork.haveEwaldSurfaceContribution || inputrec.nwall > 0;
     domainWork.haveLocalForceContribInCpuBuffer =
-            domainWork.haveCpuLocalForceWork || havePPDomainDecomposition;
+            domainWork.haveCpuLocalForceWork || simulationWork.havePpDomainDecomposition;
     domainWork.haveNonLocalForceContribInCpuBuffer =
             domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork;
 
@@ -989,8 +988,11 @@ static StepWorkload setupStepWorkload(const int legacyFlags,
     flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !flags.computeVirial;
     flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps && simulationWork.useGpuPme
                                 && (rankHasPmeDuty || simulationWork.useGpuPmePpCommunication);
-    flags.useGpuXHalo = simulationWork.useGpuHaloExchange;
-    flags.useGpuFHalo = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
+    flags.useGpuXHalo          = simulationWork.useGpuHaloExchange;
+    flags.useGpuFHalo          = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
+    // Note that rankHasPmeDuty is used confusingly here: due to the way cr->duty is set up, it can be true even for non-PME runs,
+    // but haveGpuPmeOnThisRank still ends up correct because simulationWork.useGpuPme == false in such cases.
+    // TODO: improve this when duty-reliance is eliminated
     flags.haveGpuPmeOnThisRank = simulationWork.useGpuPme && rankHasPmeDuty && flags.computeSlowForces;
     flags.combineMtsForcesBeforeHaloExchange =
             (flags.computeForces && simulationWork.useMts && flags.computeSlowForces
@@ -1131,8 +1133,8 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
     gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 
     // (re-)initialize local GPU force reduction
-    const bool accumulate =
-            runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr);
+    const bool accumulate = runScheduleWork->domainWork.haveCpuLocalForceWork
+                            || runScheduleWork->simulationWork.havePpDomainDecomposition;
     const int atomStart = 0;
     fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(stateGpu->getForces(),
                                                             nbv->getNumAtoms(AtomLocality::Local),
@@ -1168,7 +1170,7 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
     if (runScheduleWork->domainWork.haveCpuLocalForceWork && !runScheduleWork->simulationWork.useGpuHaloExchange)
     {
         // in the DD case we use the same stream for H2D and reduction, hence no explicit dependency needed
-        if (!havePPDomainDecomposition(cr))
+        if (!runScheduleWork->simulationWork.havePpDomainDecomposition)
         {
             const bool useGpuForceBufferOps = true;
             fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
@@ -1182,7 +1184,7 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
                 cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
     }
 
-    if (havePPDomainDecomposition(cr))
+    if (runScheduleWork->simulationWork.havePpDomainDecomposition)
     {
         // (re-)initialize non-local GPU force reduction
         const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
@@ -1302,6 +1304,11 @@ void do_force(FILE* fplog,
                                       AtomLocality::Local, simulationWork, stepWork)
                     : nullptr;
 
+    GMX_ASSERT(simulationWork.useGpuHaloExchange
+                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
+               "The GPU halo exchange is active, but it has not been constructed.");
+
+    bool gmx_used_in_debug haveCopiedXFromGpu = false;
     // Copy coordinate from the GPU if update is on the GPU and there
     // are forces to be computed on the CPU, or for the computation of
     // virial, or if host-side data will be transferred from this task
@@ -1310,17 +1317,9 @@ void do_force(FILE* fplog,
     // hence copy is not needed.
     const bool haveHostPmePpComms =
             !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
-
-    GMX_ASSERT(simulationWork.useGpuHaloExchange
-                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
-               "The GPU halo exchange is active, but it has not been constructed.");
-    const bool haveHostHaloExchangeComms =
-            havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange;
-
-    bool gmx_used_in_debug haveCopiedXFromGpu = false;
     if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
         && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
-            || haveHostPmePpComms || haveHostHaloExchangeComms || simulationWork.computeMuTot))
+            || haveHostPmePpComms || simulationWork.useCpuHaloExchange || simulationWork.computeMuTot))
     {
         stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
         haveCopiedXFromGpu = true;
@@ -1336,7 +1335,7 @@ void do_force(FILE* fplog,
     {
         // TODO refactor this to do_md, after partitioning.
         stateGpu->reinit(mdatoms->homenr,
-                         getLocalAtomCount(cr->dd, *mdatoms, havePPDomainDecomposition(cr)));
+                         getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition));
         if (stepWork.haveGpuPmeOnThisRank)
         {
             // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
@@ -1464,7 +1463,7 @@ void do_force(FILE* fplog,
         // Need to run after the GPU-offload bonded interaction lists
         // are set up to be able to determine whether there is bonded work.
         runScheduleWork->domainWork = setupDomainLifetimeWorkload(
-                inputrec, *fr, pull_work, ed, *mdatoms, havePPDomainDecomposition(cr), simulationWork, stepWork);
+                inputrec, *fr, pull_work, ed, *mdatoms, simulationWork, stepWork);
 
         wallcycle_start_nocount(wcycle, WallCycleCounter::NS);
         wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchLocal);
@@ -1523,7 +1522,7 @@ void do_force(FILE* fplog,
 
     // bonded work not split into separate local and non-local, so with DD
     // we can only launch the kernel after non-local coordinates have been received.
-    if (domainWork.haveGpuBondedWork && !havePPDomainDecomposition(cr))
+    if (domainWork.haveGpuBondedWork && !simulationWork.havePpDomainDecomposition)
     {
         fr->listedForcesGpu->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
     }
@@ -1550,7 +1549,7 @@ void do_force(FILE* fplog,
 
     /* Communicate coordinates and sum dipole if necessary +
        do non-local pair search */
-    if (havePPDomainDecomposition(cr))
+    if (simulationWork.havePpDomainDecomposition)
     {
         if (stepWork.doNeighborSearch)
         {
@@ -1646,7 +1645,7 @@ void do_force(FILE* fplog,
         wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
         wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
 
-        if (havePPDomainDecomposition(cr))
+        if (simulationWork.havePpDomainDecomposition)
         {
             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::NonLocal);
         }
@@ -1672,7 +1671,7 @@ void do_force(FILE* fplog,
     {
         const bool needCoordsOnHost = (runScheduleWork->domainWork.haveCpuLocalForceWork
                                        || stepWork.computeVirial || simulationWork.computeMuTot);
-        const bool haveAlreadyWaited = haveHostHaloExchangeComms;
+        const bool haveAlreadyWaited = simulationWork.useCpuHaloExchange;
         if (needCoordsOnHost && !haveAlreadyWaited)
         {
             GMX_ASSERT(haveCopiedXFromGpu,
@@ -1736,7 +1735,7 @@ void do_force(FILE* fplog,
      * With multiple time-stepping the use is different for MTS fast (level0 only) and slow steps.
      */
     ForceOutputs forceOutMtsLevel0 = setupForceOutputs(
-            &fr->forceHelperBuffers[0], force, domainWork, stepWork, havePPDomainDecomposition(cr), wcycle);
+            &fr->forceHelperBuffers[0], force, domainWork, stepWork, simulationWork.havePpDomainDecomposition, wcycle);
 
     // Force output for MTS combined forces, only set at level1 MTS steps
     std::optional<ForceOutputs> forceOutMts =
@@ -1745,7 +1744,7 @@ void do_force(FILE* fplog,
                                               forceView->forceMtsCombinedWithPadding(),
                                               domainWork,
                                               stepWork,
-                                              havePPDomainDecomposition(cr),
+                                              simulationWork.havePpDomainDecomposition,
                                               wcycle))
                     : std::nullopt;
 
@@ -1807,7 +1806,7 @@ void do_force(FILE* fplog,
                     stepWork,
                     nrnb);
 
-            if (havePPDomainDecomposition(cr))
+            if (simulationWork.havePpDomainDecomposition)
             {
                 nbv->dispatchFreeEnergyKernel(
                         InteractionLocality::NonLocal,
@@ -1838,7 +1837,7 @@ void do_force(FILE* fplog,
 
     if (stepWork.computeNonbondedForces && !useOrEmulateGpuNb)
     {
-        if (havePPDomainDecomposition(cr))
+        if (simulationWork.havePpDomainDecomposition)
         {
             do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle);
         }
@@ -2008,7 +2007,7 @@ void do_force(FILE* fplog,
                         ed,
                         stepWork.doNeighborSearch);
 
-    if (havePPDomainDecomposition(cr) && stepWork.computeForces && stepWork.useGpuFHalo
+    if (simulationWork.havePpDomainDecomposition && stepWork.computeForces && stepWork.useGpuFHalo
         && domainWork.haveCpuLocalForceWork)
     {
         stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(), AtomLocality::Local);
@@ -2026,7 +2025,7 @@ void do_force(FILE* fplog,
         auto& forceWithShiftForces = forceOutNonbonded->forceWithShiftForces();
 
         /* wait for non-local forces (or calculate in emulation mode) */
-        if (havePPDomainDecomposition(cr))
+        if (simulationWork.havePpDomainDecomposition)
         {
             if (simulationWork.useGpuNonbonded)
             {
@@ -2082,13 +2081,13 @@ void do_force(FILE* fplog,
          */
         if (stepWork.combineMtsForcesBeforeHaloExchange)
         {
-            combineMtsForces(getLocalAtomCount(cr->dd, *mdatoms, havePPDomainDecomposition(cr)),
+            combineMtsForces(getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition),
                              force.unpaddedArrayRef(),
                              forceView->forceMtsCombined(),
                              inputrec.mtsLevels[1].stepFactor);
         }
 
-        if (havePPDomainDecomposition(cr))
+        if (simulationWork.havePpDomainDecomposition)
         {
             /* We are done with the CPU compute.
              * We will now communicate the non-local forces.
@@ -2215,8 +2214,8 @@ void do_force(FILE* fplog,
 
     // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
    // TODO refactor this and unify with below default-path call to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces
-        && simulationWork.useGpuPmePpCommunication)
+    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && simulationWork.useGpuPmePpCommunication
+        && stepWork.computeSlowForces)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
@@ -2255,7 +2254,8 @@ void do_force(FILE* fplog,
             // CPU force H2D with GPU force tasks on all streams including those in the
             // local stream which would otherwise be implicit dependencies for the
             // transfer and would not overlap.
-            auto locality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All;
+            auto locality = simulationWork.havePpDomainDecomposition ? AtomLocality::Local
+                                                                     : AtomLocality::All;
             stateGpu->copyForcesToGpu(forceWithShift, locality);
         }
 
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index ec71586d46..e3eb7ff8f2 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -1443,6 +1443,7 @@ int Mdrunner::mdrunner()
         runScheduleWork.simulationWork = createSimulationWorkload(*inputrec,
                                                                   disableNonbondedCalculation,
                                                                   devFlags,
+                                                                  havePPDomainDecomposition(cr),
                                                                   useGpuForNonbonded,
                                                                   pmeRunMode,
                                                                   useGpuForBonded,
diff --git a/src/gromacs/mdtypes/simulation_workload.h b/src/gromacs/mdtypes/simulation_workload.h
index 0d7ffe8225..256d8366b8 100644
--- a/src/gromacs/mdtypes/simulation_workload.h
+++ b/src/gromacs/mdtypes/simulation_workload.h
@@ -180,6 +180,10 @@ public:
     bool useGpuUpdate = false;
     //! If buffer operations are performed on GPU.
     bool useGpuBufferOps = false;
+    //! If PP domain decomposition is active.
+    bool havePpDomainDecomposition = false;
+    //! If domain decomposition halo exchange is performed on CPU (in CPU-only runs or with staged GPU communication).
+    bool useCpuHaloExchange = false;
     //! If domain decomposition halo exchange is performed on GPU.
     bool useGpuHaloExchange = false;
     //! If direct PP-PME communication between GPUs is used.
diff --git a/src/gromacs/taskassignment/decidesimulationworkload.cpp b/src/gromacs/taskassignment/decidesimulationworkload.cpp
index bd0c5a55a6..e21c8580ad 100644
--- a/src/gromacs/taskassignment/decidesimulationworkload.cpp
+++ b/src/gromacs/taskassignment/decidesimulationworkload.cpp
@@ -55,11 +55,12 @@ namespace gmx
 SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
                                             const bool disableNonbondedCalculation,
                                             const DevelopmentFeatureFlags& devFlags,
-                                            bool useGpuForNonbonded,
-                                            PmeRunMode pmeRunMode,
-                                            bool useGpuForBonded,
-                                            bool useGpuForUpdate,
-                                            bool useGpuDirectHalo)
+                                            bool havePpDomainDecomposition,
+                                            bool useGpuForNonbonded,
+                                            PmeRunMode pmeRunMode,
+                                            bool useGpuForBonded,
+                                            bool useGpuForUpdate,
+                                            bool useGpuDirectHalo)
 {
     SimulationWorkload simulationWorkload;
     simulationWorkload.computeNonbonded = !disableNonbondedCalculation;
@@ -76,7 +77,9 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
     simulationWorkload.useGpuUpdate = useGpuForUpdate;
     simulationWorkload.useGpuBufferOps = (devFlags.enableGpuBufferOps || useGpuForUpdate)
                                          && !simulationWorkload.computeNonbondedAtMtsLevel1;
-    simulationWorkload.useGpuHaloExchange = useGpuDirectHalo;
+    simulationWorkload.havePpDomainDecomposition = havePpDomainDecomposition;
+    simulationWorkload.useCpuHaloExchange = havePpDomainDecomposition && !useGpuDirectHalo;
+    simulationWorkload.useGpuHaloExchange = useGpuDirectHalo;
     simulationWorkload.useGpuPmePpCommunication =
             devFlags.enableGpuPmePPComm && (pmeRunMode == PmeRunMode::GPU);
     simulationWorkload.useGpuDirectCommunication =
diff --git a/src/gromacs/taskassignment/decidesimulationworkload.h b/src/gromacs/taskassignment/decidesimulationworkload.h
index c18f9a04d1..e56e543d61 100644
--- a/src/gromacs/taskassignment/decidesimulationworkload.h
+++ b/src/gromacs/taskassignment/decidesimulationworkload.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -61,6 +61,7 @@ struct DevelopmentFeatureFlags;
  * \param[in] inputrec The input record
  * \param[in] disableNonbondedCalculation Disable calculation of nonbonded forces
  * \param[in] devFlags The development feature flags
+ * \param[in] havePpDomainDecomposition Whether PP domain decomposition is used in this run.
  * \param[in] useGpuForNonbonded Whether we have short-range nonbonded interaction
  *                               calculations on GPU(s).
  * \param[in] pmeRunMode Run mode indicating what resource PME is executed on.
@@ -73,11 +74,12 @@ struct DevelopmentFeatureFlags;
 SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
                                             bool disableNonbondedCalculation,
                                             const DevelopmentFeatureFlags& devFlags,
-                                            bool useGpuForNonbonded,
-                                            PmeRunMode pmeRunMode,
-                                            bool useGpuForBonded,
-                                            bool useGpuForUpdate,
-                                            bool useGpuDirectHalo);
+                                            bool havePpDomainDecomposition,
+                                            bool useGpuForNonbonded,
+                                            PmeRunMode pmeRunMode,
+                                            bool useGpuForBonded,
+                                            bool useGpuForUpdate,
+                                            bool useGpuDirectHalo);
 
 } // namespace gmx
-- 
2.22.0
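
Reviewer note: the relationship among the three new flags is easiest to see in isolation. Below is a minimal standalone C++ sketch of the derivation that createSimulationWorkload() now performs; SimulationWorkloadSketch and deriveHaloFlags are illustrative stand-ins, not GROMACS code, and the sketch assumes, as the mdrunner() call site is expected to guarantee, that useGpuDirectHalo can only be true when PP domain decomposition is active.

// Minimal standalone sketch of the halo-flag derivation; names are
// illustrative stand-ins, not GROMACS code.
#include <cassert>
#include <cstdio>
#include <initializer_list>

struct SimulationWorkloadSketch
{
    bool havePpDomainDecomposition = false; // PP domain decomposition active
    bool useCpuHaloExchange        = false; // staged/CPU halo communication
    bool useGpuHaloExchange        = false; // direct GPU halo communication
};

// Mirrors the assignments the patch adds to createSimulationWorkload().
SimulationWorkloadSketch deriveHaloFlags(bool havePpDomainDecomposition, bool useGpuDirectHalo)
{
    // Assumption (expected to be guaranteed by the caller in the real code
    // path): direct GPU halo exchange is only selected with decomposition.
    assert(!useGpuDirectHalo || havePpDomainDecomposition);

    SimulationWorkloadSketch w;
    w.havePpDomainDecomposition = havePpDomainDecomposition;
    w.useCpuHaloExchange        = havePpDomainDecomposition && !useGpuDirectHalo;
    w.useGpuHaloExchange        = useGpuDirectHalo;
    return w;
}

int main()
{
    for (bool dd : { false, true })
    {
        for (bool gpuDirectHalo : { false, true })
        {
            if (!dd && gpuDirectHalo)
            {
                continue; // excluded by the assumption above
            }
            const SimulationWorkloadSketch w = deriveHaloFlags(dd, gpuDirectHalo);
            // Exactly one halo-exchange flavor is active iff decomposition is on.
            assert((w.useCpuHaloExchange != w.useGpuHaloExchange) == w.havePpDomainDecomposition);
            std::printf("dd=%d gpuDirectHalo=%d -> cpuHalo=%d gpuHalo=%d\n",
                        dd, gpuDirectHalo, w.useCpuHaloExchange, w.useGpuHaloExchange);
        }
    }
    return 0;
}

Deriving useCpuHaloExchange once at workload-setup time is what enables the simplifications above: do_force() can test a single precomputed flag (e.g. for haveAlreadyWaited) instead of re-evaluating havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange at each use, which is the reduced commrec reliance the commit message refers to.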