From ed556fe1766fa4263c0315b548886dd7d497ff40 Mon Sep 17 00:00:00 2001 From: Alan Gray Date: Fri, 2 Oct 2020 07:02:31 +0000 Subject: [PATCH] Use workload data structures for GPU halo exchange triggers Move GPU halo exchange trigger booleans and related conditionals into workload data structures, and remove unnecessary assertion on GPU buffer ops being active (since it is now automatically activated when GPU halo exchange is active). Partly addresses #3370 --- src/gromacs/mdlib/sim_util.cpp | 48 +++++++++---------- src/gromacs/mdrun/md.cpp | 2 +- src/gromacs/mdrun/runner.cpp | 11 +++-- src/gromacs/mdtypes/simulation_workload.h | 4 ++ src/gromacs/taskassignment/decidegpuusage.cpp | 11 +++++ src/gromacs/taskassignment/decidegpuusage.h | 18 +++++++ .../decidesimulationworkload.cpp | 5 +- .../taskassignment/decidesimulationworkload.h | 4 +- 8 files changed, 70 insertions(+), 33 deletions(-) diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 2af9a25280..ef27166978 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -911,6 +911,8 @@ static StepWorkload setupStepWorkload(const int legacyFlags, flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !flags.computeVirial; flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps && simulationWork.useGpuPme && (rankHasPmeDuty || simulationWork.useGpuPmePpCommunication); + flags.useGpuXHalo = simulationWork.useGpuHaloExchange; + flags.useGpuFHalo = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps; return flags; } @@ -1037,12 +1039,10 @@ static void combineMtsForces(const int numAtoms, * \param [in] runScheduleWork Schedule workload flag structure * \param [in] cr Communication record object * \param [in] fr Force record object - * \param [in] ddUsesGpuDirectCommunication Whether GPU direct communication is in use */ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork, const t_commrec* cr, - t_forcerec* fr, - bool ddUsesGpuDirectCommunication) + t_forcerec* fr) { nonbonded_verlet_t* nbv = fr->nbv.get(); @@ -1075,13 +1075,13 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork, } if ((runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr)) - && !ddUsesGpuDirectCommunication) + && !runScheduleWork->simulationWork.useGpuHaloExchange) { fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency( stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::Local, true)); } - if (ddUsesGpuDirectCommunication) + if (runScheduleWork->simulationWork.useGpuHaloExchange) { fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency( cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent()); @@ -1246,17 +1246,6 @@ void do_force(FILE* fplog, } } - // TODO Update this comment when introducing SimulationWorkload - // - // The conditions for gpuHaloExchange e.g. using GPU buffer - // operations were checked before construction, so here we can - // just use it and assert upon any conditions. - const bool ddUsesGpuDirectCommunication = - ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())); - GMX_ASSERT(!ddUsesGpuDirectCommunication || stepWork.useGpuXBufferOps, - "Must use coordinate buffer ops with GPU halo exchange"); - const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && stepWork.useGpuFBufferOps; - // Copy coordinate from the GPU if update is on the GPU and there // are forces to be computed on the CPU, or for the computation of // virial, or if host-side data will be transferred from this task @@ -1265,7 +1254,12 @@ void do_force(FILE* fplog, // hence copy is not needed. const bool haveHostPmePpComms = !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication; - const bool haveHostHaloExchangeComms = havePPDomainDecomposition(cr) && !ddUsesGpuDirectCommunication; + + GMX_ASSERT(simulationWork.useGpuHaloExchange + == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())), + "The GPU halo exchange is active, but it has not been constructed."); + const bool haveHostHaloExchangeComms = + havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange; bool gmx_used_in_debug haveCopiedXFromGpu = false; if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch @@ -1383,7 +1377,7 @@ void do_force(FILE* fplog, if (simulationWork.useGpuBufferOps) { - setupGpuForceReductions(runScheduleWork, cr, fr, ddUsesGpuDirectCommunication); + setupGpuForceReductions(runScheduleWork, cr, fr); } } else if (!EI_TPI(inputrec->eI) && stepWork.computeNonbondedForces) @@ -1466,14 +1460,14 @@ void do_force(FILE* fplog, // to location in do_md where GPU halo exchange is // constructed at partitioning, after above stateGpu // re-initialization has similarly been refactored - if (ddUsesGpuDirectCommunication) + if (simulationWork.useGpuHaloExchange) { reinitGpuHaloExchange(*cr, stateGpu->getCoordinates(), stateGpu->getForces()); } } else { - if (ddUsesGpuDirectCommunication) + if (stepWork.useGpuXHalo) { // The following must be called after local setCoordinates (which records an event // when the coordinate data has been copied to the device). @@ -1496,7 +1490,7 @@ void do_force(FILE* fplog, if (stepWork.useGpuXBufferOps) { - if (!useGpuPmeOnThisRank && !ddUsesGpuDirectCommunication) + if (!useGpuPmeOnThisRank && !stepWork.useGpuXHalo) { stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal); } @@ -1706,7 +1700,7 @@ void do_force(FILE* fplog, } // TODO Force flags should include haveFreeEnergyWork for this domain - if (ddUsesGpuDirectCommunication && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)) + if (stepWork.useGpuXHalo && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)) { /* Wait for non-local coordinate data to be copied from device */ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal); @@ -1792,7 +1786,7 @@ void do_force(FILE* fplog, GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFBufferOps), "The schedule below does not allow for nonbonded MTS with GPU buffer ops"); - GMX_ASSERT(!(nonbondedAtMtsLevel1 && useGpuForcesHaloExchange), + GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFHalo), "The schedule below does not allow for nonbonded MTS with GPU halo exchange"); // Will store the amount of cycles spent waiting for the GPU that // will be later used in the DLB accounting. @@ -1832,9 +1826,10 @@ void do_force(FILE* fplog, AtomLocality::NonLocal); } + fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->execute(); - if (!useGpuForcesHaloExchange) + if (!stepWork.useGpuFHalo) { // copy from GPU input for dd_move_f() stateGpu->copyForcesFromGpu(forceOutMtsLevel0.forceWithShiftForces().force(), @@ -1878,7 +1873,8 @@ void do_force(FILE* fplog, if (stepWork.computeForces) { - if (useGpuForcesHaloExchange) + + if (stepWork.useGpuFHalo) { if (domainWork.haveCpuLocalForceWork) { @@ -2004,7 +2000,7 @@ void do_force(FILE* fplog, // - copy is not perfomed if GPU force halo exchange is active, because it would overwrite the result // of the halo exchange. In that case the copy is instead performed above, before the exchange. // These should be unified. - if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange) + if (haveLocalForceContribInCpuBuffer && !stepWork.useGpuFHalo) { // Note: AtomLocality::All is used for the non-DD case because, as in this // case copyForcesToGpu() uses a separate stream, it allows overlap of diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index e2f5581f3a..bf7aa914be 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp @@ -850,7 +850,7 @@ void gmx::LegacySimulator::do_md() } // Allocate or re-size GPU halo exchange object, if necessary - if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange && useGpuForNonbonded) + if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange) { GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr, "GPU device manager has to be initialized to use GPU " diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index d32550e182..b22881a05d 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -1259,10 +1259,15 @@ int Mdrunner::mdrunner() } MdrunScheduleWorkload runScheduleWork; + + bool useGpuDirectHalo = decideWhetherToUseGpuForHalo( + devFlags, havePPDomainDecomposition(cr), useGpuForNonbonded, useModularSimulator, + doRerun, EI_ENERGY_MINIMIZATION(inputrec->eI)); + // Also populates the simulation constant workload description. - runScheduleWork.simulationWork = - createSimulationWorkload(*inputrec, disableNonbondedCalculation, devFlags, - useGpuForNonbonded, pmeRunMode, useGpuForBonded, useGpuForUpdate); + runScheduleWork.simulationWork = createSimulationWorkload( + *inputrec, disableNonbondedCalculation, devFlags, useGpuForNonbonded, pmeRunMode, + useGpuForBonded, useGpuForUpdate, useGpuDirectHalo); std::unique_ptr deviceStreamManager = nullptr; diff --git a/src/gromacs/mdtypes/simulation_workload.h b/src/gromacs/mdtypes/simulation_workload.h index f2d08c5847..c24ba0724c 100644 --- a/src/gromacs/mdtypes/simulation_workload.h +++ b/src/gromacs/mdtypes/simulation_workload.h @@ -93,6 +93,10 @@ public: bool useGpuFBufferOps = false; //! Whether PME forces are reduced with other contributions on the GPU this step bool useGpuPmeFReduction = false; // TODO: add this flag to the internal PME GPU data structures too + //! Whether GPU coordinates halo exchange is active this step + bool useGpuXHalo = false; + //! Whether GPU forces halo exchange is active this step + bool useGpuFHalo = false; }; /*! \libinternal diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp index 85dfe60e93..be1657af8e 100644 --- a/src/gromacs/taskassignment/decidegpuusage.cpp +++ b/src/gromacs/taskassignment/decidegpuusage.cpp @@ -714,4 +714,15 @@ bool decideWhetherToUseGpuForUpdate(const bool isDomainDecom || (updateTarget == TaskTarget::Auto && devFlags.forceGpuUpdateDefault)); } +bool decideWhetherToUseGpuForHalo(const DevelopmentFeatureFlags& devFlags, + bool havePPDomainDecomposition, + bool useGpuForNonbonded, + bool useModularSimulator, + bool doRerun, + bool haveEnergyMinimization) +{ + return havePPDomainDecomposition && devFlags.enableGpuHaloExchange && useGpuForNonbonded + && !useModularSimulator && !doRerun && !haveEnergyMinimization; +} + } // namespace gmx diff --git a/src/gromacs/taskassignment/decidegpuusage.h b/src/gromacs/taskassignment/decidegpuusage.h index 5d15244416..b151c5d6ea 100644 --- a/src/gromacs/taskassignment/decidegpuusage.h +++ b/src/gromacs/taskassignment/decidegpuusage.h @@ -301,6 +301,24 @@ bool decideWhetherToUseGpuForUpdate(bool isDomainDecom const gmx::MDLogger& mdlog); +/*! \brief Decide whether to use GPU for halo exchange. + * + * \param[in] devFlags GPU development / experimental feature flags. + * \param[in] havePPDomainDecomposition Whether PP domain decomposition is in use. + * \param[in] useGpuForNonbonded Whether GPUs will be used for nonbonded interactions. + * \param[in] useModularSimulator Whether modularsimulator is in use. + * \param[in] doRerun Whether this is a rerun. + * \param[in] haveEnergyMinimization Whether energy minimization is in use. + * + * \returns Whether halo exchange can be run on GPU. + */ +bool decideWhetherToUseGpuForHalo(const DevelopmentFeatureFlags& devFlags, + bool havePPDomainDecomposition, + bool useGpuForNonbonded, + bool useModularSimulator, + bool doRerun, + bool haveEnergyMinimization); + } // namespace gmx #endif diff --git a/src/gromacs/taskassignment/decidesimulationworkload.cpp b/src/gromacs/taskassignment/decidesimulationworkload.cpp index 06b1eeb29d..5c5cdaeb3f 100644 --- a/src/gromacs/taskassignment/decidesimulationworkload.cpp +++ b/src/gromacs/taskassignment/decidesimulationworkload.cpp @@ -58,7 +58,8 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec, bool useGpuForNonbonded, PmeRunMode pmeRunMode, bool useGpuForBonded, - bool useGpuForUpdate) + bool useGpuForUpdate, + bool useGpuDirectHalo) { SimulationWorkload simulationWorkload; simulationWorkload.computeNonbonded = !disableNonbondedCalculation; @@ -75,7 +76,7 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec, simulationWorkload.useGpuUpdate = useGpuForUpdate; simulationWorkload.useGpuBufferOps = (devFlags.enableGpuBufferOps || useGpuForUpdate) && !simulationWorkload.computeNonbondedAtMtsLevel1; - simulationWorkload.useGpuHaloExchange = devFlags.enableGpuHaloExchange; + simulationWorkload.useGpuHaloExchange = useGpuDirectHalo; simulationWorkload.useGpuPmePpCommunication = devFlags.enableGpuPmePPComm && (pmeRunMode == PmeRunMode::GPU); simulationWorkload.useGpuDirectCommunication = diff --git a/src/gromacs/taskassignment/decidesimulationworkload.h b/src/gromacs/taskassignment/decidesimulationworkload.h index 5cde42e10c..c18f9a04d1 100644 --- a/src/gromacs/taskassignment/decidesimulationworkload.h +++ b/src/gromacs/taskassignment/decidesimulationworkload.h @@ -67,6 +67,7 @@ struct DevelopmentFeatureFlags; * \param[in] useGpuForBonded Whether bonded interactions are calculated on GPU(s). * \param[in] useGpuForUpdate Whether coordinate update and constraint solving is performed on * GPU(s). + * \param[in] useGpuDirectHalo Whether halo exchange is performed directly between GPUs. * \returns Simulation lifetime constant workload description. */ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec, @@ -75,7 +76,8 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec, bool useGpuForNonbonded, PmeRunMode pmeRunMode, bool useGpuForBonded, - bool useGpuForUpdate); + bool useGpuForUpdate, + bool useGpuDirectHalo); } // namespace gmx -- 2.22.0