From 78956ab8cc33987eacc995cf578e8869cbda5e6a Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?=
Date: Wed, 9 Oct 2019 02:53:24 +0200
Subject: [PATCH] Move buffer ops / PME F reduction flags into StepWorkload

Also moved the override conditions for when buffer ops cannot be
offloaded into the DevelopmentFeatureFlags data structure
initialization, which had to be shifted so this code can be passed
the task assignment decision on nonbonded offload.

Change-Id: Ib6850bcf306a70bbd9557cf2d5c2b1e39159e566
---
 src/gromacs/mdlib/sim_util.cpp            | 55 ++++++++++++++---
 src/gromacs/mdrun/runner.cpp              | 17 ++++---
 src/gromacs/mdtypes/simulation_workload.h | 10 +++++
 3 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 8386e34139..32c30aab8f 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -613,17 +613,15 @@ static int makePmeFlags(const StepWorkload &stepWork)
  * \param[in]  box                   The box matrix
  * \param[in]  stepWork              Step schedule flags
  * \param[in]  pmeFlags              PME flags
- * \param[in]  useGpuForceReduction  True if GPU-based force reduction is active this step
  * \param[in]  wcycle                The wallcycle structure
  */
 static inline void launchPmeGpuSpread(gmx_pme_t          *pmedata,
                                       const matrix        box,
                                       const StepWorkload &stepWork,
                                       int                 pmeFlags,
-                                      bool                useGpuForceReduction,
                                       gmx_wallcycle_t     wcycle)
 {
-    pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, useGpuForceReduction);
+    pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, stepWork.useGpuPmeFReduction);
     pme_gpu_launch_spread(pmedata, wcycle);
 }
 
@@ -795,11 +793,16 @@ setupDomainLifetimeWorkload(const t_inputrec &inputrec,
  *
  * \param[in] legacyFlags   Force bitmask flags used to construct the new flags
  * \param[in] isNonbondedOn Global override, if false forces to turn off all nonbonded calculation.
+ * \param[in] simulationWork Simulation workload description.
+ * \param[in] rankHasPmeDuty If this rank computes PME.
+ *
  * \returns New StepWorkload description.
  */
 static StepWorkload
-setupStepWorkload(const int  legacyFlags,
-                  const bool isNonbondedOn)
+setupStepWorkload(const int                 legacyFlags,
+                  const bool                isNonbondedOn,
+                  const SimulationWorkload &simulationWork,
+                  const bool                rankHasPmeDuty)
 {
     StepWorkload flags;
     flags.stateChanged = ((legacyFlags & GMX_FORCE_STATECHANGED) != 0);
@@ -811,6 +814,17 @@ setupStepWorkload(const int legacyFlags,
     flags.computeListedForces    = ((legacyFlags & GMX_FORCE_LISTED) != 0);
     flags.computeNonbondedForces = ((legacyFlags & GMX_FORCE_NONBONDED) != 0) && isNonbondedOn;
     flags.computeDhdl            = ((legacyFlags & GMX_FORCE_DHDL) != 0);
+
+    if (simulationWork.useGpuBufferOps)
+    {
+        GMX_ASSERT(simulationWork.useGpuNonbonded, "Can only offload buffer ops if nonbonded computation is also offloaded");
+    }
+    flags.useGpuXBufferOps = simulationWork.useGpuBufferOps;
+    // on virial steps the CPU reduction path is taken
+    // TODO: remove flags.computeEnergy, ref #3128
+    flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !(flags.computeVirial || flags.computeEnergy);
+    flags.useGpuPmeFReduction = flags.useGpuFBufferOps && (simulationWork.usePmeGpu && rankHasPmeDuty);
+
     return flags;
 }
 
@@ -911,25 +925,22 @@ void do_force(FILE *fplog,
         legacyFlags &= ~GMX_FORCE_NONBONDED;
     }
 
-    runScheduleWork->stepWork = setupStepWorkload(legacyFlags, fr->bNonbonded);
-    const StepWorkload &stepWork = runScheduleWork->stepWork;
-    const SimulationWorkload &simulationWork = runScheduleWork->simulationWork;
-    const bool useGpuPmeOnThisRank = simulationWork.usePmeGpu && thisRankHasDuty(cr, DUTY_PME);
-    const int  pmeFlags            = makePmeFlags(stepWork);
+
+    runScheduleWork->stepWork = setupStepWorkload(legacyFlags, fr->bNonbonded,
+                                                  simulationWork, thisRankHasDuty(cr, DUTY_PME));
+    const StepWorkload &stepWork = runScheduleWork->stepWork;
+
+
+    const bool useGpuPmeOnThisRank = simulationWork.usePmeGpu && thisRankHasDuty(cr, DUTY_PME);
+    const int  pmeFlags            = makePmeFlags(stepWork);
 
     // Switches on whether to use GPU for position and force buffer operations
     // TODO consider all possible combinations of triggers, and how to combine optimally in each case.
-    const BufferOpsUseGpu useGpuXBufOps = (simulationWork.useGpuBufferOps &&
-                                           simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA)) ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;;
+    const BufferOpsUseGpu useGpuXBufOps = stepWork.useGpuXBufferOps ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
     // GPU Force buffer ops are disabled on virial steps, because the virial calc is not yet ported to GPU
-    const BufferOpsUseGpu useGpuFBufOps = ((simulationWork.useGpuBufferOps &&
-                                            simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA)) &&
-                                           !(stepWork.computeVirial || stepWork.computeEnergy)) ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
-    // TODO: move / add this flag to the internal PME GPU data structures
-    const bool useGpuPmeFReduction = (useGpuFBufOps == BufferOpsUseGpu::True) &&
-                                     useGpuPmeOnThisRank; // only supported if this rank is perfoming PME on the GPU
+    const BufferOpsUseGpu useGpuFBufOps = stepWork.useGpuFBufferOps ?
+                                          BufferOpsUseGpu::True : BufferOpsUseGpu::False;
 
     /* At a search step we need to start the first balancing region
      * somewhere early inside the step after communication during domain
@@ -1019,7 +1030,7 @@ void do_force(FILE *fplog,
 
     if (useGpuPmeOnThisRank)
     {
-        launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, useGpuPmeFReduction, wcycle);
+        launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, wcycle);
     }
 
     /* do gridding for pair search */
@@ -1533,7 +1544,7 @@ void do_force(FILE *fplog,
     }
     const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True);
-    const bool useCpuPmeFReduction      = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
+    const bool useCpuPmeFReduction      = thisRankHasDuty(cr, DUTY_PME) && !stepWork.useGpuPmeFReduction;
     // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
     const bool haveCpuLocalForces       = (domainWork.haveSpecialForces || domainWork.haveCpuListedForceWork || useCpuPmeFReduction ||
                                            (fr->efep != efepNO));
@@ -1636,7 +1647,7 @@ void do_force(FILE *fplog,
     {
         gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> dependencyList;
 
-        if (useGpuPmeFReduction)
+        if (stepWork.useGpuPmeFReduction)
         {
             dependencyList.push_back(pme_gpu_get_f_ready_synchronizer(fr->pmedata));
         }
@@ -1679,7 +1690,7 @@ void do_force(FILE *fplog,
                                                   stateGpu->getForces(),
                                                   pme_gpu_get_device_f(fr->pmedata),
                                                   dependencyList,
-                                                  useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
+                                                  stepWork.useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
             // This function call synchronizes the local stream
             nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
             stateGpu->copyForcesFromGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local);
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index fdc93ac07e..75dc6d6fea 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -194,14 +194,16 @@ struct DevelopmentFeatureFlags
  * Note that some development features overrides are applied already here:
  * the GPU communication flags are set to false in non-tMPI and non-CUDA builds.
  *
- * \param[in]  mdlog              Logger object.
- * \returns                       The object populated with development feature flags.
+ * \param[in]  mdlog              Logger object.
+ * \param[in]  useGpuForNonbonded True if the nonbonded task is offloaded in this run.
+ * \returns                       The object populated with development feature flags.
  */
-static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger &mdlog)
+static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger &mdlog,
+                                                         const bool           useGpuForNonbonded)
 {
     DevelopmentFeatureFlags devFlags;
 
-    devFlags.enableGpuBufferOps    = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
+    devFlags.enableGpuBufferOps    = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr) && (GMX_GPU == GMX_GPU_CUDA) && useGpuForNonbonded;
     devFlags.useGpuUpdateConstrain = (getenv("GMX_UPDATE_CONSTRAIN_GPU") != nullptr);
     devFlags.enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
     devFlags.enableGpuPmePPComm    = (getenv("GMX_GPU_PME_PP_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
@@ -683,9 +685,6 @@ int Mdrunner::mdrunner()
     gmx::LoggerOwner logOwner(buildLogger(fplog, isSimulationMasterRank));
     gmx::MDLogger    mdlog(logOwner.logger());
 
-    // report any development features that may be enabled by environment variables
-    const DevelopmentFeatureFlags devFlags = manageDevelopmentFeatures(mdlog);
-
     // TODO The thread-MPI master rank makes a working
     // PhysicalNodeCommunicator here, but it gets rebuilt by all ranks
     // after the threads have been launched. This works because no use
@@ -845,6 +844,10 @@
     }
     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 
+    // Initialize the development feature flags that are enabled by environment
+    // variables and report those features that are enabled.
+    const DevelopmentFeatureFlags devFlags = manageDevelopmentFeatures(mdlog, useGpuForNonbonded);
+
     // Build restraints.
     // TODO: hide restraint implementation details from Mdrunner.
     // There is nothing unique about restraints at this point as far as the
diff --git a/src/gromacs/mdtypes/simulation_workload.h b/src/gromacs/mdtypes/simulation_workload.h
index e20bcc9e7c..147199771c 100644
--- a/src/gromacs/mdtypes/simulation_workload.h
+++ b/src/gromacs/mdtypes/simulation_workload.h
@@ -77,6 +77,16 @@ class StepWorkload
     bool computeListedForces = false;
     //! Whether this step DHDL needs to be computed
     bool computeDhdl = false;
+    /*! \brief Whether coordinate buffer ops are done on the GPU this step
+     * \note This technically belongs to DomainLifetimeWorkload, but because
+     * the flag is needed before DomainLifetimeWorkload is built, we keep
+     * it here for now.
+     */
+    bool useGpuXBufferOps = false;
+    //! Whether force buffer ops are done on the GPU this step
+    bool useGpuFBufferOps = false;
+    //! Whether PME forces are reduced with other contributions on the GPU this step
+    bool useGpuPmeFReduction = false; // TODO: add this flag to the internal PME GPU data structures too
 };
 
 /*! \libinternal
-- 
2.22.0
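
For readers outside the GROMACS tree, the per-step logic that this patch centralizes in setupStepWorkload() can be read in isolation. Below is a minimal standalone sketch, not the GROMACS implementation: the two workload structs are reduced to the fields used here, GMX_ASSERT is replaced by assert(), and deriveBufferOpsFlags() is a hypothetical name standing in for the relevant part of setupStepWorkload().

// Minimal sketch of the flag derivation introduced in setupStepWorkload()
// above; GROMACS types are reduced to plain structs so this compiles alone.
#include <cassert>

struct SimulationWorkload
{
    bool useGpuNonbonded = false; // nonbonded task offloaded for the whole run
    bool useGpuBufferOps = false; // buffer ops offload requested and permitted
    bool usePmeGpu       = false; // PME runs on a GPU in this run
};

struct StepWorkload
{
    bool computeVirial       = false;
    bool computeEnergy       = false;
    bool useGpuXBufferOps    = false;
    bool useGpuFBufferOps    = false;
    bool useGpuPmeFReduction = false;
};

StepWorkload deriveBufferOpsFlags(const SimulationWorkload &simulationWork,
                                  bool computeVirial, bool computeEnergy,
                                  bool rankHasPmeDuty)
{
    // Buffer ops can only be offloaded together with the nonbonded task.
    assert(!simulationWork.useGpuBufferOps || simulationWork.useGpuNonbonded);

    StepWorkload flags;
    flags.computeVirial    = computeVirial;
    flags.computeEnergy    = computeEnergy;
    flags.useGpuXBufferOps = simulationWork.useGpuBufferOps;
    // On virial/energy steps the CPU force-reduction path is taken.
    flags.useGpuFBufferOps = simulationWork.useGpuBufferOps
                             && !(computeVirial || computeEnergy);
    // GPU PME force reduction additionally requires that this rank runs PME
    // on a GPU; otherwise the PME forces are reduced on the CPU.
    flags.useGpuPmeFReduction = flags.useGpuFBufferOps
                                && simulationWork.usePmeGpu && rankHasPmeDuty;
    return flags;
}

int main()
{
    SimulationWorkload simulationWork;
    simulationWork.useGpuNonbonded = true;
    simulationWork.useGpuBufferOps = true;
    simulationWork.usePmeGpu       = true;

    // A virial step: X buffer ops stay on the GPU, but the force reduction
    // (and with it the GPU PME force reduction) falls back to the CPU.
    const StepWorkload step = deriveBufferOpsFlags(simulationWork,
                                                   /*computeVirial=*/true,
                                                   /*computeEnergy=*/false,
                                                   /*rankHasPmeDuty=*/true);
    assert(step.useGpuXBufferOps);
    assert(!step.useGpuFBufferOps && !step.useGpuPmeFReduction);
    return 0;
}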
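
The runner.cpp side follows the same pattern: the GMX_USE_GPU_BUFFER_OPS environment variable alone no longer enables the feature. The sketch below mirrors that gating under stated assumptions: the two c_build* constants are stand-ins for the GMX_GPU == GMX_GPU_CUDA and GMX_THREAD_MPI build-time values, and the function signature is simplified from the one in the patch.

// Sketch of the relocated development-feature gating: the buffer-ops feature
// now also requires a CUDA build and nonbonded offload, which is why the
// initialization had to move to after task assignment in Mdrunner::mdrunner().
#include <cstdlib>

constexpr bool c_buildIsCuda      = true; // stand-in for GMX_GPU == GMX_GPU_CUDA
constexpr bool c_buildIsThreadMpi = true; // stand-in for GMX_THREAD_MPI

struct DevelopmentFeatureFlags
{
    bool enableGpuBufferOps    = false;
    bool enableGpuHaloExchange = false;
    bool enableGpuPmePPComm    = false;
};

DevelopmentFeatureFlags manageDevelopmentFeatures(bool useGpuForNonbonded)
{
    DevelopmentFeatureFlags devFlags;
    // The env var only takes effect when the build and the task-assignment
    // decision allow the offload at all.
    devFlags.enableGpuBufferOps = (std::getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr)
                                  && c_buildIsCuda && useGpuForNonbonded;
    devFlags.enableGpuHaloExchange = (std::getenv("GMX_GPU_DD_COMMS") != nullptr)
                                     && c_buildIsThreadMpi && c_buildIsCuda;
    devFlags.enableGpuPmePPComm = (std::getenv("GMX_GPU_PME_PP_COMMS") != nullptr)
                                  && c_buildIsThreadMpi && c_buildIsCuda;
    return devFlags;
}

int main()
{
    // Without nonbonded offload the env var is ignored entirely.
    const DevelopmentFeatureFlags devFlags =
            manageDevelopmentFeatures(/*useGpuForNonbonded=*/false);
    return devFlags.enableGpuBufferOps ? 1 : 0;
}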