From: Szilárd Páll Date: Fri, 5 Nov 2021 15:59:01 +0000 (+0000) Subject: Split simulationWork.useGpuBufferOps into separate x and f flags X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?p=alexxy%2Fgromacs.git;a=commitdiff_plain;h=8e4184a2387fb1b36361b07c9c3df40076d0250d Split simulationWork.useGpuBufferOps into separate x and f flags The x two "buffer ops" tasks have entirely different roles and it has been a potential source of confusion to refer to these with a single workload flag. Refs #3915 --- diff --git a/src/gromacs/gpu_utils/device_stream_manager.cpp b/src/gromacs/gpu_utils/device_stream_manager.cpp index c732156942..910ccdd0e2 100644 --- a/src/gromacs/gpu_utils/device_stream_manager.cpp +++ b/src/gromacs/gpu_utils/device_stream_manager.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019,2020, by the GROMACS development team, led by + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -111,7 +111,7 @@ DeviceStreamManager::Impl::Impl(const DeviceInformation& deviceInfo, std::make_unique(context_, DeviceStreamPriority::High, useTiming); } // Update stream is used both for coordinates transfers and for GPU update/constraints - if (simulationWork.useGpuPme || simulationWork.useGpuUpdate || simulationWork.useGpuBufferOps) + if (simulationWork.useGpuPme || simulationWork.useGpuUpdate || simulationWork.useGpuXBufferOps) { streams_[DeviceStreamType::UpdateAndConstraints] = std::make_unique(context_, DeviceStreamPriority::Normal, useTiming); diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 89da40d77c..14818a0c4b 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -981,14 +981,14 @@ static StepWorkload setupStepWorkload(const int legacyFlags, && !(simulationWork.computeNonbondedAtMtsLevel1 && !computeSlowForces); flags.computeDhdl = ((legacyFlags & GMX_FORCE_DHDL) != 0); - if (simulationWork.useGpuBufferOps) + if (simulationWork.useGpuXBufferOps || simulationWork.useGpuFBufferOps) { GMX_ASSERT(simulationWork.useGpuNonbonded, "Can only offload buffer ops if nonbonded computation is also offloaded"); } - flags.useGpuXBufferOps = simulationWork.useGpuBufferOps && !flags.doNeighborSearch; + flags.useGpuXBufferOps = simulationWork.useGpuXBufferOps && !flags.doNeighborSearch; // on virial steps the CPU reduction path is taken - flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !flags.computeVirial; + flags.useGpuFBufferOps = simulationWork.useGpuFBufferOps && !flags.computeVirial; const bool rankHasGpuPmeTask = simulationWork.useGpuPme && !simulationWork.haveSeparatePmeRank; flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps && (rankHasGpuPmeTask || simulationWork.useGpuPmePpCommunication); @@ -1339,7 +1339,7 @@ void do_force(FILE* fplog, const bool reinitGpuPmePpComms = simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch); - auto* localXReadyOnDevice = (stepWork.haveGpuPmeOnThisRank || simulationWork.useGpuBufferOps) + auto* localXReadyOnDevice = (stepWork.haveGpuPmeOnThisRank || simulationWork.useGpuXBufferOps) ? stateGpu->getCoordinatesReadyOnDeviceEvent( AtomLocality::Local, simulationWork, stepWork) : nullptr; @@ -1364,7 +1364,8 @@ void do_force(FILE* fplog, haveCopiedXFromGpu = true; } - if (stepWork.doNeighborSearch && ((stepWork.haveGpuPmeOnThisRank || simulationWork.useGpuBufferOps))) + if (stepWork.doNeighborSearch + && (stepWork.haveGpuPmeOnThisRank || simulationWork.useGpuXBufferOps || simulationWork.useGpuFBufferOps)) { // TODO refactor this to do_md, after partitioning. stateGpu->reinit(mdatoms->homenr, @@ -1530,12 +1531,12 @@ void do_force(FILE* fplog, wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchLocal); wallcycle_stop(wcycle, WallCycleCounter::NS); - if (simulationWork.useGpuBufferOps) + if (simulationWork.useGpuXBufferOps) { nbv->atomdata_init_copy_x_to_nbat_x_gpu(); } - if (simulationWork.useGpuBufferOps) + if (simulationWork.useGpuFBufferOps) { setupLocalGpuForceReduction(runScheduleWork, fr->nbv.get(), diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index 735b5daa06..d982767751 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp @@ -338,7 +338,6 @@ void gmx::LegacySimulator::do_md() const auto& simulationWork = runScheduleWork->simulationWork; const bool useGpuForPme = simulationWork.useGpuPme; const bool useGpuForNonbonded = simulationWork.useGpuNonbonded; - const bool useGpuForBufferOps = simulationWork.useGpuBufferOps; const bool useGpuForUpdate = simulationWork.useGpuUpdate; /* Check for polarizable models and flexible constraints */ @@ -360,9 +359,8 @@ void gmx::LegacySimulator::do_md() ObservablesReducer observablesReducer = observablesReducerBuilder->build(); ForceBuffers f(simulationWork.useMts, - ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate) - ? PinningPolicy::PinnedIfSupported - : PinningPolicy::CannotBePinned); + (simulationWork.useGpuFBufferOps || useGpuForUpdate) ? PinningPolicy::PinnedIfSupported + : PinningPolicy::CannotBePinned); const t_mdatoms* md = mdAtoms->mdatoms(); if (haveDDAtomOrdering(*cr)) { @@ -431,7 +429,7 @@ void gmx::LegacySimulator::do_md() GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr || constr->numConstraintsTotal() == 0, "SHAKE is not supported with GPU update."); - GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps), + GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuXBufferOps), "Either PME or short-ranged non-bonded interaction tasks must run on " "the GPU to use GPU update.\n"); GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD, @@ -488,7 +486,7 @@ void gmx::LegacySimulator::do_md() integrator->setPbc(PbcType::Xyz, state->box); } - if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate) + if (useGpuForPme || simulationWork.useGpuXBufferOps || useGpuForUpdate) { changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported); } diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index 8fc6d89a72..6e245a2df9 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -1996,7 +1996,7 @@ int Mdrunner::mdrunner() makeBondedLinks(cr->dd, mtop, fr->atomInfoForEachMoleculeBlock); } - if (runScheduleWork.simulationWork.useGpuBufferOps) + if (runScheduleWork.simulationWork.useGpuFBufferOps) { fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique( deviceStreamManager->context(), @@ -2011,7 +2011,7 @@ int Mdrunner::mdrunner() std::unique_ptr stateGpu; if (gpusWereDetected && ((runScheduleWork.simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME)) - || runScheduleWork.simulationWork.useGpuBufferOps)) + || runScheduleWork.simulationWork.useGpuXBufferOps)) { GpuApiCallBehavior transferKind = (inputrec->eI == IntegrationAlgorithm::MD && !doRerun && !useModularSimulator) diff --git a/src/gromacs/mdtypes/simulation_workload.h b/src/gromacs/mdtypes/simulation_workload.h index 1a2106e2b2..d871be32de 100644 --- a/src/gromacs/mdtypes/simulation_workload.h +++ b/src/gromacs/mdtypes/simulation_workload.h @@ -179,8 +179,10 @@ public: bool useGpuBonded = false; //! If update and constraint solving is performed on GPU. bool useGpuUpdate = false; - //! If buffer operations are performed on GPU. - bool useGpuBufferOps = false; + //! If X buffer operations are performed on GPU. + bool useGpuXBufferOps = false; + //! If F buffer operations are performed on GPU. + bool useGpuFBufferOps = false; //! If PP domain decomposition is active. bool havePpDomainDecomposition = false; //! If domain decomposition halo exchange is performed on CPU (in CPU-only runs or with staged GPU communication). diff --git a/src/gromacs/taskassignment/decidesimulationworkload.cpp b/src/gromacs/taskassignment/decidesimulationworkload.cpp index fb383e3a25..a6a9804624 100644 --- a/src/gromacs/taskassignment/decidesimulationworkload.cpp +++ b/src/gromacs/taskassignment/decidesimulationworkload.cpp @@ -76,8 +76,15 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec, simulationWorkload.useGpuPmeFft = (pmeRunMode == PmeRunMode::Mixed); simulationWorkload.useGpuBonded = useGpuForBonded; simulationWorkload.useGpuUpdate = useGpuForUpdate; - simulationWorkload.useGpuBufferOps = + simulationWorkload.useGpuXBufferOps = (devFlags.enableGpuBufferOps || useGpuForUpdate) && !inputrec.useMts; + simulationWorkload.useGpuFBufferOps = + (devFlags.enableGpuBufferOps || useGpuForUpdate) && !inputrec.useMts; + if (simulationWorkload.useGpuXBufferOps || simulationWorkload.useGpuFBufferOps) + { + GMX_ASSERT(simulationWorkload.useGpuNonbonded, + "Can only offload X/F buffer ops if nonbonded computation is also offloaded"); + } simulationWorkload.havePpDomainDecomposition = havePpDomainDecomposition; simulationWorkload.useCpuHaloExchange = havePpDomainDecomposition && !useGpuDirectHalo; simulationWorkload.useGpuHaloExchange = useGpuDirectHalo;