The two "buffer ops" tasks have entirely different roles, and referring
to both with a single workload flag has been a potential source of
confusion.
Refs #3915
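
For reference, a minimal sketch (simplified; only the fields touched by
this change, with the rest of the struct elided) of the split flags and
the per-step decisions that now key off each one independently:

    // X buffer ops: convert coordinates into the nonbonded layout on the GPU.
    // F buffer ops: reduce forces back into the main force buffer on the GPU.
    struct SimulationWorkload
    {
        //! If X buffer operations are performed on GPU.
        bool useGpuXBufferOps = false;
        //! If F buffer operations are performed on GPU.
        bool useGpuFBufferOps = false;
    };

    // Per-step gating, as in the diff below: X ops are skipped on search
    // steps, F ops on virial steps (where the CPU reduction path is taken).
    flags.useGpuXBufferOps = simulationWork.useGpuXBufferOps && !flags.doNeighborSearch;
    flags.useGpuFBufferOps = simulationWork.useGpuFBufferOps && !flags.computeVirial;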
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
std::make_unique<DeviceStream>(context_, DeviceStreamPriority::High, useTiming);
}
// Update stream is used both for coordinates transfers and for GPU update/constraints
- if (simulationWork.useGpuPme || simulationWork.useGpuUpdate || simulationWork.useGpuBufferOps)
+ if (simulationWork.useGpuPme || simulationWork.useGpuUpdate || simulationWork.useGpuXBufferOps)
{
streams_[DeviceStreamType::UpdateAndConstraints] =
std::make_unique<DeviceStream>(context_, DeviceStreamPriority::Normal, useTiming);
&& !(simulationWork.computeNonbondedAtMtsLevel1 && !computeSlowForces);
flags.computeDhdl = ((legacyFlags & GMX_FORCE_DHDL) != 0);
- if (simulationWork.useGpuBufferOps)
+ if (simulationWork.useGpuXBufferOps || simulationWork.useGpuFBufferOps)
{
GMX_ASSERT(simulationWork.useGpuNonbonded,
"Can only offload buffer ops if nonbonded computation is also offloaded");
}
- flags.useGpuXBufferOps = simulationWork.useGpuBufferOps && !flags.doNeighborSearch;
+ flags.useGpuXBufferOps = simulationWork.useGpuXBufferOps && !flags.doNeighborSearch;
// on virial steps the CPU reduction path is taken
- flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !flags.computeVirial;
+ flags.useGpuFBufferOps = simulationWork.useGpuFBufferOps && !flags.computeVirial;
const bool rankHasGpuPmeTask = simulationWork.useGpuPme && !simulationWork.haveSeparatePmeRank;
flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps
&& (rankHasGpuPmeTask || simulationWork.useGpuPmePpCommunication);
const bool reinitGpuPmePpComms =
simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
- auto* localXReadyOnDevice = (stepWork.haveGpuPmeOnThisRank || simulationWork.useGpuBufferOps)
+ auto* localXReadyOnDevice = (stepWork.haveGpuPmeOnThisRank || simulationWork.useGpuXBufferOps)
? stateGpu->getCoordinatesReadyOnDeviceEvent(
AtomLocality::Local, simulationWork, stepWork)
: nullptr;
haveCopiedXFromGpu = true;
}
- if (stepWork.doNeighborSearch && ((stepWork.haveGpuPmeOnThisRank || simulationWork.useGpuBufferOps)))
+ if (stepWork.doNeighborSearch
+ && (stepWork.haveGpuPmeOnThisRank || simulationWork.useGpuXBufferOps || simulationWork.useGpuFBufferOps))
{
// TODO refactor this to do_md, after partitioning.
stateGpu->reinit(mdatoms->homenr,
wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchLocal);
wallcycle_stop(wcycle, WallCycleCounter::NS);
- if (simulationWork.useGpuBufferOps)
+ if (simulationWork.useGpuXBufferOps)
{
nbv->atomdata_init_copy_x_to_nbat_x_gpu();
}
- if (simulationWork.useGpuBufferOps)
+ if (simulationWork.useGpuFBufferOps)
{
setupLocalGpuForceReduction(runScheduleWork,
fr->nbv.get(),
const auto& simulationWork = runScheduleWork->simulationWork;
const bool useGpuForPme = simulationWork.useGpuPme;
const bool useGpuForNonbonded = simulationWork.useGpuNonbonded;
- const bool useGpuForBufferOps = simulationWork.useGpuBufferOps;
const bool useGpuForUpdate = simulationWork.useGpuUpdate;
/* Check for polarizable models and flexible constraints */
ObservablesReducer observablesReducer = observablesReducerBuilder->build();
ForceBuffers f(simulationWork.useMts,
- ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
- ? PinningPolicy::PinnedIfSupported
- : PinningPolicy::CannotBePinned);
+ (simulationWork.useGpuFBufferOps || useGpuForUpdate) ? PinningPolicy::PinnedIfSupported
+ : PinningPolicy::CannotBePinned);
const t_mdatoms* md = mdAtoms->mdatoms();
if (haveDDAtomOrdering(*cr))
{
GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
|| constr->numConstraintsTotal() == 0,
"SHAKE is not supported with GPU update.");
- GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
+ GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuXBufferOps),
"Either PME or short-ranged non-bonded interaction tasks must run on "
"the GPU to use GPU update.\n");
GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
"Either PME or short-ranged non-bonded interaction tasks must run on "
"the GPU to use GPU update.\n");
GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
integrator->setPbc(PbcType::Xyz, state->box);
}
- if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
+ if (useGpuForPme || simulationWork.useGpuXBufferOps || useGpuForUpdate)
{
changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
}
makeBondedLinks(cr->dd, mtop, fr->atomInfoForEachMoleculeBlock);
}
- if (runScheduleWork.simulationWork.useGpuBufferOps)
+ if (runScheduleWork.simulationWork.useGpuFBufferOps)
{
fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique<gmx::GpuForceReduction>(
deviceStreamManager->context(),
std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
if (gpusWereDetected
&& ((runScheduleWork.simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME))
- || runScheduleWork.simulationWork.useGpuBufferOps))
+ || runScheduleWork.simulationWork.useGpuXBufferOps))
{
GpuApiCallBehavior transferKind =
(inputrec->eI == IntegrationAlgorithm::MD && !doRerun && !useModularSimulator)
bool useGpuBonded = false;
//! If update and constraint solving is performed on GPU.
bool useGpuUpdate = false;
- //! If buffer operations are performed on GPU.
- bool useGpuBufferOps = false;
+ //! If X buffer operations are performed on GPU.
+ bool useGpuXBufferOps = false;
+ //! If F buffer operations are performed on GPU.
+ bool useGpuFBufferOps = false;
//! If PP domain decomposition is active.
bool havePpDomainDecomposition = false;
//! If domain decomposition halo exchange is performed on CPU (in CPU-only runs or with staged GPU communication).
simulationWorkload.useGpuPmeFft = (pmeRunMode == PmeRunMode::Mixed);
simulationWorkload.useGpuBonded = useGpuForBonded;
simulationWorkload.useGpuUpdate = useGpuForUpdate;
- simulationWorkload.useGpuBufferOps =
+ simulationWorkload.useGpuXBufferOps =
(devFlags.enableGpuBufferOps || useGpuForUpdate) && !inputrec.useMts;
+ simulationWorkload.useGpuFBufferOps =
+ (devFlags.enableGpuBufferOps || useGpuForUpdate) && !inputrec.useMts;
+ if (simulationWorkload.useGpuXBufferOps || simulationWorkload.useGpuFBufferOps)
+ {
+ GMX_ASSERT(simulationWorkload.useGpuNonbonded,
+ "Can only offload X/F buffer ops if nonbonded computation is also offloaded");
+ }
simulationWorkload.havePpDomainDecomposition = havePpDomainDecomposition;
simulationWorkload.useCpuHaloExchange = havePpDomainDecomposition && !useGpuDirectHalo;
simulationWorkload.useGpuHaloExchange = useGpuDirectHalo;