From caf637d92d107917a0d985bfaf70f358d0d6dd73 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Wed, 16 Oct 2019 13:33:18 +0200 Subject: [PATCH] Remove excessive H2D and D2H copies of velocities when update is offloaded If update is offloaded: The H2D copy of velocities is done: 1. At the search step after the device buffers were reinitialized. The D2H copy is done: 1. In the beginning of the step on search steps (to back up before the device buffers are reinitialized). 2. In the beginning of the velocity output step. 3. After update when globals are computed. 4. After update when temperature is needed for the next step. The Local locality is used for the copies when update is offloaded in anticipation of the multi GPU case. The REMD simulations are now not supported when update is offloaded. Change-Id: Ifbb9636cafba8980a4a781d942420c5c2c1bcdfd --- src/gromacs/mdrun/md.cpp | 54 +++++++++++++++---- src/gromacs/mdrun/runner.cpp | 3 +- src/gromacs/taskassignment/decidegpuusage.cpp | 7 ++- src/gromacs/taskassignment/decidegpuusage.h | 4 +- 4 files changed, 54 insertions(+), 14 deletions(-) diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index 014cb78ab4..b30edb8473 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp @@ -327,8 +327,11 @@ void gmx::LegacySimulator::do_md() const bool useGpuForPme = simulationWork.usePmeGpu; const bool useGpuForNonbonded = simulationWork.useGpuNonbonded; // Temporary solution to make sure that the buffer ops are offloaded when update is offloaded - const bool useGpuForBufferOps = simulationWork.useGpuBufferOps; - const bool useGpuForUpdate = simulationWork.useGpuUpdate; + const bool useGpuForBufferOps = simulationWork.useGpuBufferOps; + const bool useGpuForUpdate = simulationWork.useGpuUpdate; + + + StatePropagatorDataGpu *stateGpu = fr->stateGpu; if (useGpuForUpdate) { @@ -355,7 +358,7 @@ void gmx::LegacySimulator::do_md() GMX_LOG(mdlog.info).asParagraph(). appendText("Updating coordinates on the GPU."); } - integrator = std::make_unique(*ir, *top_global, fr->stateGpu->getUpdateStream(), fr->stateGpu->xUpdatedOnDevice()); + integrator = std::make_unique(*ir, *top_global, stateGpu->getUpdateStream(), stateGpu->xUpdatedOnDevice()); } if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate) @@ -792,6 +795,19 @@ void gmx::LegacySimulator::do_md() do_verbose = mdrunOptions.verbose && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep); + // Copy velocities from the GPU when needed: + // - On search steps to keep copy on host (device buffers are reinitialized). + // - When needed for the output. + if (useGpuForUpdate && !bFirstStep) + { + if (bNS || do_per_step(step, ir->nstvout)) + { + stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local); + stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local); + } + } + + if (bNS && !(bFirstStep && ir->bContinuation)) { bMasterState = FALSE; @@ -1219,9 +1235,15 @@ void gmx::LegacySimulator::do_md() std::copy(state->x.begin(), state->x.end(), cbuf.begin()); } + /* With leap-frog type integrators we compute the kinetic energy + * at a whole time step as the average of the half-time step kinetic + * energies of two subsequent steps. Therefore we need to compute the + * half step kinetic energy also if we need energies at the next step. + */ + const bool needHalfStepKineticEnergy = (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)); + if (useGpuForUpdate) { - StatePropagatorDataGpu *stateGpu = fr->stateGpu; if (bNS) { integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(), stateGpu->getForces(), @@ -1229,25 +1251,35 @@ void gmx::LegacySimulator::do_md() t_pbc pbc; set_pbc(&pbc, epbcXYZ, state->box); integrator->setPbc(&pbc); + + // Copy data to the GPU after buffers might have being reinitialized + stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local); } stateGpu->copyCoordinatesToGpu(ArrayRef(state->x), StatePropagatorDataGpu::AtomLocality::All); - stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::All); stateGpu->copyForcesToGpu(ArrayRef(f), StatePropagatorDataGpu::AtomLocality::All); - bool doTempCouple = (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple)); - bool doPressureCouple = (ir->epc == epcPARRINELLORAHMAN && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple)); - // TODO: Use StepWorkload fields. bool useGpuFBufferOps = simulationWork.useGpuBufferOps && !(bCalcVir || bCalcEner); + bool doTempCouple = (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple)); + bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple)); + // This applies Leap-Frog, LINCS and SETTLE in succession integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(StatePropagatorDataGpu::AtomLocality::Local, useGpuFBufferOps), ir->delta_t, true, bCalcVir, shake_vir, doTempCouple, ekind->tcstat, - doPressureCouple, ir->nstpcouple*ir->delta_t, M); + doParrinelloRahman, ir->nstpcouple*ir->delta_t, M); stateGpu->copyCoordinatesFromGpu(ArrayRef(state->x), StatePropagatorDataGpu::AtomLocality::All); - stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::All); + + // Copy velocities D2H after update if: + // - Globals are computed this step (includes the energy output steps). + // - Temperature is needed for the next step. + if (bGStat || needHalfStepKineticEnergy) + { + stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local); + stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local); + } // TODO: replace with stateGpu->waitForCopyCoordinatesFromGpu(...) integrator->waitCoordinatesReadyOnDevice(); @@ -1357,7 +1389,7 @@ void gmx::LegacySimulator::do_md() // and when algorithms require it. bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm)); - if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)) || doInterSimSignal) + if (bGStat || needHalfStepKineticEnergy || doInterSimSignal) { // Since we're already communicating at this step, we // can propagate intra-simulation signals. Note that diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index 719d300d07..843f401c28 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -1547,7 +1547,8 @@ int Mdrunner::mdrunner() *mdAtoms, doEssentialDynamics, fcd->orires.nr != 0, - fcd->disres.nsystems != 0); + fcd->disres.nsystems != 0, + replExParams.exchangeInterval > 0); const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible( false, diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp index 67063ff858..fbf0df5c32 100644 --- a/src/gromacs/taskassignment/decidegpuusage.cpp +++ b/src/gromacs/taskassignment/decidegpuusage.cpp @@ -501,7 +501,8 @@ bool decideWhetherToUseGpuForUpdate(bool isDomainDecomposition, const MDAtoms &mdatoms, bool useEssentialDynamics, bool doOrientationRestraints, - bool doDistanceRestraints) + bool doDistanceRestraints, + bool useReplicaExchange) { if (updateTarget == TaskTarget::Cpu) { @@ -565,6 +566,10 @@ bool decideWhetherToUseGpuForUpdate(bool isDomainDecomposition, { errorMessage += "Free energy perturbations are not supported.\n"; } + if (useReplicaExchange) + { + errorMessage += "Replica exchange simulations are not supported.\n"; + } if (!errorMessage.empty()) { if (updateTarget == TaskTarget::Gpu) diff --git a/src/gromacs/taskassignment/decidegpuusage.h b/src/gromacs/taskassignment/decidegpuusage.h index 48ecf2e88f..2afa571b20 100644 --- a/src/gromacs/taskassignment/decidegpuusage.h +++ b/src/gromacs/taskassignment/decidegpuusage.h @@ -242,6 +242,7 @@ bool decideWhetherToUseGpusForBonded(bool useGpuForNonbonded, * \param[in] useEssentialDynamics If essential dynamics is active. * \param[in] doOrientationRestraints If orientation restraints are enabled. * \param[in] doDistanceRestraints If distance restraints are enabled. + * \param[in] useReplicaExchange If this is a REMD simulation. * * \returns Whether complete simulation can be run on GPU. * \throws std::bad_alloc If out of memory @@ -257,7 +258,8 @@ bool decideWhetherToUseGpuForUpdate(bool isDomainDecomposition, const MDAtoms &mdatoms, bool useEssentialDynamics, bool doOrientationRestraints, - bool doDistanceRestraints); + bool doDistanceRestraints, + bool useReplicaExchange); } // namespace gmx -- 2.22.0