From caf637d92d107917a0d985bfaf70f358d0d6dd73 Mon Sep 17 00:00:00 2001
From: Artem Zhmurov <zhmurov@gmail.com>
Date: Wed, 16 Oct 2019 13:33:18 +0200
Subject: [PATCH] Remove excessive H2D and D2H copies of velocities when update
 is offloaded

If update is offloaded:

The H2D copy of velocities is done:
1. At the search step after the device buffers were reinitialized.

The D2H copy is done:
1. In the beginning of the step on search steps (to back up before the
   device buffers are reinitialized).
2. In the beginning of the velocity output step.
3. After update when globals are computed.
4. After update when temperature is needed for the next step.

The Local locality is used for the copies when update is offloaded in
anticipation of the multi GPU case.

The REMD simulations are now not supported when update is offloaded.

Change-Id: Ifbb9636cafba8980a4a781d942420c5c2c1bcdfd
---
 src/gromacs/mdrun/md.cpp                      | 54 +++++++++++++++----
 src/gromacs/mdrun/runner.cpp                  |  3 +-
 src/gromacs/taskassignment/decidegpuusage.cpp |  7 ++-
 src/gromacs/taskassignment/decidegpuusage.h   |  4 +-
 4 files changed, 54 insertions(+), 14 deletions(-)

diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp
index 014cb78ab4..b30edb8473 100644
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -327,8 +327,11 @@ void gmx::LegacySimulator::do_md()
     const bool  useGpuForPme       = simulationWork.usePmeGpu;
     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
     // Temporary solution to make sure that the buffer ops are offloaded when update is offloaded
-    const bool  useGpuForBufferOps   = simulationWork.useGpuBufferOps;
-    const bool  useGpuForUpdate      = simulationWork.useGpuUpdate;
+    const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
+    const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
+
+
+    StatePropagatorDataGpu *stateGpu = fr->stateGpu;
 
     if (useGpuForUpdate)
     {
@@ -355,7 +358,7 @@ void gmx::LegacySimulator::do_md()
             GMX_LOG(mdlog.info).asParagraph().
                 appendText("Updating coordinates on the GPU.");
         }
-        integrator = std::make_unique<UpdateConstrainCuda>(*ir, *top_global, fr->stateGpu->getUpdateStream(), fr->stateGpu->xUpdatedOnDevice());
+        integrator = std::make_unique<UpdateConstrainCuda>(*ir, *top_global, stateGpu->getUpdateStream(), stateGpu->xUpdatedOnDevice());
     }
 
     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
@@ -792,6 +795,19 @@ void gmx::LegacySimulator::do_md()
         do_verbose = mdrunOptions.verbose &&
             (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
 
+        // Copy velocities from the GPU when needed:
+        // - On search steps to keep copy on host (device buffers are reinitialized).
+        // - When needed for the output.
+        if (useGpuForUpdate && !bFirstStep)
+        {
+            if (bNS || do_per_step(step, ir->nstvout))
+            {
+                stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
+                stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
+            }
+        }
+
+
         if (bNS && !(bFirstStep && ir->bContinuation))
         {
             bMasterState = FALSE;
@@ -1219,9 +1235,15 @@ void gmx::LegacySimulator::do_md()
             std::copy(state->x.begin(), state->x.end(), cbuf.begin());
         }
 
+        /* With leap-frog type integrators we compute the kinetic energy
+         * at a whole time step as the average of the half-time step kinetic
+         * energies of two subsequent steps. Therefore we need to compute the
+         * half step kinetic energy also if we need energies at the next step.
+         */
+        const bool needHalfStepKineticEnergy = (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm));
+
         if (useGpuForUpdate)
         {
-            StatePropagatorDataGpu *stateGpu = fr->stateGpu;
             if (bNS)
             {
                 integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(), stateGpu->getForces(),
@@ -1229,25 +1251,35 @@ void gmx::LegacySimulator::do_md()
                 t_pbc pbc;
                 set_pbc(&pbc, epbcXYZ, state->box);
                 integrator->setPbc(&pbc);
+
+                // Copy data to the GPU after buffers might have being reinitialized
+                stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
             }
 
             stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
-            stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::All);
             stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), StatePropagatorDataGpu::AtomLocality::All);
 
-            bool doTempCouple     = (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
-            bool doPressureCouple = (ir->epc == epcPARRINELLORAHMAN && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
-
             // TODO: Use StepWorkload fields.
             bool useGpuFBufferOps = simulationWork.useGpuBufferOps && !(bCalcVir || bCalcEner);
 
+            bool doTempCouple       = (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
+            bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
+
             // This applies Leap-Frog, LINCS and SETTLE in succession
             integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(StatePropagatorDataGpu::AtomLocality::Local, useGpuFBufferOps),
                                   ir->delta_t, true, bCalcVir, shake_vir,
                                   doTempCouple, ekind->tcstat,
-                                  doPressureCouple, ir->nstpcouple*ir->delta_t, M);
+                                  doParrinelloRahman, ir->nstpcouple*ir->delta_t, M);
             stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
-            stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::All);
+
+            // Copy velocities D2H after update if:
+            // - Globals are computed this step (includes the energy output steps).
+            // - Temperature is needed for the next step.
+            if (bGStat || needHalfStepKineticEnergy)
+            {
+                stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
+                stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
+            }
 
             // TODO: replace with stateGpu->waitForCopyCoordinatesFromGpu(...)
             integrator->waitCoordinatesReadyOnDevice();
@@ -1357,7 +1389,7 @@ void gmx::LegacySimulator::do_md()
             // and when algorithms require it.
             bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
 
-            if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)) || doInterSimSignal)
+            if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
             {
                 // Since we're already communicating at this step, we
                 // can propagate intra-simulation signals. Note that
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index 719d300d07..843f401c28 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -1547,7 +1547,8 @@ int Mdrunner::mdrunner()
                                                          *mdAtoms,
                                                          doEssentialDynamics,
                                                          fcd->orires.nr != 0,
-                                                         fcd->disres.nsystems != 0);
+                                                         fcd->disres.nsystems != 0,
+                                                         replExParams.exchangeInterval > 0);
 
         const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible(
                     false,
diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp
index 67063ff858..fbf0df5c32 100644
--- a/src/gromacs/taskassignment/decidegpuusage.cpp
+++ b/src/gromacs/taskassignment/decidegpuusage.cpp
@@ -501,7 +501,8 @@ bool decideWhetherToUseGpuForUpdate(bool              isDomainDecomposition,
                                     const MDAtoms    &mdatoms,
                                     bool              useEssentialDynamics,
                                     bool              doOrientationRestraints,
-                                    bool              doDistanceRestraints)
+                                    bool              doDistanceRestraints,
+                                    bool              useReplicaExchange)
 {
     if (updateTarget == TaskTarget::Cpu)
     {
@@ -565,6 +566,10 @@ bool decideWhetherToUseGpuForUpdate(bool              isDomainDecomposition,
     {
         errorMessage += "Free energy perturbations are not supported.\n";
     }
+    if (useReplicaExchange)
+    {
+        errorMessage += "Replica exchange simulations are not supported.\n";
+    }
     if (!errorMessage.empty())
     {
         if (updateTarget == TaskTarget::Gpu)
diff --git a/src/gromacs/taskassignment/decidegpuusage.h b/src/gromacs/taskassignment/decidegpuusage.h
index 48ecf2e88f..2afa571b20 100644
--- a/src/gromacs/taskassignment/decidegpuusage.h
+++ b/src/gromacs/taskassignment/decidegpuusage.h
@@ -242,6 +242,7 @@ bool decideWhetherToUseGpusForBonded(bool       useGpuForNonbonded,
  * \param[in]  useEssentialDynamics      If essential dynamics is active.
  * \param[in]  doOrientationRestraints   If orientation restraints are enabled.
  * \param[in]  doDistanceRestraints      If distance restraints are enabled.
+ * \param[in]  useReplicaExchange        If this is a REMD simulation.
  *
  * \returns    Whether complete simulation can be run on GPU.
  * \throws     std::bad_alloc            If out of memory
@@ -257,7 +258,8 @@ bool decideWhetherToUseGpuForUpdate(bool              isDomainDecomposition,
                                     const MDAtoms    &mdatoms,
                                     bool              useEssentialDynamics,
                                     bool              doOrientationRestraints,
-                                    bool              doDistanceRestraints);
+                                    bool              doDistanceRestraints,
+                                    bool              useReplicaExchange);
 
 
 }  // namespace gmx
-- 
2.22.0