Remove excessive H2D and D2H copies of velocities when update is offloaded
authorArtem Zhmurov <zhmurov@gmail.com>
Wed, 16 Oct 2019 11:33:18 +0000 (13:33 +0200)
committerBerk Hess <hess@kth.se>
Fri, 18 Oct 2019 09:14:08 +0000 (11:14 +0200)
If update is offloaded:

The H2D copy of velocities is done:
1. At the search step after the device buffers were reinitialized.

The D2H copy is done:
1. In the beginning of the step on search steps (to back up before the
   device buffers are reinitialized).
2. In the beginning of the velocity output step.
3. After update when globals are computed.
4. After update when temperature is needed for the next step.

The Local locality is used for the copies when update is offloaded in
anticipation of the multi GPU case.

The REMD simulations are now not supported when update is offloaded.

Change-Id: Ifbb9636cafba8980a4a781d942420c5c2c1bcdfd

src/gromacs/mdrun/md.cpp
src/gromacs/mdrun/runner.cpp
src/gromacs/taskassignment/decidegpuusage.cpp
src/gromacs/taskassignment/decidegpuusage.h

index 014cb78ab4ec0ae29b08a44ad05cbf15125abf29..b30edb8473766ee18524e6a8d2454df817c1b289 100644 (file)
@@ -327,8 +327,11 @@ void gmx::LegacySimulator::do_md()
     const bool  useGpuForPme       = simulationWork.usePmeGpu;
     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
     // Temporary solution to make sure that the buffer ops are offloaded when update is offloaded
-    const bool  useGpuForBufferOps   = simulationWork.useGpuBufferOps;
-    const bool  useGpuForUpdate      = simulationWork.useGpuUpdate;
+    const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
+    const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
+
+
+    StatePropagatorDataGpu *stateGpu = fr->stateGpu;
 
     if (useGpuForUpdate)
     {
@@ -355,7 +358,7 @@ void gmx::LegacySimulator::do_md()
             GMX_LOG(mdlog.info).asParagraph().
                 appendText("Updating coordinates on the GPU.");
         }
-        integrator = std::make_unique<UpdateConstrainCuda>(*ir, *top_global, fr->stateGpu->getUpdateStream(), fr->stateGpu->xUpdatedOnDevice());
+        integrator = std::make_unique<UpdateConstrainCuda>(*ir, *top_global, stateGpu->getUpdateStream(), stateGpu->xUpdatedOnDevice());
     }
 
     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
@@ -792,6 +795,19 @@ void gmx::LegacySimulator::do_md()
         do_verbose = mdrunOptions.verbose &&
             (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
 
+        // Copy velocities from the GPU when needed:
+        // - On search steps to keep copy on host (device buffers are reinitialized).
+        // - When needed for the output.
+        if (useGpuForUpdate && !bFirstStep)
+        {
+            if (bNS || do_per_step(step, ir->nstvout))
+            {
+                stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
+                stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
+            }
+        }
+
+
         if (bNS && !(bFirstStep && ir->bContinuation))
         {
             bMasterState = FALSE;
@@ -1219,9 +1235,15 @@ void gmx::LegacySimulator::do_md()
             std::copy(state->x.begin(), state->x.end(), cbuf.begin());
         }
 
+        /* With leap-frog type integrators we compute the kinetic energy
+         * at a whole time step as the average of the half-time step kinetic
+         * energies of two subsequent steps. Therefore we need to compute the
+         * half step kinetic energy also if we need energies at the next step.
+         */
+        const bool needHalfStepKineticEnergy = (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm));
+
         if (useGpuForUpdate)
         {
-            StatePropagatorDataGpu *stateGpu = fr->stateGpu;
             if (bNS)
             {
                 integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(), stateGpu->getForces(),
@@ -1229,25 +1251,35 @@ void gmx::LegacySimulator::do_md()
                 t_pbc pbc;
                 set_pbc(&pbc, epbcXYZ, state->box);
                 integrator->setPbc(&pbc);
+
+                // Copy data to the GPU after buffers might have being reinitialized
+                stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
             }
 
             stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
-            stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::All);
             stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), StatePropagatorDataGpu::AtomLocality::All);
 
-            bool doTempCouple     = (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
-            bool doPressureCouple = (ir->epc == epcPARRINELLORAHMAN && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
-
             // TODO: Use StepWorkload fields.
             bool useGpuFBufferOps = simulationWork.useGpuBufferOps && !(bCalcVir || bCalcEner);
 
+            bool doTempCouple       = (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
+            bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
+
             // This applies Leap-Frog, LINCS and SETTLE in succession
             integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(StatePropagatorDataGpu::AtomLocality::Local, useGpuFBufferOps),
                                   ir->delta_t, true, bCalcVir, shake_vir,
                                   doTempCouple, ekind->tcstat,
-                                  doPressureCouple, ir->nstpcouple*ir->delta_t, M);
+                                  doParrinelloRahman, ir->nstpcouple*ir->delta_t, M);
             stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
-            stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::All);
+
+            // Copy velocities D2H after update if:
+            // - Globals are computed this step (includes the energy output steps).
+            // - Temperature is needed for the next step.
+            if (bGStat || needHalfStepKineticEnergy)
+            {
+                stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
+                stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
+            }
 
             // TODO: replace with stateGpu->waitForCopyCoordinatesFromGpu(...)
             integrator->waitCoordinatesReadyOnDevice();
@@ -1357,7 +1389,7 @@ void gmx::LegacySimulator::do_md()
             // and when algorithms require it.
             bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
 
-            if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)) || doInterSimSignal)
+            if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
             {
                 // Since we're already communicating at this step, we
                 // can propagate intra-simulation signals. Note that
index 719d300d073384807ed71a874e631cb55a76578e..843f401c282c37b165c4ef8e5cadf502519712e9 100644 (file)
@@ -1547,7 +1547,8 @@ int Mdrunner::mdrunner()
                                                          *mdAtoms,
                                                          doEssentialDynamics,
                                                          fcd->orires.nr != 0,
-                                                         fcd->disres.nsystems != 0);
+                                                         fcd->disres.nsystems != 0,
+                                                         replExParams.exchangeInterval > 0);
 
         const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible(
                     false,
index 67063ff85807e6e33489342d4d0c8351a0edb18c..fbf0df5c322a22b14a7d7260d71424cc449fcfcd 100644 (file)
@@ -501,7 +501,8 @@ bool decideWhetherToUseGpuForUpdate(bool              isDomainDecomposition,
                                     const MDAtoms    &mdatoms,
                                     bool              useEssentialDynamics,
                                     bool              doOrientationRestraints,
-                                    bool              doDistanceRestraints)
+                                    bool              doDistanceRestraints,
+                                    bool              useReplicaExchange)
 {
     if (updateTarget == TaskTarget::Cpu)
     {
@@ -565,6 +566,10 @@ bool decideWhetherToUseGpuForUpdate(bool              isDomainDecomposition,
     {
         errorMessage += "Free energy perturbations are not supported.\n";
     }
+    if (useReplicaExchange)
+    {
+        errorMessage += "Replica exchange simulations are not supported.\n";
+    }
     if (!errorMessage.empty())
     {
         if (updateTarget == TaskTarget::Gpu)
index 48ecf2e88f33102bd286809b9259a048589f8711..2afa571b2029d5ba045f938bd2754f11f5ce87b0 100644 (file)
@@ -242,6 +242,7 @@ bool decideWhetherToUseGpusForBonded(bool       useGpuForNonbonded,
  * \param[in]  useEssentialDynamics      If essential dynamics is active.
  * \param[in]  doOrientationRestraints   If orientation restraints are enabled.
  * \param[in]  doDistanceRestraints      If distance restraints are enabled.
+ * \param[in]  useReplicaExchange        If this is a REMD simulation.
  *
  * \returns    Whether complete simulation can be run on GPU.
  * \throws     std::bad_alloc            If out of memory
@@ -257,7 +258,8 @@ bool decideWhetherToUseGpuForUpdate(bool              isDomainDecomposition,
                                     const MDAtoms    &mdatoms,
                                     bool              useEssentialDynamics,
                                     bool              doOrientationRestraints,
-                                    bool              doDistanceRestraints);
+                                    bool              doDistanceRestraints,
+                                    bool              useReplicaExchange);
 
 
 }  // namespace gmx