Remove excessive H2D and D2H copies of velocities when update is offloaded

author Artem Zhmurov <zhmurov@gmail.com>

Wed, 16 Oct 2019 11:33:18 +0000 (13:33 +0200)

committer Berk Hess <hess@kth.se>

Fri, 18 Oct 2019 09:14:08 +0000 (11:14 +0200)
author Artem Zhmurov <zhmurov@gmail.com>
Wed, 16 Oct 2019 11:33:18 +0000 (13:33 +0200)
committer Berk Hess <hess@kth.se>
Fri, 18 Oct 2019 09:14:08 +0000 (11:14 +0200)
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp

index 014cb78ab4ec0ae29b08a44ad05cbf15125abf29..b30edb8473766ee18524e6a8d2454df817c1b289 100644 (file)
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -327,8 +327,11 @@ void gmx::LegacySimulator::do_md()
      const bool  useGpuForPme       = simulationWork.usePmeGpu;
      const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
      // Temporary solution to make sure that the buffer ops are offloaded when update is offloaded
-    const bool  useGpuForBufferOps   = simulationWork.useGpuBufferOps;
-    const bool  useGpuForUpdate      = simulationWork.useGpuUpdate;
+    const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
+    const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
+
+
+    StatePropagatorDataGpu *stateGpu = fr->stateGpu;
  
      if (useGpuForUpdate)
      {
@@ -355,7 +358,7 @@ void gmx::LegacySimulator::do_md()
              GMX_LOG(mdlog.info).asParagraph().
                  appendText("Updating coordinates on the GPU.");
          }
-        integrator = std::make_unique<UpdateConstrainCuda>(*ir, *top_global, fr->stateGpu->getUpdateStream(), fr->stateGpu->xUpdatedOnDevice());
+        integrator = std::make_unique<UpdateConstrainCuda>(*ir, *top_global, stateGpu->getUpdateStream(), stateGpu->xUpdatedOnDevice());
      }
  
      if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
@@ -792,6 +795,19 @@ void gmx::LegacySimulator::do_md()
          do_verbose = mdrunOptions.verbose &&
              (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
  
+        // Copy velocities from the GPU when needed:
+        // - On search steps to keep copy on host (device buffers are reinitialized).
+        // - When needed for the output.
+        if (useGpuForUpdate && !bFirstStep)
+        {
+            if (bNS || do_per_step(step, ir->nstvout))
+            {
+                stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
+                stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
+            }
+        }
+
+
          if (bNS && !(bFirstStep && ir->bContinuation))
          {
              bMasterState = FALSE;
@@ -1219,9 +1235,15 @@ void gmx::LegacySimulator::do_md()
              std::copy(state->x.begin(), state->x.end(), cbuf.begin());
          }
  
+        /* With leap-frog type integrators we compute the kinetic energy
+         * at a whole time step as the average of the half-time step kinetic
+         * energies of two subsequent steps. Therefore we need to compute the
+         * half step kinetic energy also if we need energies at the next step.
+         */
+        const bool needHalfStepKineticEnergy = (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm));
+
          if (useGpuForUpdate)
          {
-            StatePropagatorDataGpu *stateGpu = fr->stateGpu;
              if (bNS)
              {
                  integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(), stateGpu->getForces(),
@@ -1229,25 +1251,35 @@ void gmx::LegacySimulator::do_md()
                  t_pbc pbc;
                  set_pbc(&pbc, epbcXYZ, state->box);
                  integrator->setPbc(&pbc);
+
+                // Copy data to the GPU after buffers might have being reinitialized
+                stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
              }
  
              stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
-            stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::All);
              stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), StatePropagatorDataGpu::AtomLocality::All);
  
-            bool doTempCouple     = (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
-            bool doPressureCouple = (ir->epc == epcPARRINELLORAHMAN && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
-
              // TODO: Use StepWorkload fields.
              bool useGpuFBufferOps = simulationWork.useGpuBufferOps && !(bCalcVir || bCalcEner);
  
+            bool doTempCouple       = (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
+            bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
+
              // This applies Leap-Frog, LINCS and SETTLE in succession
              integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(StatePropagatorDataGpu::AtomLocality::Local, useGpuFBufferOps),
                                    ir->delta_t, true, bCalcVir, shake_vir,
                                    doTempCouple, ekind->tcstat,
-                                  doPressureCouple, ir->nstpcouple*ir->delta_t, M);
+                                  doParrinelloRahman, ir->nstpcouple*ir->delta_t, M);
              stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
-            stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::All);
+
+            // Copy velocities D2H after update if:
+            // - Globals are computed this step (includes the energy output steps).
+            // - Temperature is needed for the next step.
+            if (bGStat || needHalfStepKineticEnergy)
+            {
+                stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
+                stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
+            }
  
              // TODO: replace with stateGpu->waitForCopyCoordinatesFromGpu(...)
              integrator->waitCoordinatesReadyOnDevice();
@@ -1357,7 +1389,7 @@ void gmx::LegacySimulator::do_md()
              // and when algorithms require it.
              bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
  
-            if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)) || doInterSimSignal)
+            if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
              {
                  // Since we're already communicating at this step, we
                  // can propagate intra-simulation signals. Note that
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp

index 719d300d073384807ed71a874e631cb55a76578e..843f401c282c37b165c4ef8e5cadf502519712e9 100644 (file)
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -1547,7 +1547,8 @@ int Mdrunner::mdrunner()
                                                           *mdAtoms,
                                                           doEssentialDynamics,
                                                           fcd->orires.nr != 0,
-                                                         fcd->disres.nsystems != 0);
+                                                         fcd->disres.nsystems != 0,
+                                                         replExParams.exchangeInterval > 0);
  
          const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible(
                      false,
diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp

index 67063ff85807e6e33489342d4d0c8351a0edb18c..fbf0df5c322a22b14a7d7260d71424cc449fcfcd 100644 (file)
--- a/src/gromacs/taskassignment/decidegpuusage.cpp
+++ b/src/gromacs/taskassignment/decidegpuusage.cpp
@@ -501,7 +501,8 @@ bool decideWhetherToUseGpuForUpdate(bool              isDomainDecomposition,
                                      const MDAtoms    &mdatoms,
                                      bool              useEssentialDynamics,
                                      bool              doOrientationRestraints,
-                                    bool              doDistanceRestraints)
+                                    bool              doDistanceRestraints,
+                                    bool              useReplicaExchange)
  {
      if (updateTarget == TaskTarget::Cpu)
      {
@@ -565,6 +566,10 @@ bool decideWhetherToUseGpuForUpdate(bool              isDomainDecomposition,
      {
          errorMessage += "Free energy perturbations are not supported.\n";
      }
+    if (useReplicaExchange)
+    {
+        errorMessage += "Replica exchange simulations are not supported.\n";
+    }
      if (!errorMessage.empty())
      {
          if (updateTarget == TaskTarget::Gpu)
diff --git a/src/gromacs/taskassignment/decidegpuusage.h b/src/gromacs/taskassignment/decidegpuusage.h

index 48ecf2e88f33102bd286809b9259a048589f8711..2afa571b2029d5ba045f938bd2754f11f5ce87b0 100644 (file)
--- a/src/gromacs/taskassignment/decidegpuusage.h
+++ b/src/gromacs/taskassignment/decidegpuusage.h
@@ -242,6 +242,7 @@ bool decideWhetherToUseGpusForBonded(bool       useGpuForNonbonded,
   * \param[in]  useEssentialDynamics      If essential dynamics is active.
   * \param[in]  doOrientationRestraints   If orientation restraints are enabled.
   * \param[in]  doDistanceRestraints      If distance restraints are enabled.
+ * \param[in]  useReplicaExchange        If this is a REMD simulation.
   *
   * \returns    Whether complete simulation can be run on GPU.
   * \throws     std::bad_alloc            If out of memory
@@ -257,7 +258,8 @@ bool decideWhetherToUseGpuForUpdate(bool              isDomainDecomposition,
                                      const MDAtoms    &mdatoms,
                                      bool              useEssentialDynamics,
                                      bool              doOrientationRestraints,
-                                    bool              doDistanceRestraints);
+                                    bool              doDistanceRestraints,
+                                    bool              useReplicaExchange);
  
  
  }  // namespace gmx
author	Artem Zhmurov <zhmurov@gmail.com>
	Wed, 16 Oct 2019 11:33:18 +0000 (13:33 +0200)
committer	Berk Hess <hess@kth.se>
	Fri, 18 Oct 2019 09:14:08 +0000 (11:14 +0200)
src/gromacs/mdrun/md.cpp		patch \| blob \| history
src/gromacs/mdrun/runner.cpp		patch \| blob \| history
src/gromacs/taskassignment/decidegpuusage.cpp		patch \| blob \| history
src/gromacs/taskassignment/decidegpuusage.h		patch \| blob \| history