Split simulationWork.useGpuBufferOps into separate x and f flags
[alexxy/gromacs.git] / src/gromacs/mdrun/md.cpp
index 970bcd6781830a386e2e6e14424c7674b8bee7b9..d982767751b471ddaa13fb05500e19a8aa37202a 100644
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -338,7 +338,6 @@ void gmx::LegacySimulator::do_md()
     const auto& simulationWork     = runScheduleWork->simulationWork;
     const bool  useGpuForPme       = simulationWork.useGpuPme;
     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
-    const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
 
     /* Check for polarizable models and flexible constraints */
@@ -360,9 +359,8 @@ void gmx::LegacySimulator::do_md()
     ObservablesReducer observablesReducer = observablesReducerBuilder->build();
 
     ForceBuffers     f(simulationWork.useMts,
-                   ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
-                               ? PinningPolicy::PinnedIfSupported
-                               : PinningPolicy::CannotBePinned);
+                   (simulationWork.useGpuFBufferOps || useGpuForUpdate) ? PinningPolicy::PinnedIfSupported
+                                                                            : PinningPolicy::CannotBePinned);
     const t_mdatoms* md = mdAtoms->mdatoms();
     if (haveDDAtomOrdering(*cr))
     {
@@ -395,7 +393,9 @@ void gmx::LegacySimulator::do_md()
                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
                                              : gmx::ArrayRef<const unsigned short>(),
                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
-                                         : gmx::ArrayRef<const unsigned short>());
+                                         : gmx::ArrayRef<const unsigned short>(),
+                                 md->cACC ? gmx::arrayRefFromArray(md->cACC, md->nr)
+                                          : gmx::ArrayRef<const unsigned short>());
         fr->longRangeNonbondeds->updateAfterPartition(*md);
     }
     else
@@ -409,7 +409,9 @@ void gmx::LegacySimulator::do_md()
                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
                                              : gmx::ArrayRef<const unsigned short>(),
                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
-                                         : gmx::ArrayRef<const unsigned short>());
+                                         : gmx::ArrayRef<const unsigned short>(),
+                                 md->cACC ? gmx::arrayRefFromArray(md->cACC, md->nr)
+                                          : gmx::ArrayRef<const unsigned short>());
         fr->longRangeNonbondeds->updateAfterPartition(*md);
     }
 
@@ -427,7 +429,7 @@ void gmx::LegacySimulator::do_md()
         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
                                    || constr->numConstraintsTotal() == 0,
                            "SHAKE is not supported with GPU update.");
-        GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
+        GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuXBufferOps),
                            "Either PME or short-ranged non-bonded interaction tasks must run on "
                            "the GPU to use GPU update.\n");
         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
@@ -484,7 +486,7 @@ void gmx::LegacySimulator::do_md()
         integrator->setPbc(PbcType::Xyz, state->box);
     }
 
-    if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
+    if (useGpuForPme || simulationWork.useGpuXBufferOps || useGpuForUpdate)
     {
         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
     }
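
For readers tracking the flag split, here is a minimal, self-contained C++ sketch of the idea the hunks above implement: once useGpuBufferOps is split into x and f variants, the force-buffer pinning decision can depend only on the f flag, while the coordinate-buffer pinning and the GPU-update precondition depend only on the x flag. All names below (SimulationWorkloadSketch, PinningPolicySketch, the two helper functions) are invented for illustration and are not the GROMACS API.

// Minimal sketch (not GROMACS code): splitting one buffer-ops flag into
// x and f variants lets independent decisions consult only the flag they need.
#include <cassert>
#include <cstdio>

// Simplified stand-ins for the real workload / pinning-policy types.
struct SimulationWorkloadSketch
{
    bool useGpuPme        = false;
    bool useGpuNonbonded  = false;
    bool useGpuXBufferOps = false; // coordinate buffer ops on GPU
    bool useGpuFBufferOps = false; // force buffer ops on GPU
    bool useGpuUpdate     = false;
};

enum class PinningPolicySketch
{
    CannotBePinned,
    PinnedIfSupported
};

// Force buffers care about the f flag (and GPU update)...
PinningPolicySketch forceBufferPinning(const SimulationWorkloadSketch& w)
{
    return (w.useGpuFBufferOps || w.useGpuUpdate) ? PinningPolicySketch::PinnedIfSupported
                                                  : PinningPolicySketch::CannotBePinned;
}

// ...while coordinate buffers care about GPU PME, the x flag, and GPU update.
PinningPolicySketch coordinateBufferPinning(const SimulationWorkloadSketch& w)
{
    return (w.useGpuPme || w.useGpuXBufferOps || w.useGpuUpdate)
                   ? PinningPolicySketch::PinnedIfSupported
                   : PinningPolicySketch::CannotBePinned;
}

int main()
{
    SimulationWorkloadSketch w;
    w.useGpuNonbonded  = true;
    w.useGpuXBufferOps = true; // x ops on GPU, f ops still on CPU
    w.useGpuUpdate     = true;
    // GPU update requires PME or (nonbonded + x buffer ops) on the GPU,
    // mirroring the release assertion in the hunk above.
    assert(w.useGpuPme || (w.useGpuNonbonded && w.useGpuXBufferOps));
    assert(coordinateBufferPinning(w) == PinningPolicySketch::PinnedIfSupported);
    // Force buffers still end up pinned here, but only because of GPU update.
    std::printf("force buffer pinned: %d\n",
                forceBufferPinning(w) == PinningPolicySketch::PinnedIfSupported);
    return 0;
}
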
@@ -922,15 +924,15 @@ void gmx::LegacySimulator::do_md()
         do_verbose = mdrunOptions.verbose
                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
 
-        if (useGpuForUpdate && !bFirstStep && bNS)
+        // On search steps, when doing the update on the GPU, copy
+        // the coordinates and velocities to the host unless they are
+        // already there (i.e., on the first step and after replica
+        // exchange).
+        if (useGpuForUpdate && bNS && !bFirstStep && !bExchanged)
         {
-            // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
-            stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
-            // Copy coordinate from the GPU when needed at the search step.
-            // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
-            // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
+            stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
         }
 
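The reordering in the hunk above issues both device-to-host copies before waiting on either, so the two transfers can overlap instead of the coordinate copy being queued only after the velocity wait returns. A rough, GPU-free analogy in standard C++ (std::async stands in for the asynchronous copies; pretendCopy and the timings are made up for illustration):

// Sketch: launch both asynchronous operations first, then wait on both.
#include <chrono>
#include <cstdio>
#include <future>
#include <thread>

static void pretendCopy(const char* what)
{
    std::this_thread::sleep_for(std::chrono::milliseconds(50)); // fake transfer time
    std::printf("%s copied\n", what);
}

int main()
{
    const auto t0 = std::chrono::steady_clock::now();

    // Launch both "copies" before waiting on either, as the reordered code does.
    auto velocities  = std::async(std::launch::async, pretendCopy, "velocities");
    auto coordinates = std::async(std::launch::async, pretendCopy, "coordinates");

    velocities.get();  // ~ waitVelocitiesReadyOnHost
    coordinates.get(); // ~ waitCoordinatesReadyOnHost

    const auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
                            std::chrono::steady_clock::now() - t0)
                            .count();
    std::printf("both ready after ~%lld ms (overlapped rather than serialized)\n",
                static_cast<long long>(ms));
    return 0;
}
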
@@ -962,13 +964,15 @@ void gmx::LegacySimulator::do_md()
                 if (correct_box(fplog, step, state->box))
                 {
                     bMasterState = TRUE;
-                    // If update is offloaded, it should be informed about the box size change
-                    if (useGpuForUpdate)
-                    {
-                        integrator->setPbc(PbcType::Xyz, state->box);
-                    }
                 }
             }
+            // If update is offloaded, and the box was changed either
+            // above or in a replica exchange on the previous step,
+            // the GPU Update object should be informed
+            if (useGpuForUpdate && (bMasterState || bExchanged))
+            {
+                integrator->setPbc(PbcType::Xyz, state->box);
+            }
             if (haveDDAtomOrdering(*cr) && bMasterState)
             {
                 dd_collect_state(cr->dd, state, state_global);
@@ -1002,7 +1006,9 @@ void gmx::LegacySimulator::do_md()
                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
                                                      : gmx::ArrayRef<const unsigned short>(),
                                          md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
-                                                 : gmx::ArrayRef<const unsigned short>());
+                                                 : gmx::ArrayRef<const unsigned short>(),
+                                         md->cACC ? gmx::arrayRefFromArray(md->cACC, md->nr)
+                                                  : gmx::ArrayRef<const unsigned short>());
                 fr->longRangeNonbondeds->updateAfterPartition(*md);
             }
         }
@@ -1489,7 +1495,8 @@ void gmx::LegacySimulator::do_md()
         {
             if (useGpuForUpdate)
             {
-                if (bNS && (bFirstStep || haveDDAtomOrdering(*cr)))
+                // On search steps, update handles to device vectors
+                if (bNS && (bFirstStep || haveDDAtomOrdering(*cr) || bExchanged))
                 {
                     integrator->set(stateGpu->getCoordinates(),
                                     stateGpu->getVelocities(),
@@ -1501,8 +1508,9 @@ void gmx::LegacySimulator::do_md()
                     /* The velocity copy is redundant if we had Center-of-Mass motion removed on
                      * the previous step. We don't check that now. */
                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
-                    if (!runScheduleWork->stepWork.haveGpuPmeOnThisRank
-                        && !runScheduleWork->stepWork.useGpuXBufferOps)
+                    if (bExchanged
+                        || (!runScheduleWork->stepWork.haveGpuPmeOnThisRank
+                            && !runScheduleWork->stepWork.useGpuXBufferOps))
                     {
                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
                     }
@@ -1522,27 +1530,17 @@ void gmx::LegacySimulator::do_md()
                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
 
                 // This applies Leap-Frog, LINCS and SETTLE in succession
-                integrator->integrate(
-                        stateGpu->getForcesReadyOnDeviceEvent(
-                                AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
-                        ir->delta_t,
-                        true,
-                        bCalcVir,
-                        shake_vir,
-                        doTemperatureScaling,
-                        ekind->tcstat,
-                        doParrinelloRahman,
-                        ir->nstpcouple * ir->delta_t,
-                        M);
-
-                // Copy velocities D2H after update if:
-                // - Globals are computed this step (includes the energy output steps).
-                // - Temperature is needed for the next step.
-                if (bGStat || needHalfStepKineticEnergy)
-                {
-                    stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
-                    stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
-                }
+                integrator->integrate(stateGpu->getLocalForcesReadyOnDeviceEvent(
+                                              runScheduleWork->stepWork, runScheduleWork->simulationWork),
+                                      ir->delta_t,
+                                      true,
+                                      bCalcVir,
+                                      shake_vir,
+                                      doTemperatureScaling,
+                                      ekind->tcstat,
+                                      doParrinelloRahman,
+                                      ir->nstpcouple * ir->delta_t,
+                                      M);
             }
             else
             {
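
The replaced integrate() call above also swaps getForcesReadyOnDeviceEvent(AtomLocality, bool) for a helper that derives the right synchronization point from the step and simulation workload descriptions itself. A hedged sketch of that shape only, with invented types (EventSketch, StepWorkSketch, StateSketch) rather than the real StatePropagatorDataGpu interface, which handles more cases:

// Sketch of hiding event selection behind one query (invented names).
#include <cstdio>

struct EventSketch
{
    const char* producer; // which task signals this event
};

struct StepWorkSketch
{
    bool useGpuFBufferOps = false;
};

class StateSketch
{
public:
    // Callers pass the per-step work description and get back the event that
    // marks "local forces are ready on the device"; they no longer need to
    // know how the forces got there.
    EventSketch localForcesReadyOnDeviceEvent(const StepWorkSketch& stepWork) const
    {
        return stepWork.useGpuFBufferOps ? EventSketch{ "GPU force reduction" }
                                         : EventSketch{ "H2D copy of CPU forces" };
    }
};

int main()
{
    StateSketch    state;
    StepWorkSketch stepWork;
    stepWork.useGpuFBufferOps = true;
    std::printf("integrate() waits on: %s\n",
                state.localForcesReadyOnDeviceEvent(stepWork).producer);
    return 0;
}
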
@@ -1625,7 +1623,7 @@ void gmx::LegacySimulator::do_md()
 
             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
             {
-                updatePrevStepPullCom(pull_work, state);
+                updatePrevStepPullCom(pull_work, state->pull_com_prev_step);
             }
 
             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
@@ -1641,14 +1639,33 @@ void gmx::LegacySimulator::do_md()
             // and when algorithms require it.
             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
 
-            if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
+            if (useGpuForUpdate)
             {
-                // Copy coordinates when needed to stop the CM motion.
-                if (useGpuForUpdate && (bDoReplEx || (!EI_VV(ir->eI) && bStopCM)))
+                const bool coordinatesRequiredForStopCM =
+                        bStopCM && (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
+                        && !EI_VV(ir->eI);
+
+                // Copy coordinates when needed to stop the CM motion or for replica exchange
+                if (coordinatesRequiredForStopCM || bDoReplEx)
                 {
                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
                 }
+
+                // Copy velocities back to the host if:
+                // - Globals are computed this step (includes the energy output steps).
+                // - Temperature is needed for the next step.
+                // - This is a replica exchange step (even though we will only need
+                //     the velocities if an exchange succeeds)
+                if (bGStat || needHalfStepKineticEnergy || bDoReplEx)
+                {
+                    stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
+                    stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
+                }
+            }
+
+            if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
+            {
                 // Since we're already communicating at this step, we
                 // can propagate intra-simulation signals. Note that
                 // check_nstglobalcomm has the responsibility for
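
The restructured block above separates the GPU-update copy-back decisions from the global-communication branch: coordinates come back to the host only when needed to stop center-of-mass motion or for replica exchange, and velocities when globals, the half-step kinetic energy, or a replica-exchange attempt need them. A small sketch expressing those predicates as stand-alone functions (invented names, not GROMACS code), which also makes the conditions easy to unit-test:

// Sketch: the copy-back conditions from the hunk above as pure predicates.
#include <cassert>

struct StepFlagsSketch
{
    bool bStopCM                   = false; // remove center-of-mass motion this step
    bool bGStat                    = false; // global statistics computed this step
    bool needHalfStepKineticEnergy = false;
    bool doInterSimSignal          = false;
    bool bDoReplEx                 = false; // replica-exchange attempt this step
    bool integratorIsVV            = false; // EI_VV(ir->eI)
};

bool copyCoordinatesToHost(const StepFlagsSketch& s)
{
    const bool coordinatesRequiredForStopCM =
            s.bStopCM && (s.bGStat || s.needHalfStepKineticEnergy || s.doInterSimSignal)
            && !s.integratorIsVV;
    return coordinatesRequiredForStopCM || s.bDoReplEx;
}

bool copyVelocitiesToHost(const StepFlagsSketch& s)
{
    return s.bGStat || s.needHalfStepKineticEnergy || s.bDoReplEx;
}

int main()
{
    StepFlagsSketch s;
    s.bDoReplEx = true; // replica-exchange step: need both x and v on the host
    assert(copyCoordinatesToHost(s) && copyVelocitiesToHost(s));

    StepFlagsSketch t;
    t.needHalfStepKineticEnergy = true; // temperature needed next step: v only
    assert(!copyCoordinatesToHost(t) && copyVelocitiesToHost(t));
    return 0;
}
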
@@ -1694,6 +1711,8 @@ void gmx::LegacySimulator::do_md()
                     // TODO: The special case of removing CM motion should be dealt more gracefully
                     if (useGpuForUpdate)
                     {
+                        // Issue #3988, #4106.
+                        stateGpu->resetCoordinatesCopiedToDeviceEvent(AtomLocality::Local);
                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
                         // Here we block until the H2D copy completes because event sync with the
                         // force kernels that use the coordinates on the next steps is not implemented
@@ -1723,6 +1742,7 @@ void gmx::LegacySimulator::do_md()
             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
         }
 
+        bool scaleCoordinates = !useGpuForUpdate || bDoReplEx;
         update_pcouple_after_coordinates(fplog,
                                          step,
                                          ir,
@@ -1736,7 +1756,7 @@ void gmx::LegacySimulator::do_md()
                                          state,
                                          nrnb,
                                          upd.deform(),
-                                         !useGpuForUpdate);
+                                         scaleCoordinates);
 
         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
                                                   && do_per_step(step, inputrec->nstpcouple));
@@ -1817,7 +1837,6 @@ void gmx::LegacySimulator::do_md()
                                                  md->tmass,
                                                  enerd,
                                                  ir->fepvals.get(),
-                                                 ir->expandedvals.get(),
                                                  lastbox,
                                                  PTCouplingArrays{ state->boxv,
                                                                    state->nosehoover_xi,
@@ -1953,7 +1972,9 @@ void gmx::LegacySimulator::do_md()
                                      md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
                                                  : gmx::ArrayRef<const unsigned short>(),
                                      md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
-                                             : gmx::ArrayRef<const unsigned short>());
+                                             : gmx::ArrayRef<const unsigned short>(),
+                                     md->cACC ? gmx::arrayRefFromArray(md->cACC, md->nr)
+                                              : gmx::ArrayRef<const unsigned short>());
             fr->longRangeNonbondeds->updateAfterPartition(*md);
         }