Balance event consumption for GPU update code path

[alexxy/gromacs.git] / src / gromacs / mdrun / md.cpp
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp

index 1e7479b3046a212d2e0bf806c8fd3a476b68b633..3898e07d3af54303c6059984bfc88b8a3e7a430f 100644 (file)
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -346,7 +346,7 @@ void gmx::LegacySimulator::do_md()
                                   top_global,
                                   constr ? constr->numFlexibleConstraints() : 0,
                                   ir->nstcalcenergy,
-                                 DOMAINDECOMP(cr),
+                                 haveDDAtomOrdering(*cr),
                                   useGpuForPme);
  
      {
@@ -364,7 +364,7 @@ void gmx::LegacySimulator::do_md()
                                 ? PinningPolicy::PinnedIfSupported
                                 : PinningPolicy::CannotBePinned);
      const t_mdatoms* md = mdAtoms->mdatoms();
-    if (DOMAINDECOMP(cr))
+    if (haveDDAtomOrdering(*cr))
      {
          // Local state only becomes valid now.
          dd_init_local_state(*cr->dd, state_global, state);
@@ -420,8 +420,8 @@ void gmx::LegacySimulator::do_md()
      // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
      if (useGpuForUpdate)
      {
-        GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
-                                   || constr->numConstraintsTotal() == 0,
+        GMX_RELEASE_ASSERT(!haveDDAtomOrdering(*cr) || ddUsesUpdateGroups(*cr->dd)
+                                   || constr == nullptr || constr->numConstraintsTotal() == 0,
                             "Constraints in domain decomposition are only supported with update "
                             "groups if using GPU update.\n");
          GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
@@ -661,13 +661,15 @@ void gmx::LegacySimulator::do_md()
                          shake_vir,
                          total_vir,
                          pres,
-                        gmx::ArrayRef<real>{},
                          &nullSignaller,
                          state->box,
                          &bSumEkinhOld,
                          cglo_flags_iteration,
                          step,
                          &observablesReducer);
+        // Clean up after pre-step use of compute_globals()
+        observablesReducer.markAsReadyToReduce();
+
          if (cglo_flags_iteration & CGLO_STOPCM)
          {
              /* At initialization, do not pass x with acceleration-correction mode
@@ -705,13 +707,14 @@ void gmx::LegacySimulator::do_md()
                          shake_vir,
                          total_vir,
                          pres,
-                        gmx::ArrayRef<real>{},
                          &nullSignaller,
                          state->box,
                          &bSumEkinhOld,
                          cglo_flags & ~CGLO_PRESSURE,
                          step,
                          &observablesReducer);
+        // Clean up after pre-step use of compute_globals()
+        observablesReducer.markAsReadyToReduce();
      }
  
      /* Calculate the initial half step temperature, and save the ekinh_old */
@@ -919,15 +922,15 @@ void gmx::LegacySimulator::do_md()
          do_verbose = mdrunOptions.verbose
                       && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
  
-        if (useGpuForUpdate && !bFirstStep && bNS)
+        // On search steps, when doing the update on the GPU, copy
+        // the coordinates and velocities to the host unless they are
+        // already there (i.e. on the first step and after replica
+        // exchange).
+        if (useGpuForUpdate && bNS && !bFirstStep && !bExchanged)
          {
-            // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
              stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
-            stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
-            // Copy coordinate from the GPU when needed at the search step.
-            // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
-            // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
              stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
+            stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
              stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
          }
  
@@ -959,19 +962,21 @@ void gmx::LegacySimulator::do_md()
                  if (correct_box(fplog, step, state->box))
                  {
                      bMasterState = TRUE;
-                    // If update is offloaded, it should be informed about the box size change
-                    if (useGpuForUpdate)
-                    {
-                        integrator->setPbc(PbcType::Xyz, state->box);
-                    }
                  }
              }
-            if (DOMAINDECOMP(cr) && bMasterState)
+            // If update is offloaded, and the box was changed either
+            // above or in a replica exchange on the previous step,
+            // the GPU Update object should be informed
+            if (useGpuForUpdate && (bMasterState || bExchanged))
+            {
+                integrator->setPbc(PbcType::Xyz, state->box);
+            }
+            if (haveDDAtomOrdering(*cr) && bMasterState)
              {
                  dd_collect_state(cr->dd, state, state_global);
              }
  
-            if (DOMAINDECOMP(cr))
+            if (haveDDAtomOrdering(*cr))
              {
                  /* Repartition the domain decomposition */
                  dd_partition_system(fplog,
@@ -1047,7 +1052,6 @@ void gmx::LegacySimulator::do_md()
                              nullptr,
                              nullptr,
                              nullptr,
-                            gmx::ArrayRef<real>{},
                              &nullSignaller,
                              state->box,
                              &bSumEkinhOld,
@@ -1487,7 +1491,8 @@ void gmx::LegacySimulator::do_md()
          {
              if (useGpuForUpdate)
              {
-                if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
+                // On search steps, update handles to device vectors
+                if (bNS && (bFirstStep || haveDDAtomOrdering(*cr) || bExchanged))
                  {
                      integrator->set(stateGpu->getCoordinates(),
                                      stateGpu->getVelocities(),
@@ -1499,8 +1504,9 @@ void gmx::LegacySimulator::do_md()
                      /* The velocity copy is redundant if we had Center-of-Mass motion removed on
                       * the previous step. We don't check that now. */
                      stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
-                    if (!runScheduleWork->stepWork.haveGpuPmeOnThisRank
-                        && !runScheduleWork->stepWork.useGpuXBufferOps)
+                    if (bExchanged
+                        || (!runScheduleWork->stepWork.haveGpuPmeOnThisRank
+                            && !runScheduleWork->stepWork.useGpuXBufferOps))
                      {
                          stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
                      }
@@ -1520,27 +1526,17 @@ void gmx::LegacySimulator::do_md()
                           && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
  
                  // This applies Leap-Frog, LINCS and SETTLE in succession
-                integrator->integrate(
-                        stateGpu->getForcesReadyOnDeviceEvent(
-                                AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
-                        ir->delta_t,
-                        true,
-                        bCalcVir,
-                        shake_vir,
-                        doTemperatureScaling,
-                        ekind->tcstat,
-                        doParrinelloRahman,
-                        ir->nstpcouple * ir->delta_t,
-                        M);
-
-                // Copy velocities D2H after update if:
-                // - Globals are computed this step (includes the energy output steps).
-                // - Temperature is needed for the next step.
-                if (bGStat || needHalfStepKineticEnergy)
-                {
-                    stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
-                    stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
-                }
+                integrator->integrate(stateGpu->getLocalForcesReadyOnDeviceEvent(
+                                              runScheduleWork->stepWork, runScheduleWork->simulationWork),
+                                      ir->delta_t,
+                                      true,
+                                      bCalcVir,
+                                      shake_vir,
+                                      doTemperatureScaling,
+                                      ekind->tcstat,
+                                      doParrinelloRahman,
+                                      ir->nstpcouple * ir->delta_t,
+                                      M);
              }
              else
              {
@@ -1623,7 +1619,7 @@ void gmx::LegacySimulator::do_md()
  
              if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
              {
-                updatePrevStepPullCom(pull_work, state);
+                updatePrevStepPullCom(pull_work, state->pull_com_prev_step);
              }
  
              enerd->term[F_DVDL_CONSTR] += dvdl_constr;
@@ -1639,14 +1635,33 @@ void gmx::LegacySimulator::do_md()
              // and when algorithms require it.
              const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
  
-            if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
+            if (useGpuForUpdate)
              {
-                // Copy coordinates when needed to stop the CM motion.
-                if (useGpuForUpdate && (bDoReplEx || (!EI_VV(ir->eI) && bStopCM)))
+                const bool coordinatesRequiredForStopCM =
+                        bStopCM && (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
+                        && !EI_VV(ir->eI);
+
+                // Copy coordinates when needed to stop the CM motion or for replica exchange
+                if (coordinatesRequiredForStopCM || bDoReplEx)
                  {
                      stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
                      stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
                  }
+
+                // Copy velocities back to the host if:
+                // - Globals are computed this step (includes the energy output steps).
+                // - Temperature is needed for the next step.
+                // - This is a replica exchange step (even though we will only need
+                //     the velocities if an exchange succeeds)
+                if (bGStat || needHalfStepKineticEnergy || bDoReplEx)
+                {
+                    stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
+                    stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
+                }
+            }
+
+            if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
+            {
                  // Since we're already communicating at this step, we
                  // can propagate intra-simulation signals. Note that
                  // check_nstglobalcomm has the responsibility for
@@ -1657,35 +1672,32 @@ void gmx::LegacySimulator::do_md()
                  bool                doIntraSimSignal = true;
                  SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
  
-                compute_globals(
-                        gstat,
-                        cr,
-                        ir,
-                        fr,
-                        ekind,
-                        makeConstArrayRef(state->x),
-                        makeConstArrayRef(state->v),
-                        state->box,
-                        md,
-                        nrnb,
-                        &vcm,
-                        wcycle,
-                        enerd,
-                        force_vir,
-                        shake_vir,
-                        total_vir,
-                        pres,
-                        (!EI_VV(ir->eI) && bCalcEner && constr != nullptr) ? constr->rmsdData()
-                                                                           : gmx::ArrayRef<real>{},
-                        &signaller,
-                        lastbox,
-                        &bSumEkinhOld,
-                        (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
-                                | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
-                                | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
-                                | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT,
-                        step,
-                        &observablesReducer);
+                compute_globals(gstat,
+                                cr,
+                                ir,
+                                fr,
+                                ekind,
+                                makeConstArrayRef(state->x),
+                                makeConstArrayRef(state->v),
+                                state->box,
+                                md,
+                                nrnb,
+                                &vcm,
+                                wcycle,
+                                enerd,
+                                force_vir,
+                                shake_vir,
+                                total_vir,
+                                pres,
+                                &signaller,
+                                lastbox,
+                                &bSumEkinhOld,
+                                (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
+                                        | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
+                                        | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
+                                        | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT,
+                                step,
+                                &observablesReducer);
                  if (!EI_VV(ir->eI) && bStopCM)
                  {
                      process_and_stopcm_grp(
@@ -1695,6 +1707,8 @@ void gmx::LegacySimulator::do_md()
                      // TODO: The special case of removing CM motion should be dealt more gracefully
                      if (useGpuForUpdate)
                      {
+                        // Issue #3988, #4106.
+                        stateGpu->resetCoordinatesCopiedToDeviceEvent(AtomLocality::Local);
                          stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
                          // Here we block until the H2D copy completes because event sync with the
                          // force kernels that use the coordinates on the next steps is not implemented
@@ -1724,6 +1738,7 @@ void gmx::LegacySimulator::do_md()
              accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
          }
  
+        bool scaleCoordinates = !useGpuForUpdate || bDoReplEx;
          update_pcouple_after_coordinates(fplog,
                                           step,
                                           ir,
@@ -1737,7 +1752,7 @@ void gmx::LegacySimulator::do_md()
                                           state,
                                           nrnb,
                                           upd.deform(),
-                                         !useGpuForUpdate);
+                                         scaleCoordinates);
  
          const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
                                                    && do_per_step(step, inputrec->nstpcouple));
@@ -1818,7 +1833,6 @@ void gmx::LegacySimulator::do_md()
                                                   md->tmass,
                                                   enerd,
                                                   ir->fepvals.get(),
-                                                 ir->expandedvals.get(),
                                                   lastbox,
                                                   PTCouplingArrays{ state->boxv,
                                                                     state->nosehoover_xi,
@@ -1914,7 +1928,7 @@ void gmx::LegacySimulator::do_md()
                                               MASTER(cr) && mdrunOptions.verbose,
                                               bRerunMD);
  
-            if (bNeedRepartition && DOMAINDECOMP(cr))
+            if (bNeedRepartition && haveDDAtomOrdering(*cr))
              {
                  dd_collect_state(cr->dd, state, state_global);
              }
@@ -1927,7 +1941,7 @@ void gmx::LegacySimulator::do_md()
              bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
          }
  
-        if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
+        if ((bExchanged || bNeedRepartition) && haveDDAtomOrdering(*cr))
          {
              dd_partition_system(fplog,
                                  mdlog,
@@ -1982,7 +1996,7 @@ void gmx::LegacySimulator::do_md()
          }
  
          cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
-        if (DOMAINDECOMP(cr) && wcycle)
+        if (haveDDAtomOrdering(*cr) && wcycle)
          {
              dd_cycles_add(cr->dd, cycles, ddCyclStep);
          }
@@ -1990,6 +2004,7 @@ void gmx::LegacySimulator::do_md()
          /* increase the MD step number */
          step++;
          step_rel++;
+        observablesReducer.markAsReadyToReduce();
  
  #if GMX_FAHCORE
          if (MASTER(cr))