Set up workload data structures
[alexxy/gromacs.git] / src / gromacs / mdlib / sim_util.cpp
index 881b9974ac29d95dcd353a23ad22be4a97aa2acc..4d0b1595ae82c5f7e0825de8df64e13bcd452e29 100644 (file)
@@ -77,7 +77,6 @@
 #include "gromacs/mdlib/force.h"
 #include "gromacs/mdlib/forcerec.h"
 #include "gromacs/mdlib/gmx_omp_nthreads.h"
-#include "gromacs/mdlib/ppforceworkload.h"
 #include "gromacs/mdlib/qmmm.h"
 #include "gromacs/mdlib/update.h"
 #include "gromacs/mdtypes/commrec.h"
@@ -86,6 +85,7 @@
 #include "gromacs/mdtypes/iforceprovider.h"
 #include "gromacs/mdtypes/inputrec.h"
 #include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/simulation_workload.h"
 #include "gromacs/mdtypes/state.h"
 #include "gromacs/nbnxm/atomdata.h"
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 #include "gromacs/utility/sysinfo.h"
 
 using gmx::ForceOutputs;
+using gmx::StepWorkload;
+using gmx::DomainLifetimeWorkload;
 
 // TODO: this environment variable allows us to verify before release
 // that on less common architectures the total cost of polling is not larger than
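The gmx::StepWorkload and gmx::DomainLifetimeWorkload classes used throughout this change come from the newly included gromacs/mdtypes/simulation_workload.h. A minimal sketch of their shape, inferred purely from how the fields are used in this file (the authoritative definitions live in that header and may carry further fields):

namespace gmx
{

//! Flags describing the work to do this step; rebuilt from the legacy
//! GMX_FORCE_* bitmask on every call to do_force() (see setupStepWorkload()).
class StepWorkload
{
    public:
        bool stateChanged           = false;
        bool haveDynamicBox         = false;
        bool doNeighborSearch       = false;
        bool computeVirial          = false;
        bool computeEnergy          = false;
        bool computeForces          = false;
        bool computeNonbondedForces = false;
};

//! Flags valid for the lifetime of a domain; rebuilt only at neighbor-search
//! steps, when the domain and its topology change (see setupDomainLifetimeWorkload()).
class DomainLifetimeWorkload
{
    public:
        bool haveSpecialForces      = false;
        bool haveCpuBondedWork      = false;
        bool haveGpuBondedWork      = false;
        bool haveRestraintsWork     = false;
        bool haveCpuListedForceWork = false;
};

//! Aggregate of the two scopes, passed into do_force() as runScheduleWork.
class MdrunScheduleWorkload
{
    public:
        StepWorkload           stepWork;
        DomainLifetimeWorkload domainWork;
};

}  // namespace gmx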
@@ -271,7 +273,7 @@ static void post_process_forces(const t_commrec       *cr,
                                 const t_graph         *graph,
                                 const t_forcerec      *fr,
                                 const gmx_vsite_t     *vsite,
-                                const gmx::ForceFlags &forceFlags)
+                                const StepWorkload    &stepWork)
 {
     rvec *f = as_rvec_array(forceOutputs->forceWithShiftForces().force().data());
 
@@ -288,13 +290,13 @@ static void post_process_forces(const t_commrec       *cr,
              */
             matrix virial = { { 0 } };
             spread_vsite_f(vsite, x, fDirectVir, nullptr,
-                           forceFlags.computeVirial, virial,
+                           stepWork.computeVirial, virial,
                            nrnb,
                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
             forceWithVirial.addVirialContribution(virial);
         }
 
-        if (forceFlags.computeVirial)
+        if (stepWork.computeVirial)
         {
             /* Now add the forces, this is local */
             sum_forces(f, forceWithVirial.force_);
@@ -319,14 +321,14 @@ static void post_process_forces(const t_commrec       *cr,
 static void do_nb_verlet(t_forcerec                       *fr,
                          const interaction_const_t        *ic,
                          gmx_enerdata_t                   *enerd,
-                         const gmx::ForceFlags            &forceFlags,
+                         const StepWorkload               &stepWork,
                          const Nbnxm::InteractionLocality  ilocality,
                          const int                         clearF,
                          const int64_t                     step,
                          t_nrnb                           *nrnb,
                          gmx_wallcycle_t                   wcycle)
 {
-    if (!forceFlags.computeNonbondedForces)
+    if (!stepWork.computeNonbondedForces)
     {
         /* skip non-bonded calculation */
         return;
@@ -356,7 +358,7 @@ static void do_nb_verlet(t_forcerec                       *fr,
         }
     }
 
-    nbv->dispatchNonbondedKernel(ilocality, *ic, forceFlags, clearF, *fr, enerd, nrnb);
+    nbv->dispatchNonbondedKernel(ilocality, *ic, stepWork, clearF, *fr, enerd, nrnb);
 }
 
 static inline void clear_rvecs_omp(int n, rvec v[])
@@ -514,7 +516,7 @@ haveSpecialForces(const t_inputrec              *inputrec,
  * \param[in]     x                The coordinates
  * \param[in]     mdatoms          Per atom properties
  * \param[in]     lambda           Array of free-energy lambda values
- * \param[in]     forceFlags       Force schedule flags
+ * \param[in]     stepWork         Step schedule flags
  * \param[in,out] forceWithVirial  Force and virial buffers
  * \param[in,out] enerd            Energy buffer
  * \param[in,out] ed               Essential dynamics pointer
@@ -539,7 +541,7 @@ computeSpecialForces(FILE                          *fplog,
                      gmx::ArrayRef<const gmx::RVec> x,
                      const t_mdatoms               *mdatoms,
                      real                          *lambda,
-                     const gmx::ForceFlags         &forceFlags,
+                     const StepWorkload            &stepWork,
                      gmx::ForceWithVirial          *forceWithVirial,
                      gmx_enerdata_t                *enerd,
                      gmx_edsam                     *ed,
@@ -548,7 +550,7 @@ computeSpecialForces(FILE                          *fplog,
     /* NOTE: Currently all ForceProviders only provide forces.
      *       When they also provide energies, remove this conditional.
      */
-    if (forceFlags.computeForces)
+    if (stepWork.computeForces)
     {
         gmx::ForceProviderInput  forceProviderInput(x, *mdatoms, t, box, *cr);
         gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
@@ -594,7 +596,7 @@ computeSpecialForces(FILE                          *fplog,
     }
 
     /* Add forces from interactive molecular dynamics (IMD), if any */
-    if (inputrec->bIMD && forceFlags.computeForces)
+    if (inputrec->bIMD && stepWork.computeForces)
     {
         imdSession->applyForces(f);
     }
@@ -605,20 +607,20 @@ computeSpecialForces(FILE                          *fplog,
  * \param[in]  pmedata              The PME structure
  * \param[in]  box                  The box matrix
  * \param[in]  x                    Coordinate array
- * \param[in]  forceFlags           Force schedule flags
+ * \param[in]  stepWork             Step schedule flags
  * \param[in]  pmeFlags             PME flags
  * \param[in]  useGpuForceReduction True if GPU-based force reduction is active this step
  * \param[in]  wcycle               The wallcycle structure
  */
-static inline void launchPmeGpuSpread(gmx_pme_t             *pmedata,
-                                      const matrix           box,
-                                      const rvec             x[],
-                                      const gmx::ForceFlags &forceFlags,
-                                      int                    pmeFlags,
-                                      bool                   useGpuForceReduction,
-                                      gmx_wallcycle_t        wcycle)
+static inline void launchPmeGpuSpread(gmx_pme_t          *pmedata,
+                                      const matrix        box,
+                                      const rvec          x[],
+                                      const StepWorkload &stepWork,
+                                      int                 pmeFlags,
+                                      bool                useGpuForceReduction,
+                                      gmx_wallcycle_t     wcycle)
 {
-    pme_gpu_prepare_computation(pmedata, forceFlags.haveDynamicBox, box, wcycle, pmeFlags, useGpuForceReduction);
+    pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, useGpuForceReduction);
     pme_gpu_copy_coordinates_to_gpu(pmedata, x, wcycle);
     pme_gpu_launch_spread(pmedata, wcycle);
 }
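For context: this spread launch is only the first half of the PME GPU pipeline; the FFT/solve/gather half (launchPmeGpuFftAndGather, next hunk) is launched later in do_force() so it overlaps with the nonbonded GPU kernels. A sketch of the resulting per-step ordering; the gather call's wcycle argument is an assumption, as only its first parameter is visible in this excerpt:

    if (useGpuPme)
    {
        // Launched early, before pair search and local nonbonded work:
        launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()),
                           stepWork, pmeFlags, useGpuPmeFReduction, wcycle);
        // ... local pair-search/nonbonded launches happen in between ...
        // Launched afterwards, overlapping with the nonbonded kernels:
        launchPmeGpuFftAndGather(fr->pmedata, wcycle);
    }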
@@ -650,17 +652,17 @@ static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
  * \param[in,out] pmedata          PME module data
  * \param[in,out] forceOutputs     Output buffer for the forces and virial
  * \param[in,out] enerd            Energy data structure results are reduced into
- * \param[in]     forceFlags       Force schedule flags
+ * \param[in]     stepWork         Step schedule flags
  * \param[in]     pmeFlags         PME flags
  * \param[in]     wcycle           The wallcycle structure
  */
-static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv,
-                                        gmx_pme_t                           *pmedata,
-                                        gmx::ForceOutputs                   *forceOutputs,
-                                        gmx_enerdata_t                      *enerd,
-                                        const gmx::ForceFlags               &forceFlags,
-                                        int                                  pmeFlags,
-                                        gmx_wallcycle_t                      wcycle)
+static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
+                                        gmx_pme_t          *pmedata,
+                                        gmx::ForceOutputs  *forceOutputs,
+                                        gmx_enerdata_t     *enerd,
+                                        const StepWorkload &stepWork,
+                                        int                 pmeFlags,
+                                        gmx_wallcycle_t     wcycle)
 {
     bool isPmeGpuDone = false;
     bool isNbGpuDone  = false;
@@ -684,7 +686,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv
         {
             GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
             isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
-                                                     forceFlags,
+                                                     stepWork,
                                                      Nbnxm::AtomLocality::Local,
                                                      enerd->grpp.ener[egLJSR].data(),
                                                      enerd->grpp.ener[egCOULSR].data(),
@@ -705,7 +707,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv
  * \param[in] pull_work The pull work object.
  * \param[in] inputrec  input record
  * \param[in] force     force array
- * \param[in] forceFlags Force schedule flags
+ * \param[in] stepWork  Step schedule flags
  * \param[out] wcycle   wallcycle recording structure
  *
  * \returns             Cleared force output structure
@@ -715,15 +717,15 @@ setupForceOutputs(t_forcerec                          *fr,
                   pull_t                              *pull_work,
                   const t_inputrec                    &inputrec,
                   gmx::ArrayRefWithPadding<gmx::RVec>  force,
-                  const gmx::ForceFlags               &forceFlags,
+                  const StepWorkload                  &stepWork,
                   gmx_wallcycle_t                      wcycle)
 {
     wallcycle_sub_start(wcycle, ewcsCLEAR_FORCE_BUFFER);
 
     /* NOTE: We assume fr->shiftForces is all zeros here */
-    gmx::ForceWithShiftForces forceWithShiftForces(force, forceFlags.computeVirial, fr->shiftForces);
+    gmx::ForceWithShiftForces forceWithShiftForces(force, stepWork.computeVirial, fr->shiftForces);
 
-    if (forceFlags.computeForces)
+    if (stepWork.computeForces)
     {
         /* Clear the short- and long-range forces */
         clear_rvecs_omp(fr->natoms_force_constr,
@@ -735,12 +737,12 @@ setupForceOutputs(t_forcerec                          *fr,
      * directly, such as PME. Otherwise, forceWithVirial uses the
      * same force (f in legacy calls) buffer as other algorithms.
      */
-    const bool useSeparateForceWithVirialBuffer = (forceFlags.computeForces &&
-                                                   (forceFlags.computeVirial && fr->haveDirectVirialContributions));
+    const bool useSeparateForceWithVirialBuffer = (stepWork.computeForces &&
+                                                   (stepWork.computeVirial && fr->haveDirectVirialContributions));
     /* forceWithVirial uses the local atom range only */
     gmx::ForceWithVirial forceWithVirial(useSeparateForceWithVirialBuffer ?
                                          fr->forceBufferForDirectVirialContributions : force.unpaddedArrayRef(),
-                                         forceFlags.computeVirial);
+                                         stepWork.computeVirial);
 
     if (useSeparateForceWithVirialBuffer)
     {
@@ -763,30 +765,23 @@ setupForceOutputs(t_forcerec                          *fr,
 }
 
 
-/*! \brief Set up flags that indicate what type of work is there to compute.
- *
- * Currently we only update it at search steps,
- * but some properties may change more frequently (e.g. virial/non-virial step),
- * so when including those either the frequency of update (per-step) or the scope
- * of a flag will change (i.e. a set of flags for nstlist steps).
- *
+/*! \brief Set up flags, valid for the lifetime of the domain, that indicate what type of work there is to compute.
  */
 static void
-setupForceWorkload(gmx::PpForceWorkload  *forceWork,
-                   const t_inputrec      *inputrec,
-                   const t_forcerec      *fr,
-                   const pull_t          *pull_work,
-                   const gmx_edsam       *ed,
-                   const t_idef          &idef,
-                   const t_fcdata        *fcd,
-                   const gmx::ForceFlags &forceFlags
-                   )
+setupDomainLifetimeWorkload(DomainLifetimeWorkload *domainWork,
+                            const t_inputrec       *inputrec,
+                            const t_forcerec       *fr,
+                            const pull_t           *pull_work,
+                            const gmx_edsam        *ed,
+                            const t_idef           &idef,
+                            const t_fcdata         *fcd,
+                            const StepWorkload     &stepWork)
 {
-    forceWork->haveSpecialForces      = haveSpecialForces(inputrec, fr->forceProviders, pull_work, forceFlags.computeForces, ed);
-    forceWork->haveCpuBondedWork      = haveCpuBondeds(*fr);
-    forceWork->haveGpuBondedWork      = ((fr->gpuBonded != nullptr) && fr->gpuBonded->haveInteractions());
-    forceWork->haveRestraintsWork     = havePositionRestraints(idef, *fcd);
-    forceWork->haveCpuListedForceWork = haveCpuListedForces(*fr, idef, *fcd);
+    domainWork->haveSpecialForces      = haveSpecialForces(inputrec, fr->forceProviders, pull_work, stepWork.computeForces, ed);
+    domainWork->haveCpuBondedWork      = haveCpuBondeds(*fr);
+    domainWork->haveGpuBondedWork      = ((fr->gpuBonded != nullptr) && fr->gpuBonded->haveInteractions());
+    domainWork->haveRestraintsWork     = havePositionRestraints(idef, *fcd);
+    domainWork->haveCpuListedForceWork = haveCpuListedForces(*fr, idef, *fcd);
 }
 
 /*! \brief Set up force flag struct from the force bitmask.
@@ -796,9 +791,9 @@ setupForceWorkload(gmx::PpForceWorkload  *forceWork,
  * \param[in]      isNonbondedOn        Global override; if false, all nonbonded calculation is turned off.
  */
 static void
-setupForceFlags(gmx::ForceFlags *flags,
-                const int        legacyFlags,
-                const bool       isNonbondedOn)
+setupStepWorkload(StepWorkload *flags,
+                  const int     legacyFlags,
+                  const bool    isNonbondedOn)
 {
     flags->stateChanged           = ((legacyFlags & GMX_FORCE_STATECHANGED) != 0);
     flags->haveDynamicBox         = ((legacyFlags & GMX_FORCE_DYNAMICBOX) != 0);
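The hunk ends here, but in the full function the remaining translations presumably continue in the same pattern; a sketch, assuming the usual legacy GMX_FORCE_* bit names (of these, only GMX_FORCE_NONBONDED appears verbatim elsewhere in this diff):

    // Sketch of the remaining bitmask translations; the nonbonded bit is the
    // only one additionally gated by the global isNonbondedOn override.
    flags->doNeighborSearch       = ((legacyFlags & GMX_FORCE_NS) != 0);
    flags->computeVirial          = ((legacyFlags & GMX_FORCE_VIRIAL) != 0);
    flags->computeEnergy          = ((legacyFlags & GMX_FORCE_ENERGY) != 0);
    flags->computeForces          = ((legacyFlags & GMX_FORCE_FORCES) != 0);
    flags->computeNonbondedForces = ((legacyFlags & GMX_FORCE_NONBONDED) != 0) && isNonbondedOn;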
@@ -815,18 +810,18 @@ setupForceFlags(gmx::ForceFlags *flags,
 /*! \brief Launch end-of-step GPU tasks: buffer clearing and rolling pruning.
  *
  * TODO: eliminate the \p useGpuNonbonded and \p useGpuPme when these are
- * incorporated in PpForceWorkload.
+ * incorporated in DomainLifetimeWorkload.
  */
 static void
-launchGpuEndOfStepTasks(nonbonded_verlet_t            *nbv,
-                        gmx::GpuBonded                *gpuBonded,
-                        gmx_pme_t                     *pmedata,
-                        gmx_enerdata_t                *enerd,
-                        const gmx::MdScheduleWorkload &mdScheduleWork,
-                        bool                           useGpuNonbonded,
-                        bool                           useGpuPme,
-                        int64_t                        step,
-                        gmx_wallcycle_t                wcycle)
+launchGpuEndOfStepTasks(nonbonded_verlet_t               *nbv,
+                        gmx::GpuBonded                   *gpuBonded,
+                        gmx_pme_t                        *pmedata,
+                        gmx_enerdata_t                   *enerd,
+                        const gmx::MdrunScheduleWorkload &runScheduleWork,
+                        bool                              useGpuNonbonded,
+                        bool                              useGpuPme,
+                        int64_t                           step,
+                        gmx_wallcycle_t                   wcycle)
 {
     if (useGpuNonbonded)
     {
@@ -842,7 +837,7 @@ launchGpuEndOfStepTasks(nonbonded_verlet_t            *nbv,
         /* now clear the GPU outputs while we finish the step on the CPU */
         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, mdScheduleWork.forceFlags.computeVirial);
+        Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, runScheduleWork.stepWork.computeVirial);
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
     }
@@ -852,7 +847,7 @@ launchGpuEndOfStepTasks(nonbonded_verlet_t            *nbv,
         pme_gpu_reinit_computation(pmedata, wcycle);
     }
 
-    if (mdScheduleWork.forceWork.haveGpuBondedWork && mdScheduleWork.forceFlags.computeEnergy)
+    if (runScheduleWork.domainWork.haveGpuBondedWork && runScheduleWork.stepWork.computeEnergy)
     {
         // in principle this should be included in the DD balancing region,
         // but generally it is infrequent so we'll omit it for the sake of
@@ -887,7 +882,7 @@ void do_force(FILE                                     *fplog,
               gmx::ArrayRef<real>                       lambda,
               t_graph                                  *graph,
               t_forcerec                               *fr,
-              gmx::MdScheduleWorkload                  *mdScheduleWork,
+              gmx::MdrunScheduleWorkload               *runScheduleWork,
               const gmx_vsite_t                        *vsite,
               rvec                                      mu_tot,
               double                                    t,
@@ -908,11 +903,11 @@ void do_force(FILE                                     *fplog,
     {
         legacyFlags &= ~GMX_FORCE_NONBONDED;
     }
-    setupForceFlags(&mdScheduleWork->forceFlags, legacyFlags, fr->bNonbonded);
+    setupStepWorkload(&runScheduleWork->stepWork, legacyFlags, fr->bNonbonded);
 
-    const gmx::ForceFlags &forceFlags = mdScheduleWork->forceFlags;
+    const gmx::StepWorkload &stepWork = runScheduleWork->stepWork;
 
-    bFillGrid     = (forceFlags.doNeighborSearch && forceFlags.stateChanged);
+    bFillGrid     = (stepWork.doNeighborSearch && stepWork.stateChanged);
     bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
     bUseGPU       = fr->nbv->useGpu();
     bUseOrEmulGPU = bUseGPU || fr->nbv->emulateGpu();
@@ -922,9 +917,9 @@ void do_force(FILE                                     *fplog,
     const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
         ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
     const int  pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE |
-        (forceFlags.computeVirial   ? GMX_PME_CALC_ENER_VIR : 0) |
-        (forceFlags.computeEnergy ? GMX_PME_CALC_ENER_VIR : 0) |
-        (forceFlags.computeForces   ? GMX_PME_CALC_F : 0);
+        (stepWork.computeVirial ? GMX_PME_CALC_ENER_VIR : 0) |
+        (stepWork.computeEnergy ? GMX_PME_CALC_ENER_VIR : 0) |
+        (stepWork.computeForces ? GMX_PME_CALC_F : 0);
 
     // Switches on whether to use GPU for position and force buffer operations
     // TODO consider all possible combinations of triggers, and how to combine optimally in each case.
@@ -932,7 +927,7 @@ void do_force(FILE                                     *fplog,
         BufferOpsUseGpu::True : BufferOpsUseGpu::False;
     // GPU Force buffer ops are disabled on virial steps, because the virial calc is not yet ported to GPU
     const BufferOpsUseGpu useGpuFBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA))
-        && !(forceFlags.computeVirial || forceFlags.computeEnergy) ?
+        && !(stepWork.computeVirial || stepWork.computeEnergy) ?
         BufferOpsUseGpu::True : BufferOpsUseGpu::False;
     // TODO: move / add this flag to the internal PME GPU data structures
     const bool useGpuPmeFReduction = (useGpuFBufOps == BufferOpsUseGpu::True) &&
@@ -942,7 +937,7 @@ void do_force(FILE                                     *fplog,
      * somewhere early inside the step after communication during domain
      * decomposition (and not during the previous step as usual).
      */
-    if (forceFlags.doNeighborSearch)
+    if (stepWork.doNeighborSearch)
     {
         ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);
     }
@@ -952,7 +947,7 @@ void do_force(FILE                                     *fplog,
 
     clear_mat(vir_force);
 
-    if (forceFlags.stateChanged)
+    if (stepWork.stateChanged)
     {
         if (inputrecNeedMutot(inputrec))
         {
@@ -970,7 +965,7 @@ void do_force(FILE                                     *fplog,
         /* Compute shift vectors every step,
          * because of pressure coupling or box deformation!
          */
-        if (forceFlags.haveDynamicBox && forceFlags.stateChanged)
+        if (stepWork.haveDynamicBox && stepWork.stateChanged)
         {
             calc_shifts(box, fr->shift_vec);
         }
@@ -986,7 +981,7 @@ void do_force(FILE                                     *fplog,
         }
     }
 
-    nbnxn_atomdata_copy_shiftvec(forceFlags.haveDynamicBox,
+    nbnxn_atomdata_copy_shiftvec(stepWork.haveDynamicBox,
                                  fr->shift_vec, nbv->nbat.get());
 
 #if GMX_MPI
@@ -999,20 +994,20 @@ void do_force(FILE                                     *fplog,
          */
         gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
                                  lambda[efptCOUL], lambda[efptVDW],
-                                 (forceFlags.computeVirial || forceFlags.computeEnergy),
+                                 (stepWork.computeVirial || stepWork.computeEnergy),
                                  step, wcycle);
     }
 #endif /* GMX_MPI */
 
     if (useGpuPme)
     {
-        launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), forceFlags, pmeFlags, useGpuPmeFReduction, wcycle);
+        launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), stepWork, pmeFlags, useGpuPmeFReduction, wcycle);
     }
 
     /* do gridding for pair search */
-    if (forceFlags.doNeighborSearch)
+    if (stepWork.doNeighborSearch)
     {
-        if (graph && forceFlags.stateChanged)
+        if (graph && stepWork.stateChanged)
         {
             /* Calculate intramolecular shift vectors to make molecules whole */
             mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
@@ -1082,24 +1077,26 @@ void do_force(FILE                                     *fplog,
         }
     }
 
-    // Call it per-step as force-flags can change.
-    // Need to run after the GPU-offload bonded interaction lists
-    // are set up to be able to determine whether there is bonded work.
-    setupForceWorkload(&mdScheduleWork->forceWork,
-                       inputrec,
-                       fr,
-                       pull_work,
-                       ed,
-                       top->idef,
-                       fcd,
-                       forceFlags);
+    if (stepWork.doNeighborSearch)
+    {
+        // Need to run after the GPU-offload bonded interaction lists
+        // are set up to be able to determine whether there is bonded work.
+        setupDomainLifetimeWorkload(&runScheduleWork->domainWork,
+                                    inputrec,
+                                    fr,
+                                    pull_work,
+                                    ed,
+                                    top->idef,
+                                    fcd,
+                                    stepWork);
+    }
 
-    const gmx::PpForceWorkload &forceWork = mdScheduleWork->forceWork;
+    const gmx::DomainLifetimeWorkload &domainWork = runScheduleWork->domainWork;
 
     /* do local pair search */
-    if (forceFlags.doNeighborSearch)
+    if (stepWork.doNeighborSearch)
     {
-        // TODO: fuse this branch with the above forceFlags.doNeighborSearch block
+        // TODO: fuse this branch with the above stepWork.doNeighborSearch block
         wallcycle_start_nocount(wcycle, ewcNS);
         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
         /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
@@ -1151,7 +1148,7 @@ void do_force(FILE                                     *fplog,
 
         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
-        if (forceFlags.doNeighborSearch || (useGpuXBufOps == BufferOpsUseGpu::False))
+        if (stepWork.doNeighborSearch || (useGpuXBufOps == BufferOpsUseGpu::False))
         {
             Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
                                       Nbnxm::AtomLocality::Local);
@@ -1161,16 +1158,16 @@ void do_force(FILE                                     *fplog,
 
         // bonded work not split into separate local and non-local, so with DD
         // we can only launch the kernel after non-local coordinates have been received.
-        if (forceWork.haveGpuBondedWork && !havePPDomainDecomposition(cr))
+        if (domainWork.haveGpuBondedWork && !havePPDomainDecomposition(cr))
         {
             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
-            fr->gpuBonded->launchKernel(fr, forceFlags, box);
+            fr->gpuBonded->launchKernel(fr, stepWork, box);
             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
         }
 
         /* launch local nonbonded work on GPU */
         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        do_nb_verlet(fr, ic, enerd, forceFlags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
+        do_nb_verlet(fr, ic, enerd, stepWork, Nbnxm::InteractionLocality::Local, enbvClearFNo,
                      step, nrnb, wcycle);
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
@@ -1195,9 +1192,9 @@ void do_force(FILE                                     *fplog,
        do non-local pair search */
     if (havePPDomainDecomposition(cr))
     {
-        if (forceFlags.doNeighborSearch)
+        if (stepWork.doNeighborSearch)
         {
-            // TODO: fuse this branch with the above large forceFlags.doNeighborSearch block
+            // TODO: fuse this branch with the above large stepWork.doNeighborSearch block
             wallcycle_start_nocount(wcycle, ewcNS);
             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
             /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
@@ -1223,7 +1220,7 @@ void do_force(FILE                                     *fplog,
                 gpuHaloExchange->communicateHaloCoordinates(box);
 
                 // TODO Force flags should include haveFreeEnergyWork for this domain
-                if (forceWork.haveCpuBondedWork || (fr->efep != efepNO))
+                if (domainWork.haveCpuBondedWork || (fr->efep != efepNO))
                 {
                     // non-local part of coordinate buffer must be copied back to host for CPU work
                     nbv->launch_copy_x_from_gpu(as_rvec_array(x.unpaddedArrayRef().data()), Nbnxm::AtomLocality::NonLocal);
@@ -1257,7 +1254,7 @@ void do_force(FILE                                     *fplog,
         {
             wallcycle_start(wcycle, ewcLAUNCH_GPU);
 
-            if (forceFlags.doNeighborSearch || (useGpuXBufOps == BufferOpsUseGpu::False))
+            if (stepWork.doNeighborSearch || (useGpuXBufOps == BufferOpsUseGpu::False))
             {
                 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
                 Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
@@ -1265,16 +1262,16 @@ void do_force(FILE                                     *fplog,
                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
             }
 
-            if (forceWork.haveGpuBondedWork)
+            if (domainWork.haveGpuBondedWork)
             {
                 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
-                fr->gpuBonded->launchKernel(fr, forceFlags, box);
+                fr->gpuBonded->launchKernel(fr, stepWork, box);
                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
             }
 
             /* launch non-local nonbonded tasks on GPU */
             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            do_nb_verlet(fr, ic, enerd, forceFlags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
+            do_nb_verlet(fr, ic, enerd, stepWork, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
                          step, nrnb, wcycle);
             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
@@ -1293,20 +1290,20 @@ void do_force(FILE                                     *fplog,
         if (havePPDomainDecomposition(cr))
         {
             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                      forceFlags, Nbnxm::AtomLocality::NonLocal, copyBackNbForce);
+                                      stepWork, Nbnxm::AtomLocality::NonLocal, copyBackNbForce);
         }
         Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                  forceFlags, Nbnxm::AtomLocality::Local, copyBackNbForce);
+                                  stepWork, Nbnxm::AtomLocality::Local, copyBackNbForce);
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
-        if (forceWork.haveGpuBondedWork && forceFlags.computeEnergy)
+        if (domainWork.haveGpuBondedWork && stepWork.computeEnergy)
         {
             fr->gpuBonded->launchEnergyTransfer();
         }
         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
     }
 
-    if (forceFlags.stateChanged && inputrecNeedMutot(inputrec))
+    if (stepWork.stateChanged && inputrecNeedMutot(inputrec))
     {
         if (PAR(cr))
         {
@@ -1355,7 +1352,7 @@ void do_force(FILE                                     *fplog,
     if (inputrec->bRot)
     {
         wallcycle_start(wcycle, ewcROT);
-        do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, forceFlags.doNeighborSearch);
+        do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, stepWork.doNeighborSearch);
         wallcycle_stop(wcycle, ewcROT);
     }
 
@@ -1366,7 +1363,7 @@ void do_force(FILE                                     *fplog,
 
     // Set up and clear force outputs.
     // We use std::move to keep the compiler happy; it has no effect.
-    ForceOutputs forceOut = setupForceOutputs(fr, pull_work, *inputrec, std::move(force), forceFlags, wcycle);
+    ForceOutputs forceOut = setupForceOutputs(fr, pull_work, *inputrec, std::move(force), stepWork, wcycle);
 
     /* We calculate the non-bonded forces, when done on the CPU, here.
      * We do this before calling do_force_lowlevel, because in that
@@ -1378,7 +1375,7 @@ void do_force(FILE                                     *fplog,
 
     if (!bUseOrEmulGPU)
     {
-        do_nb_verlet(fr, ic, enerd, forceFlags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
+        do_nb_verlet(fr, ic, enerd, stepWork, Nbnxm::InteractionLocality::Local, enbvClearFYes,
                      step, nrnb, wcycle);
     }
 
@@ -1390,14 +1387,14 @@ void do_force(FILE                                     *fplog,
         nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::Local,
                                       fr, as_rvec_array(x.unpaddedArrayRef().data()), &forceOut.forceWithShiftForces(), *mdatoms,
                                       inputrec->fepvals, lambda.data(),
-                                      enerd, forceFlags, nrnb);
+                                      enerd, stepWork, nrnb);
 
         if (havePPDomainDecomposition(cr))
         {
             nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::NonLocal,
                                           fr, as_rvec_array(x.unpaddedArrayRef().data()), &forceOut.forceWithShiftForces(), *mdatoms,
                                           inputrec->fepvals, lambda.data(),
-                                          enerd, forceFlags, nrnb);
+                                          enerd, stepWork, nrnb);
         }
     }
 
@@ -1405,11 +1402,11 @@ void do_force(FILE                                     *fplog,
     {
         if (havePPDomainDecomposition(cr))
         {
-            do_nb_verlet(fr, ic, enerd, forceFlags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
+            do_nb_verlet(fr, ic, enerd, stepWork, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
                          step, nrnb, wcycle);
         }
 
-        if (forceFlags.computeForces)
+        if (stepWork.computeForces)
         {
             /* Add all the non-bonded forces to the normal force array.
              * This can be split into a local and a non-local part when overlapping
@@ -1421,7 +1418,7 @@ void do_force(FILE                                     *fplog,
         }
 
         /* If there are multiple fshift output buffers we need to reduce them */
-        if (forceFlags.computeVirial)
+        if (stepWork.computeVirial)
         {
             /* This is not in a subcounter because it takes a
                negligible and constant-sized amount of time */
@@ -1438,7 +1435,7 @@ void do_force(FILE                                     *fplog,
 
     // TODO Force flags should include haveFreeEnergyWork for this domain
     if (ddUsesGpuDirectCommunication &&
-        (forceWork.haveCpuBondedWork || (fr->efep != efepNO)))
+        (domainWork.haveCpuBondedWork || (fr->efep != efepNO)))
     {
         /* Wait for non-local coordinate data to be copied from device */
         nbv->wait_nonlocal_x_copy_D2H_done();
@@ -1448,7 +1445,7 @@ void do_force(FILE                                     *fplog,
                       cr, ms, nrnb, wcycle, mdatoms,
                       x, hist, &forceOut, enerd, fcd,
                       box, lambda.data(), graph, fr->mu_tot,
-                      forceFlags,
+                      stepWork,
                       ddBalanceRegionHandler);
 
     wallcycle_stop(wcycle, ewcFORCE);
@@ -1456,8 +1453,9 @@ void do_force(FILE                                     *fplog,
     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
                          imdSession, pull_work, step, t, wcycle,
                          fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda.data(),
-                         forceFlags, &forceOut.forceWithVirial(), enerd,
-                         ed, forceFlags.doNeighborSearch);
+                         stepWork, &forceOut.forceWithVirial(), enerd,
+                         ed, stepWork.doNeighborSearch);
+
 
     // Will store the number of cycles spent waiting for the GPU that
     // will later be used in the DLB accounting.
@@ -1472,7 +1470,7 @@ void do_force(FILE                                     *fplog,
             if (bUseGPU)
             {
                 cycles_wait_gpu += Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
-                                                               forceFlags, Nbnxm::AtomLocality::NonLocal,
+                                                               stepWork, Nbnxm::AtomLocality::NonLocal,
                                                                enerd->grpp.ener[egLJSR].data(),
                                                                enerd->grpp.ener[egCOULSR].data(),
                                                                forceWithShiftForces.shiftForces(),
@@ -1481,7 +1479,7 @@ void do_force(FILE                                     *fplog,
             else
             {
                 wallcycle_start_nocount(wcycle, ewcFORCE);
-                do_nb_verlet(fr, ic, enerd, forceFlags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
+                do_nb_verlet(fr, ic, enerd, stepWork, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
                              step, nrnb, wcycle);
                 wallcycle_stop(wcycle, ewcFORCE);
             }
@@ -1491,7 +1489,7 @@ void do_force(FILE                                     *fplog,
                 // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
                 // The bonded and free energy CPU tasks can have non-local force contributions
                 // which are a dependency for the GPU force reduction.
-                bool  haveNonLocalForceContribInCpuBuffer = forceWork.haveCpuBondedWork || (fr->efep != efepNO);
+                bool  haveNonLocalForceContribInCpuBuffer = domainWork.haveCpuBondedWork || (fr->efep != efepNO);
 
                 rvec *f = as_rvec_array(forceWithShiftForces.force().data());
                 if (haveNonLocalForceContribInCpuBuffer)
@@ -1512,7 +1510,7 @@ void do_force(FILE                                     *fplog,
             }
 
 
-            if (fr->nbv->emulateGpu() && forceFlags.computeVirial)
+            if (fr->nbv->emulateGpu() && stepWork.computeVirial)
             {
                 nbnxn_atomdata_add_nbat_fshift_to_fshift(*nbv->nbat,
                                                          forceWithShiftForces.shiftForces());
@@ -1523,7 +1521,7 @@ void do_force(FILE                                     *fplog,
     const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True);
     const bool useCpuPmeFReduction      = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
     // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
-    const bool haveCpuLocalForces     = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork || useCpuPmeFReduction ||
+    const bool haveCpuLocalForces     = (domainWork.haveSpecialForces || domainWork.haveCpuListedForceWork || useCpuPmeFReduction ||
                                          (fr->efep != efepNO));
 
     if (havePPDomainDecomposition(cr))
@@ -1535,7 +1533,7 @@ void do_force(FILE                                     *fplog,
          */
         ddBalanceRegionHandler.closeAfterForceComputationCpu();
 
-        if (forceFlags.computeForces)
+        if (stepWork.computeForces)
         {
             gmx::ArrayRef<gmx::RVec>  force  = forceOut.forceWithShiftForces().force();
             rvec                     *f      = as_rvec_array(force.data());
@@ -1568,7 +1566,7 @@ void do_force(FILE                                     *fplog,
     if (alternateGpuWait)
     {
         alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd,
-                                    forceFlags, pmeFlags, wcycle);
+                                    stepWork, pmeFlags, wcycle);
     }
 
     if (!alternateGpuWait && useGpuPme)
@@ -1587,7 +1585,7 @@ void do_force(FILE                                     *fplog,
         const float gpuWaitApiOverheadMargin = 2e6F; /* cycles */
         const float waitCycles               =
             Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
-                                        forceFlags, Nbnxm::AtomLocality::Local,
+                                        stepWork, Nbnxm::AtomLocality::Local,
                                         enerd->grpp.ener[egLJSR].data(),
                                         enerd->grpp.ener[egCOULSR].data(),
                                         forceOut.forceWithShiftForces().shiftForces(),
@@ -1596,7 +1594,7 @@ void do_force(FILE                                     *fplog,
         if (ddBalanceRegionHandler.useBalancingRegion())
         {
             DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
-            if (forceFlags.computeForces &&  waitCycles <= gpuWaitApiOverheadMargin)
+            if (stepWork.computeForces &&  waitCycles <= gpuWaitApiOverheadMargin)
             {
                 /* We measured few cycles; it could be that the kernel
                  * and transfer finished earlier and there was no actual
@@ -1615,7 +1613,7 @@ void do_force(FILE                                     *fplog,
         // NOTE: emulation kernel is not included in the balancing region,
         // but emulation mode does not target performance anyway
         wallcycle_start_nocount(wcycle, ewcFORCE);
-        do_nb_verlet(fr, ic, enerd, forceFlags, Nbnxm::InteractionLocality::Local,
+        do_nb_verlet(fr, ic, enerd, stepWork, Nbnxm::InteractionLocality::Local,
                      DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
                      step, nrnb, wcycle);
         wallcycle_stop(wcycle, ewcFORCE);
@@ -1672,7 +1670,7 @@ void do_force(FILE                                     *fplog,
     }
 
     launchGpuEndOfStepTasks(nbv, fr->gpuBonded, fr->pmedata, enerd,
-                            *mdScheduleWork,
+                            *runScheduleWork,
                             bUseGPU, useGpuPme,
                             step,
                             wcycle);
@@ -1682,21 +1680,21 @@ void do_force(FILE                                     *fplog,
         dd_force_flop_stop(cr->dd, nrnb);
     }
 
-    if (forceFlags.computeForces)
+    if (stepWork.computeForces)
     {
         rvec *f = as_rvec_array(forceOut.forceWithShiftForces().force().data());
 
         /* If we have NoVirSum forces, but we do not calculate the virial,
          * we sum fr->f_novirsum=forceOut.f later.
          */
-        if (vsite && !(fr->haveDirectVirialContributions && !forceFlags.computeVirial))
+        if (vsite && !(fr->haveDirectVirialContributions && !stepWork.computeVirial))
         {
             rvec *fshift = as_rvec_array(forceOut.forceWithShiftForces().shiftForces().data());
             spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fshift, FALSE, nullptr, nrnb,
                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
         }
 
-        if (forceFlags.computeVirial)
+        if (stepWork.computeVirial)
         {
             /* Calculation of the virial must be done after vsites! */
             calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()),
@@ -1713,15 +1711,15 @@ void do_force(FILE                                     *fplog,
         pme_receive_force_ener(cr, &forceOut.forceWithVirial(), enerd, wcycle);
     }
 
-    if (forceFlags.computeForces)
+    if (stepWork.computeForces)
     {
         post_process_forces(cr, step, nrnb, wcycle,
                             top, box, as_rvec_array(x.unpaddedArrayRef().data()), &forceOut,
                             vir_force, mdatoms, graph, fr, vsite,
-                            forceFlags);
+                            stepWork);
     }
 
-    if (forceFlags.computeEnergy)
+    if (stepWork.computeEnergy)
     {
         /* Sum the potential energy terms from group contributions */
         sum_epot(&(enerd->grpp), enerd->term);
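Taken together, the scheduling pattern this commit establishes is: translate the legacy bitmask into stepWork on every step, refresh domainWork only when the domain changes, and let consumers combine the two granularities. A self-contained sketch of that control flow, using stand-in types and hypothetical bit values rather than the real GROMACS definitions:

// Stand-ins so the sketch compiles on its own; the real types live in
// gromacs/mdtypes/simulation_workload.h and the real bits in the legacy
// force-flags header.
namespace sketch
{
constexpr int GMX_FORCE_NS     = 1 << 0;  // hypothetical bit values
constexpr int GMX_FORCE_ENERGY = 1 << 1;

struct StepWorkload
{
    bool doNeighborSearch = false;
    bool computeEnergy    = false;
};
struct DomainLifetimeWorkload
{
    bool haveGpuBondedWork = false;
};
struct MdrunScheduleWorkload
{
    StepWorkload           stepWork;
    DomainLifetimeWorkload domainWork;
};

// Per-step translation, mirroring setupStepWorkload() above.
void setupStepWorkload(StepWorkload *flags, const int legacyFlags)
{
    flags->doNeighborSearch = ((legacyFlags & GMX_FORCE_NS) != 0);
    flags->computeEnergy    = ((legacyFlags & GMX_FORCE_ENERGY) != 0);
}

// Domain-lifetime flags, rebuilt only at search steps, as in do_force().
void setupDomainLifetimeWorkload(DomainLifetimeWorkload *domainWork, const bool haveGpuBonded)
{
    domainWork->haveGpuBondedWork = haveGpuBonded;
}

void doForceStep(MdrunScheduleWorkload *runScheduleWork, const int legacyFlags, const bool haveGpuBonded)
{
    // Step-scope flags are cheap, so they are recomputed unconditionally.
    setupStepWorkload(&runScheduleWork->stepWork, legacyFlags);

    // Domain-scope flags stay valid until the next repartitioning, so they
    // are rebuilt on neighbor-search steps only.
    if (runScheduleWork->stepWork.doNeighborSearch)
    {
        setupDomainLifetimeWorkload(&runScheduleWork->domainWork, haveGpuBonded);
    }

    // Consumers combine both granularities, as launchGpuEndOfStepTasks() does:
    if (runScheduleWork->domainWork.haveGpuBondedWork && runScheduleWork->stepWork.computeEnergy)
    {
        // ... accumulate GPU bonded energies ...
    }
}
}  // namespace sketch

Splitting the flags by lifetime avoids recomputing topology-dependent booleans such as haveGpuBondedWork on every step, while per-step properties such as the virial and energy requests remain free to change between steps.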