Pipeline GPU PME Spline/Spread with PP Comms

[alexxy/gromacs.git] / src / gromacs / mdlib / sim_util.cpp
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index 342c08bb625d865c9eb287baee7ec84e4279ae4c..a2ccaed47b15b75440558e9d7ad4f433f519a765 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -54,6 +54,7 @@
  #include "gromacs/domdec/partition.h"
  #include "gromacs/essentialdynamics/edsam.h"
  #include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
  #include "gromacs/ewald/pme_pp.h"
  #include "gromacs/ewald/pme_pp_comm_gpu.h"
  #include "gromacs/gmxlib/network.h"
@@ -63,7 +64,7 @@
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/imd/imd.h"
  #include "gromacs/listed_forces/disre.h"
-#include "gromacs/listed_forces/gpubonded.h"
+#include "gromacs/listed_forces/listed_forces_gpu.h"
  #include "gromacs/listed_forces/listed_forces.h"
  #include "gromacs/listed_forces/orires.h"
  #include "gromacs/math/arrayrefwithpadding.h"
@@ -207,7 +208,7 @@ static void pull_potential_wrapper(const t_commrec*               cr,
      enerd->term[F_COM_PULL] +=
              pull_potential(pull_work,
                             gmx::arrayRefFromArray(mdatoms->massT, mdatoms->nr),
-                           &pbc,
+                           pbc,
                             cr,
                             t,
                             lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Restraint)],
@@ -746,7 +747,10 @@ static inline void launchPmeGpuSpread(gmx_pme_t*            pmedata,
                                        gmx_wallcycle*        wcycle)
  {
      pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
-    pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle, lambdaQ);
+    bool                           useGpuDirectComm         = false;
+    gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr;
+    pme_gpu_launch_spread(
+            pmedata, xReadyOnDevice, wcycle, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu);
  }
  
  /*! \brief Launch the FFT and gather stages of PME GPU
@@ -925,7 +929,8 @@ static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec&
              domainWork.haveCpuBondedWork = true;
          }
      }
-    domainWork.haveGpuBondedWork = ((fr.gpuBonded != nullptr) && fr.gpuBonded->haveInteractions());
+    domainWork.haveGpuBondedWork =
+            ((fr.listedForcesGpu != nullptr) && fr.listedForcesGpu->haveInteractions());
      // Note that haveFreeEnergyWork is constant over the whole run
      domainWork.haveFreeEnergyWork =
              (fr.efep != FreeEnergyPerturbationType::No && mdatoms.nPerturbed != 0);
@@ -935,6 +940,10 @@ static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec&
              domainWork.haveSpecialForces || domainWork.haveCpuListedForceWork
              || domainWork.haveFreeEnergyWork || simulationWork.useCpuNonbonded || simulationWork.useCpuPme
              || simulationWork.haveEwaldSurfaceContribution || inputrec.nwall > 0;
+    domainWork.haveLocalForceContribInCpuBuffer =
+            domainWork.haveCpuLocalForceWork || simulationWork.havePpDomainDecomposition;
+    domainWork.haveNonLocalForceContribInCpuBuffer =
+            domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork;
  
      return domainWork;
  }
@@ -945,28 +954,27 @@ static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec&
   * \param[in]      mtsLevels            The multiple time-stepping levels, either empty or 2 levels
   * \param[in]      step                 The current MD step
   * \param[in]      simulationWork       Simulation workload description.
- * \param[in]      rankHasPmeDuty       If this rank computes PME.
   *
   * \returns New Stepworkload description.
   */
  static StepWorkload setupStepWorkload(const int                     legacyFlags,
                                        ArrayRef<const gmx::MtsLevel> mtsLevels,
                                        const int64_t                 step,
-                                      const SimulationWorkload&     simulationWork,
-                                      const bool                    rankHasPmeDuty)
+                                      const SimulationWorkload&     simulationWork)
  {
      GMX_ASSERT(mtsLevels.empty() || mtsLevels.size() == 2, "Expect 0 or 2 MTS levels");
      const bool computeSlowForces = (mtsLevels.empty() || step % mtsLevels[1].stepFactor == 0);
  
      StepWorkload flags;
-    flags.stateChanged        = ((legacyFlags & GMX_FORCE_STATECHANGED) != 0);
-    flags.haveDynamicBox      = ((legacyFlags & GMX_FORCE_DYNAMICBOX) != 0);
-    flags.doNeighborSearch    = ((legacyFlags & GMX_FORCE_NS) != 0);
-    flags.computeSlowForces   = computeSlowForces;
-    flags.computeVirial       = ((legacyFlags & GMX_FORCE_VIRIAL) != 0);
-    flags.computeEnergy       = ((legacyFlags & GMX_FORCE_ENERGY) != 0);
-    flags.computeForces       = ((legacyFlags & GMX_FORCE_FORCES) != 0);
-    flags.computeListedForces = ((legacyFlags & GMX_FORCE_LISTED) != 0);
+    flags.stateChanged                  = ((legacyFlags & GMX_FORCE_STATECHANGED) != 0);
+    flags.haveDynamicBox                = ((legacyFlags & GMX_FORCE_DYNAMICBOX) != 0);
+    flags.doNeighborSearch              = ((legacyFlags & GMX_FORCE_NS) != 0);
+    flags.computeSlowForces             = computeSlowForces;
+    flags.computeVirial                 = ((legacyFlags & GMX_FORCE_VIRIAL) != 0);
+    flags.computeEnergy                 = ((legacyFlags & GMX_FORCE_ENERGY) != 0);
+    flags.computeForces                 = ((legacyFlags & GMX_FORCE_FORCES) != 0);
+    flags.useOnlyMtsCombinedForceBuffer = ((legacyFlags & GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE) != 0);
+    flags.computeListedForces           = ((legacyFlags & GMX_FORCE_LISTED) != 0);
      flags.computeNonbondedForces =
              ((legacyFlags & GMX_FORCE_NONBONDED) != 0) && simulationWork.computeNonbonded
              && !(simulationWork.computeNonbondedAtMtsLevel1 && !computeSlowForces);
@@ -979,11 +987,17 @@ static StepWorkload setupStepWorkload(const int                     legacyFlags,
      }
      flags.useGpuXBufferOps = simulationWork.useGpuBufferOps;
      // on virial steps the CPU reduction path is taken
-    flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !flags.computeVirial;
-    flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps && simulationWork.useGpuPme
-                                && (rankHasPmeDuty || simulationWork.useGpuPmePpCommunication);
-    flags.useGpuXHalo = simulationWork.useGpuHaloExchange;
-    flags.useGpuFHalo = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
+    flags.useGpuFBufferOps       = simulationWork.useGpuBufferOps && !flags.computeVirial;
+    const bool rankHasGpuPmeTask = simulationWork.useGpuPme && !simulationWork.haveSeparatePmeRank;
+    flags.useGpuPmeFReduction    = flags.computeSlowForces && flags.useGpuFBufferOps
+                                && (rankHasGpuPmeTask || simulationWork.useGpuPmePpCommunication);
+    flags.useGpuXHalo          = simulationWork.useGpuHaloExchange && !flags.doNeighborSearch;
+    flags.useGpuFHalo          = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
+    flags.haveGpuPmeOnThisRank = rankHasGpuPmeTask && flags.computeSlowForces;
+    flags.combineMtsForcesBeforeHaloExchange =
+            (flags.computeForces && simulationWork.useMts && flags.computeSlowForces
+             && flags.useOnlyMtsCombinedForceBuffer
+             && !(flags.computeVirial || simulationWork.useGpuNonbonded || flags.haveGpuPmeOnThisRank));
  
      return flags;
  }
@@ -991,15 +1005,12 @@ static StepWorkload setupStepWorkload(const int                     legacyFlags,
  
  /*! \brief Launch end-of-step GPU tasks: buffer clearing and rolling pruning.
   *
- * TODO: eliminate \p useGpuPmeOnThisRank when this is
- * incorporated in DomainLifetimeWorkload.
   */
  static void launchGpuEndOfStepTasks(nonbonded_verlet_t*               nbv,
-                                    gmx::GpuBonded*                   gpuBonded,
+                                    gmx::ListedForcesGpu*             listedForcesGpu,
                                      gmx_pme_t*                        pmedata,
                                      gmx_enerdata_t*                   enerd,
                                      const gmx::MdrunScheduleWorkload& runScheduleWork,
-                                    bool                              useGpuPmeOnThisRank,
                                      int64_t                           step,
                                      gmx_wallcycle*                    wcycle)
  {
@@ -1022,7 +1033,7 @@ static void launchGpuEndOfStepTasks(nonbonded_verlet_t*               nbv,
          wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
      }
  
-    if (useGpuPmeOnThisRank)
+    if (runScheduleWork.stepWork.haveGpuPmeOnThisRank)
      {
          pme_gpu_reinit_computation(pmedata, wcycle);
      }
@@ -1032,9 +1043,9 @@ static void launchGpuEndOfStepTasks(nonbonded_verlet_t*               nbv,
          // in principle this should be included in the DD balancing region,
          // but generally it is infrequent so we'll omit it for the sake of
          // simpler code
-        gpuBonded->waitAccumulateEnergyTerms(enerd);
+        listedForcesGpu->waitAccumulateEnergyTerms(enerd);
  
-        gpuBonded->clearEnergies();
+        listedForcesGpu->clearEnergies();
      }
  }
  
@@ -1084,7 +1095,7 @@ static void reduceAndUpdateMuTot(DipoleData*                   dipoleData,
      }
  }
  
-/*! \brief Combines MTS level0 and level1 force buffes into a full and MTS-combined force buffer.
+/*! \brief Combines MTS level0 and level1 force buffers into a full and MTS-combined force buffer.
   *
   * \param[in]     numAtoms        The number of atoms to combine forces for
   * \param[in,out] forceMtsLevel0  Input: F_level0, output: F_level0 + F_level1
@@ -1122,47 +1133,58 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
      gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
  
      // (re-)initialize local GPU force reduction
-    const bool accumulate =
-            runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr);
+    const bool accumulate = runScheduleWork->domainWork.haveCpuLocalForceWork
+                            || runScheduleWork->simulationWork.havePpDomainDecomposition;
      const int atomStart = 0;
-    fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(stateGpu->getForces(),
-                                                            nbv->getNumAtoms(AtomLocality::Local),
-                                                            nbv->getGridIndices(),
-                                                            atomStart,
-                                                            accumulate,
-                                                            stateGpu->fReducedOnDevice());
+    fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(
+            stateGpu->getForces(),
+            nbv->getNumAtoms(AtomLocality::Local),
+            nbv->getGridIndices(),
+            atomStart,
+            accumulate,
+            stateGpu->fReducedOnDevice(AtomLocality::Local));
  
      // register forces and add dependencies
-    fr->gpuForceReduction[gmx::AtomLocality::Local]->registerNbnxmForce(nbv->getGpuForces());
+    fr->gpuForceReduction[gmx::AtomLocality::Local]->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv));
  
      if (runScheduleWork->simulationWork.useGpuPme
-        && (thisRankHasDuty(cr, DUTY_PME) || runScheduleWork->simulationWork.useGpuPmePpCommunication))
+        && (!runScheduleWork->simulationWork.haveSeparatePmeRank
+            || runScheduleWork->simulationWork.useGpuPmePpCommunication))
      {
          DeviceBuffer<gmx::RVec> forcePtr =
-                thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata)
-                                              :                    // PME force buffer on same GPU
-                        fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU
+                runScheduleWork->simulationWork.haveSeparatePmeRank
+                        ? fr->pmePpCommGpu->getGpuForceStagingPtr() // buffer received from other GPU
+                        : pme_gpu_get_device_f(fr->pmedata);        // PME force buffer on same GPU
          fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr);
  
-        GpuEventSynchronizer* const pmeSynchronizer =
-                (thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_f_ready_synchronizer(fr->pmedata)
-                                               : // PME force buffer on same GPU
-                         fr->pmePpCommGpu->getForcesReadySynchronizer()); // buffer received from other GPU
-
-        if (GMX_THREAD_MPI)
+        if (runScheduleWork->simulationWork.haveSeparatePmeRank)
+        {
+            // PME force buffer on remote GPU -
+            // event synchronizer received from other GPU only in case of thread-mpi
+            if (GMX_THREAD_MPI)
+            {
+                GpuEventSynchronizer* const pmeSynchronizer =
+                        fr->pmePpCommGpu->getForcesReadySynchronizer();
+                GMX_ASSERT(pmeSynchronizer != nullptr,
+                           "PME force ready cuda event should not be NULL");
+                fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(pmeSynchronizer);
+            }
+        }
+        else
          {
+            // PME force buffer on same GPU - add dependency on PME force computation
+            GpuEventSynchronizer* const pmeSynchronizer = pme_gpu_get_f_ready_synchronizer(fr->pmedata);
              GMX_ASSERT(pmeSynchronizer != nullptr, "PME force ready cuda event should not be NULL");
              fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(pmeSynchronizer);
          }
      }
  
-    if ((runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr))
-        && !runScheduleWork->simulationWork.useGpuHaloExchange)
+    if (runScheduleWork->domainWork.haveCpuLocalForceWork
+        || (runScheduleWork->simulationWork.havePpDomainDecomposition
+            && !runScheduleWork->simulationWork.useGpuHaloExchange))
      {
-        auto forcesReadyLocality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All;
-        const bool useGpuForceBufferOps = true;
          fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
-                stateGpu->getForcesReadyOnDeviceEvent(forcesReadyLocality, useGpuForceBufferOps));
+                stateGpu->fReadyOnDevice(AtomLocality::Local));
      }
  
      if (runScheduleWork->simulationWork.useGpuHaloExchange)
@@ -1171,29 +1193,43 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
                  cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
      }
  
-    if (havePPDomainDecomposition(cr))
+    if (runScheduleWork->simulationWork.havePpDomainDecomposition)
      {
          // (re-)initialize non-local GPU force reduction
          const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
                                  || runScheduleWork->domainWork.haveFreeEnergyWork;
          const int atomStart = dd_numHomeAtoms(*cr->dd);
-        fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit(stateGpu->getForces(),
-                                                                   nbv->getNumAtoms(AtomLocality::NonLocal),
-                                                                   nbv->getGridIndices(),
-                                                                   atomStart,
-                                                                   accumulate);
+        fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit(
+                stateGpu->getForces(),
+                nbv->getNumAtoms(AtomLocality::NonLocal),
+                nbv->getGridIndices(),
+                atomStart,
+                accumulate,
+                stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
  
          // register forces and add dependencies
-        fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->registerNbnxmForce(nbv->getGpuForces());
-        if (runScheduleWork->domainWork.haveCpuBondedWork || runScheduleWork->domainWork.haveFreeEnergyWork)
+        fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->registerNbnxmForce(
+                Nbnxm::gpu_get_f(nbv->gpu_nbv));
+
+        if (runScheduleWork->domainWork.haveNonLocalForceContribInCpuBuffer)
          {
              fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->addDependency(
-                    stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::NonLocal, true));
+                    stateGpu->fReadyOnDevice(AtomLocality::NonLocal));
          }
      }
  }
  
  
+/*! \brief Return the number of local atoms.
+ *
+ * \param[in] dd                         Domain decomposition data; must not be null when
+ *                                       \p havePPDomainDecomposition is true (asserted below).
+ * \param[in] mdatoms                    Atom data; its \c homenr is used when there is no
+ *                                       PP domain decomposition.
+ * \param[in] havePPDomainDecomposition  Selects which of the two atom counts is returned.
+ *
+ * \returns \c dd_numAtomsZones(*dd) when \p havePPDomainDecomposition is true,
+ *          otherwise \c mdatoms.homenr.
+ */
+static int getLocalAtomCount(const gmx_domdec_t* dd, const t_mdatoms& mdatoms, bool havePPDomainDecomposition)
+{
+    GMX_ASSERT(!(havePPDomainDecomposition && (dd == nullptr)),
+               "Can't have PP decomposition with dd uninitialized!");
+    return havePPDomainDecomposition ? dd_numAtomsZones(*dd) : mdatoms.homenr;
+}
+
+
  void do_force(FILE*                               fplog,
                const t_commrec*                    cr,
                const gmx_multisim_t*               ms,
@@ -1220,6 +1256,7 @@ void do_force(FILE*                               fplog,
                rvec                                muTotal,
                double                              t,
                gmx_edsam*                          ed,
+              CpuPpLongRangeNonbondeds*           longRangeNonbondeds,
                int                                 legacyFlags,
                const DDBalanceRegionHandler&       ddBalanceRegionHandler)
  {
@@ -1235,12 +1272,19 @@ void do_force(FILE*                               fplog,
  
      const SimulationWorkload& simulationWork = runScheduleWork->simulationWork;
  
-    runScheduleWork->stepWork = setupStepWorkload(
-            legacyFlags, inputrec.mtsLevels, step, simulationWork, thisRankHasDuty(cr, DUTY_PME));
+    runScheduleWork->stepWork = setupStepWorkload(legacyFlags, inputrec.mtsLevels, step, simulationWork);
      const StepWorkload& stepWork = runScheduleWork->stepWork;
  
-    const bool useGpuPmeOnThisRank =
-            simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces;
+    if (stepWork.useGpuFHalo && !runScheduleWork->domainWork.haveCpuLocalForceWork)
+    {
+        // GPU Force halo exchange will set a subset of local atoms with remote non-local data
+        // First clear local portion of force array, so that untouched atoms are zero.
+        // The dependency for this is that forces from previous timestep have been consumed,
+        // which is satisfied when getCoordinatesReadyOnDeviceEvent has been marked.
+        stateGpu->clearForcesOnGpu(AtomLocality::Local,
+                                   stateGpu->getCoordinatesReadyOnDeviceEvent(
+                                           AtomLocality::Local, simulationWork, stepWork));
+    }
  
      /* At a search step we need to start the first balancing region
       * somewhere early inside the step after communication during domain
@@ -1264,7 +1308,7 @@ void do_force(FILE*                               fplog,
          }
  
          const bool fillGrid = (stepWork.doNeighborSearch && stepWork.stateChanged);
-        const bool calcCGCM = (fillGrid && !DOMAINDECOMP(cr));
+        const bool calcCGCM = (fillGrid && !haveDDAtomOrdering(*cr));
          if (calcCGCM)
          {
              put_atoms_in_box_omp(fr->pbcType,
@@ -1278,34 +1322,30 @@ void do_force(FILE*                               fplog,
      nbnxn_atomdata_copy_shiftvec(stepWork.haveDynamicBox, fr->shift_vec, nbv->nbat.get());
  
      const bool pmeSendCoordinatesFromGpu =
-            GMX_MPI && simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
+            simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
      const bool reinitGpuPmePpComms =
-            GMX_MPI && simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
+            simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
  
-    auto* localXReadyOnDevice = (useGpuPmeOnThisRank || simulationWork.useGpuBufferOps)
+    auto* localXReadyOnDevice = (stepWork.haveGpuPmeOnThisRank || simulationWork.useGpuBufferOps)
                                          ? stateGpu->getCoordinatesReadyOnDeviceEvent(
-                                                  AtomLocality::Local, simulationWork, stepWork)
+                                                AtomLocality::Local, simulationWork, stepWork)
                                          : nullptr;
  
+    GMX_ASSERT(simulationWork.useGpuHaloExchange
+                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
+               "The GPU halo exchange is active, but it has not been constructed.");
+
+    bool gmx_used_in_debug haveCopiedXFromGpu = false;
      // Copy coordinate from the GPU if update is on the GPU and there
      // are forces to be computed on the CPU, or for the computation of
      // virial, or if host-side data will be transferred from this task
      // to a remote task for halo exchange or PME-PP communication. At
      // search steps the current coordinates are already on the host,
      // hence copy is not needed.
-    const bool haveHostPmePpComms =
-            !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
-
-    GMX_ASSERT(simulationWork.useGpuHaloExchange
-                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
-               "The GPU halo exchange is active, but it has not been constructed.");
-    const bool haveHostHaloExchangeComms =
-            havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange;
-
-    bool gmx_used_in_debug haveCopiedXFromGpu = false;
      if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
          && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
-            || haveHostPmePpComms || haveHostHaloExchangeComms || simulationWork.computeMuTot))
+            || simulationWork.useCpuPmePpCommunication || simulationWork.useCpuHaloExchange
+            || simulationWork.computeMuTot))
      {
          stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
          haveCopiedXFromGpu = true;
@@ -1315,14 +1355,14 @@ void do_force(FILE*                               fplog,
      // The local coordinates can be copied right away.
      // NOTE: Consider moving this copy to right after they are updated and constrained,
      //       if the latter is not offloaded.
-    if (useGpuPmeOnThisRank || stepWork.useGpuXBufferOps)
+    if (stepWork.haveGpuPmeOnThisRank || stepWork.useGpuXBufferOps)
      {
          if (stepWork.doNeighborSearch)
          {
              // TODO refactor this to do_md, after partitioning.
              stateGpu->reinit(mdatoms->homenr,
-                             cr->dd != nullptr ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr);
-            if (useGpuPmeOnThisRank)
+                             getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition));
+            if (stepWork.haveGpuPmeOnThisRank)
              {
                  // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
                  pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
@@ -1338,7 +1378,7 @@ void do_force(FILE*                               fplog,
          }
      }
  
-    if (GMX_MPI && !thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces)
+    if (simulationWork.haveSeparatePmeRank && stepWork.computeSlowForces)
      {
          /* Send particle coordinates to the pme nodes */
          if (!pmeSendCoordinatesFromGpu && !stepWork.doNeighborSearch && simulationWork.useGpuUpdate)
@@ -1351,7 +1391,7 @@ void do_force(FILE*                               fplog,
          gmx_pme_send_coordinates(fr,
                                   cr,
                                   box,
-                                 as_rvec_array(x.unpaddedArrayRef().data()),
+                                 x.unpaddedArrayRef(),
                                   lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)],
                                   lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)],
                                   (stepWork.computeVirial || stepWork.computeEnergy),
@@ -1359,11 +1399,12 @@ void do_force(FILE*                               fplog,
                                   simulationWork.useGpuPmePpCommunication,
                                   reinitGpuPmePpComms,
                                   pmeSendCoordinatesFromGpu,
+                                 stepWork.useGpuPmeFReduction,
                                   localXReadyOnDevice,
                                   wcycle);
      }
  
-    if (useGpuPmeOnThisRank)
+    if (stepWork.haveGpuPmeOnThisRank)
      {
          launchPmeGpuSpread(fr->pmedata,
                             box,
@@ -1384,7 +1425,7 @@ void do_force(FILE*                               fplog,
          }
  
          wallcycle_start(wcycle, WallCycleCounter::NS);
-        if (!DOMAINDECOMP(cr))
+        if (!haveDDAtomOrdering(*cr))
          {
              const rvec vzero       = { 0.0_real, 0.0_real, 0.0_real };
              const rvec boxDiagonal = { box[XX][XX], box[YY][YY], box[ZZ][ZZ] };
@@ -1397,7 +1438,7 @@ void do_force(FILE*                               fplog,
                                nullptr,
                                { 0, mdatoms->homenr },
                                -1,
-                              fr->cginfo,
+                              fr->atomInfo,
                                x.unpaddedArrayRef(),
                                0,
                                nullptr);
@@ -1406,27 +1447,27 @@ void do_force(FILE*                               fplog,
          else
          {
              wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSGridNonLocal);
-            nbnxn_put_on_grid_nonlocal(nbv, domdec_zones(cr->dd), fr->cginfo, x.unpaddedArrayRef());
+            nbnxn_put_on_grid_nonlocal(nbv, domdec_zones(cr->dd), fr->atomInfo, x.unpaddedArrayRef());
              wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSGridNonLocal);
          }
  
          nbv->setAtomProperties(gmx::constArrayRefFromArray(mdatoms->typeA, mdatoms->nr),
                                 gmx::constArrayRefFromArray(mdatoms->chargeA, mdatoms->nr),
-                               fr->cginfo);
+                               fr->atomInfo);
  
          wallcycle_stop(wcycle, WallCycleCounter::NS);
  
          /* initialize the GPU nbnxm atom data and bonded data structures */
          if (simulationWork.useGpuNonbonded)
          {
-            // Note: cycle counting only nononbondeds, gpuBonded counts internally
+            // Note: cycle counting only nonbondeds, GPU listed forces counts internally
              wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
              wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
              Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat.get());
              wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
              wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
  
-            if (fr->gpuBonded)
+            if (fr->listedForcesGpu)
              {
                  /* Now we put all atoms on the grid, we can assign bonded
                   * interactions to the GPU, where the grid order is
@@ -1436,11 +1477,12 @@ void do_force(FILE*                               fplog,
                  // TODO the xq, f, and fshift buffers are now shared
                  // resources, so they should be maintained by a
                  // higher-level object than the nb module.
-                fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbv->getGridIndices(),
-                                                                      top->idef,
-                                                                      Nbnxm::gpu_get_xq(nbv->gpu_nbv),
-                                                                      Nbnxm::gpu_get_f(nbv->gpu_nbv),
-                                                                      Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
+                fr->listedForcesGpu->updateInteractionListsAndDeviceBuffers(
+                        nbv->getGridIndices(),
+                        top->idef,
+                        Nbnxm::gpu_get_xq(nbv->gpu_nbv),
+                        Nbnxm::gpu_get_f(nbv->gpu_nbv),
+                        Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
              }
          }
  
@@ -1454,7 +1496,7 @@ void do_force(FILE*                               fplog,
          /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
          nbv->constructPairlist(InteractionLocality::Local, top->excls, step, nrnb);
  
-        nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::Local);
+        nbv->setupGpuShortRangeWork(fr->listedForcesGpu.get(), InteractionLocality::Local);
  
          wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchLocal);
          wallcycle_stop(wcycle, WallCycleCounter::NS);
@@ -1506,9 +1548,9 @@ void do_force(FILE*                               fplog,
  
          // bonded work not split into separate local and non-local, so with DD
          // we can only launch the kernel after non-local coordinates have been received.
-        if (domainWork.haveGpuBondedWork && !havePPDomainDecomposition(cr))
+        if (domainWork.haveGpuBondedWork && !simulationWork.havePpDomainDecomposition)
          {
-            fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
+            fr->listedForcesGpu->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
          }
  
          /* launch local nonbonded work on GPU */
@@ -1519,7 +1561,7 @@ void do_force(FILE*                               fplog,
          wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
      }
  
-    if (useGpuPmeOnThisRank)
+    if (stepWork.haveGpuPmeOnThisRank)
      {
          // In PME GPU and mixed mode we launch FFT / gather after the
          // X copy/transform to allow overlap as well as after the GPU NB
@@ -1533,7 +1575,7 @@ void do_force(FILE*                               fplog,
  
      /* Communicate coordinates and sum dipole if necessary +
         do non-local pair search */
-    if (havePPDomainDecomposition(cr))
+    if (simulationWork.havePpDomainDecomposition)
      {
          if (stepWork.doNeighborSearch)
          {
@@ -1543,7 +1585,7 @@ void do_force(FILE*                               fplog,
              /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
              nbv->constructPairlist(InteractionLocality::NonLocal, top->excls, step, nrnb);
  
-            nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::NonLocal);
+            nbv->setupGpuShortRangeWork(fr->listedForcesGpu.get(), InteractionLocality::NonLocal);
              wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchNonLocal);
              wallcycle_stop(wcycle, WallCycleCounter::NS);
              // TODO refactor this GPU halo exchange re-initialisation
@@ -1557,16 +1599,18 @@ void do_force(FILE*                               fplog,
          }
          else
          {
+            GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr;
              if (stepWork.useGpuXHalo)
              {
                  // The following must be called after local setCoordinates (which records an event
                  // when the coordinate data has been copied to the device).
-                communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
+                gpuCoordinateHaloLaunched = communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
  
                  if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
                  {
                      // non-local part of coordinate buffer must be copied back to host for CPU work
-                    stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
+                    stateGpu->copyCoordinatesFromGpu(
+                            x.unpaddedArrayRef(), AtomLocality::NonLocal, gpuCoordinateHaloLaunched);
                  }
              }
              else
@@ -1582,14 +1626,15 @@ void do_force(FILE*                               fplog,
  
              if (stepWork.useGpuXBufferOps)
              {
-                if (!useGpuPmeOnThisRank && !stepWork.useGpuXHalo)
+                if (!stepWork.useGpuXHalo)
                  {
                      stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
                  }
-                nbv->convertCoordinatesGpu(AtomLocality::NonLocal,
-                                           stateGpu->getCoordinates(),
-                                           stateGpu->getCoordinatesReadyOnDeviceEvent(
-                                                   AtomLocality::NonLocal, simulationWork, stepWork));
+                nbv->convertCoordinatesGpu(
+                        AtomLocality::NonLocal,
+                        stateGpu->getCoordinates(),
+                        stateGpu->getCoordinatesReadyOnDeviceEvent(
+                                AtomLocality::NonLocal, simulationWork, stepWork, gpuCoordinateHaloLaunched));
              }
              else
              {
@@ -1611,7 +1656,7 @@ void do_force(FILE*                               fplog,
  
              if (domainWork.haveGpuBondedWork)
              {
-                fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
+                fr->listedForcesGpu->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
              }
  
              /* launch non-local nonbonded tasks on GPU */
@@ -1623,13 +1668,22 @@ void do_force(FILE*                               fplog,
          }
      }
  
+    // With FEP we set up the reduction over threads for local+non-local simultaneously,
+    // so we need to do that here after the local and non-local pairlist construction.
+    if (stepWork.doNeighborSearch && fr->efep != FreeEnergyPerturbationType::No)
+    {
+        wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedFep);
+        nbv->setupFepThreadedForceBuffer(fr->natoms_force_constr);
+        wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedFep);
+    }
+
      if (simulationWork.useGpuNonbonded && stepWork.computeNonbondedForces)
      {
          /* launch D2H copy-back F */
          wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
          wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
  
-        if (havePPDomainDecomposition(cr))
+        if (simulationWork.havePpDomainDecomposition)
          {
              Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::NonLocal);
          }
@@ -1638,7 +1692,7 @@ void do_force(FILE*                               fplog,
  
          if (domainWork.haveGpuBondedWork && stepWork.computeEnergy)
          {
-            fr->gpuBonded->launchEnergyTransfer();
+            fr->listedForcesGpu->launchEnergyTransfer();
          }
          wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
      }
@@ -1649,18 +1703,26 @@ void do_force(FILE*                               fplog,
          xWholeMolecules = fr->wholeMoleculeTransform->wholeMoleculeCoordinates(x.unpaddedArrayRef(), box);
      }
  
-    DipoleData dipoleData;
-
-    if (simulationWork.computeMuTot)
+    // For the rest of the CPU tasks that depend on GPU-update produced coordinates,
+    // this wait ensures that the D2H transfer is complete.
+    if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
      {
-        const int start = 0;
-
-        if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
+        const bool needCoordsOnHost  = (runScheduleWork->domainWork.haveCpuLocalForceWork
+                                       || stepWork.computeVirial || simulationWork.computeMuTot);
+        const bool haveAlreadyWaited = simulationWork.useCpuHaloExchange;
+        if (needCoordsOnHost && !haveAlreadyWaited)
          {
              GMX_ASSERT(haveCopiedXFromGpu,
                         "a wait should only be triggered if copy has been scheduled");
              stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
          }
+    }
+
+    DipoleData dipoleData;
+
+    if (simulationWork.computeMuTot)
+    {
+        const int start = 0;
  
          /* Calculate total (local) dipole moment in a temporary common array.
           * This makes it possible to sum them over nodes faster.
@@ -1670,8 +1732,10 @@ void do_force(FILE*                               fplog,
          calc_mu(start,
                  mdatoms->homenr,
                  xRef,
-                gmx::arrayRefFromArray(mdatoms->chargeA, mdatoms->nr),
-                gmx::arrayRefFromArray(mdatoms->chargeB, mdatoms->nr),
+                mdatoms->chargeA ? gmx::arrayRefFromArray(mdatoms->chargeA, mdatoms->nr)
+                                 : gmx::ArrayRef<real>{},
+                mdatoms->chargeB ? gmx::arrayRefFromArray(mdatoms->chargeB, mdatoms->nr)
+                                 : gmx::ArrayRef<real>{},
                  mdatoms->nChargePerturbed != 0,
                  dipoleData.muStaging[0],
                  dipoleData.muStaging[1]);
@@ -1683,21 +1747,12 @@ void do_force(FILE*                               fplog,
      /* Reset energies */
      reset_enerdata(enerd);
  
-    if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
+    if (haveDDAtomOrdering(*cr) && simulationWork.haveSeparatePmeRank)
      {
          wallcycle_start(wcycle, WallCycleCounter::PpDuringPme);
          dd_force_flop_start(cr->dd, nrnb);
      }
  
-    // For the rest of the CPU tasks that depend on GPU-update produced coordinates,
-    // this wait ensures that the D2H transfer is complete.
-    if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
-        && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial))
-    {
-        GMX_ASSERT(haveCopiedXFromGpu, "a wait should only be triggered if copy has been scheduled");
-        stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
-    }
-
      if (inputrec.bRot)
      {
          wallcycle_start(wcycle, WallCycleCounter::Rot);
@@ -1718,21 +1773,22 @@ void do_force(FILE*                               fplog,
       * With multiple time-stepping the use is different for MTS fast (level0 only) and slow steps.
       */
      ForceOutputs forceOutMtsLevel0 = setupForceOutputs(
-            &fr->forceHelperBuffers[0], force, domainWork, stepWork, havePPDomainDecomposition(cr), wcycle);
+            &fr->forceHelperBuffers[0], force, domainWork, stepWork, simulationWork.havePpDomainDecomposition, wcycle);
  
      // Force output for MTS combined forces, only set at level1 MTS steps
      std::optional<ForceOutputs> forceOutMts =
-            (fr->useMts && stepWork.computeSlowForces)
+            (simulationWork.useMts && stepWork.computeSlowForces)
                      ? std::optional(setupForceOutputs(&fr->forceHelperBuffers[1],
                                                        forceView->forceMtsCombinedWithPadding(),
                                                        domainWork,
                                                        stepWork,
-                                                      havePPDomainDecomposition(cr),
+                                                      simulationWork.havePpDomainDecomposition,
                                                        wcycle))
                      : std::nullopt;
  
      ForceOutputs* forceOutMtsLevel1 =
-            fr->useMts ? (stepWork.computeSlowForces ? &forceOutMts.value() : nullptr) : &forceOutMtsLevel0;
+            simulationWork.useMts ? (stepWork.computeSlowForces ? &forceOutMts.value() : nullptr)
+                                  : &forceOutMtsLevel0;
  
      const bool nonbondedAtMtsLevel1 = runScheduleWork->simulationWork.computeNonbondedAtMtsLevel1;
  
@@ -1763,41 +1819,34 @@ void do_force(FILE*                               fplog,
          /* Calculate the local and non-local free energy interactions here.
           * Happens here on the CPU both with and without GPU.
           */
-        nbv->dispatchFreeEnergyKernel(InteractionLocality::Local,
-                                      *fr,
-                                      x.unpaddedArrayRef(),
-                                      &forceOutNonbonded->forceWithShiftForces(),
-                                      gmx::arrayRefFromArray(mdatoms->chargeA, mdatoms->nr),
-                                      gmx::arrayRefFromArray(mdatoms->chargeB, mdatoms->nr),
-                                      gmx::arrayRefFromArray(mdatoms->typeA, mdatoms->nr),
-                                      gmx::arrayRefFromArray(mdatoms->typeB, mdatoms->nr),
-                                      inputrec.fepvals.get(),
-                                      lambda,
-                                      enerd,
-                                      stepWork,
-                                      nrnb);
-
-        if (havePPDomainDecomposition(cr))
-        {
-            nbv->dispatchFreeEnergyKernel(InteractionLocality::NonLocal,
-                                          *fr,
-                                          x.unpaddedArrayRef(),
-                                          &forceOutNonbonded->forceWithShiftForces(),
-                                          gmx::arrayRefFromArray(mdatoms->chargeA, mdatoms->nr),
-                                          gmx::arrayRefFromArray(mdatoms->chargeB, mdatoms->nr),
-                                          gmx::arrayRefFromArray(mdatoms->typeA, mdatoms->nr),
-                                          gmx::arrayRefFromArray(mdatoms->typeB, mdatoms->nr),
-                                          inputrec.fepvals.get(),
-                                          lambda,
-                                          enerd,
-                                          stepWork,
-                                          nrnb);
-        }
+        nbv->dispatchFreeEnergyKernels(
+                x,
+                &forceOutNonbonded->forceWithShiftForces(),
+                fr->use_simd_kernels,
+                fr->ntype,
+                fr->rlist,
+                *fr->ic,
+                fr->shift_vec,
+                fr->nbfp,
+                fr->ljpme_c6grid,
+                mdatoms->chargeA ? gmx::arrayRefFromArray(mdatoms->chargeA, mdatoms->nr)
+                                 : gmx::ArrayRef<real>{},
+                mdatoms->chargeB ? gmx::arrayRefFromArray(mdatoms->chargeB, mdatoms->nr)
+                                 : gmx::ArrayRef<real>{},
+                mdatoms->typeA ? gmx::arrayRefFromArray(mdatoms->typeA, mdatoms->nr)
+                               : gmx::ArrayRef<int>{},
+                mdatoms->typeB ? gmx::arrayRefFromArray(mdatoms->typeB, mdatoms->nr)
+                               : gmx::ArrayRef<int>{},
+                inputrec.fepvals.get(),
+                lambda,
+                enerd,
+                stepWork,
+                nrnb);
      }
  
      if (stepWork.computeNonbondedForces && !useOrEmulateGpuNb)
      {
-        if (havePPDomainDecomposition(cr))
+        if (simulationWork.havePpDomainDecomposition)
          {
              do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle);
          }
@@ -1841,9 +1890,12 @@ void do_force(FILE*                               fplog,
          real dvdl_walls = do_walls(inputrec,
                                     *fr,
                                     box,
-                                   gmx::arrayRefFromArray(mdatoms->typeA, mdatoms->nr),
-                                   gmx::arrayRefFromArray(mdatoms->typeB, mdatoms->nr),
-                                   gmx::arrayRefFromArray(mdatoms->cENER, mdatoms->nr),
+                                   mdatoms->typeA ? gmx::arrayRefFromArray(mdatoms->typeA, mdatoms->nr)
+                                                  : gmx::ArrayRef<int>{},
+                                   mdatoms->typeB ? gmx::arrayRefFromArray(mdatoms->typeB, mdatoms->nr)
+                                                  : gmx::ArrayRef<int>{},
+                                   mdatoms->cENER ? gmx::arrayRefFromArray(mdatoms->cENER, mdatoms->nr)
+                                                  : gmx::ArrayRef<unsigned short>{},
                                     mdatoms->homenr,
                                     mdatoms->nPerturbed,
                                     x.unpaddedConstArrayRef(),
@@ -1873,10 +1925,11 @@ void do_force(FILE*                               fplog,
              /* Since all atoms are in the rectangular or triclinic unit-cell,
               * only single box vector shifts (2 in x) are required.
               */
-            set_pbc_dd(&pbc, fr->pbcType, DOMAINDECOMP(cr) ? cr->dd->numCells : nullptr, TRUE, box);
+            set_pbc_dd(&pbc, fr->pbcType, haveDDAtomOrdering(*cr) ? cr->dd->numCells : nullptr, TRUE, box);
          }
  
-        for (int mtsIndex = 0; mtsIndex < (fr->useMts && stepWork.computeSlowForces ? 2 : 1); mtsIndex++)
+        for (int mtsIndex = 0; mtsIndex < (simulationWork.useMts && stepWork.computeSlowForces ? 2 : 1);
+             mtsIndex++)
          {
              ListedForces& listedForces = fr->listedForces[mtsIndex];
              ForceOutputs& forceOut     = (mtsIndex == 0 ? forceOutMtsLevel0 : *forceOutMtsLevel1);
@@ -1896,27 +1949,23 @@ void do_force(FILE*                               fplog,
                                     nrnb,
                                     lambda,
                                     mdatoms,
-                                   DOMAINDECOMP(cr) ? cr->dd->globalAtomIndices.data() : nullptr,
+                                   haveDDAtomOrdering(*cr) ? cr->dd->globalAtomIndices.data() : nullptr,
                                     stepWork);
          }
      }
  
      if (stepWork.computeSlowForces)
      {
-        calculateLongRangeNonbondeds(fr,
-                                     inputrec,
-                                     cr,
-                                     nrnb,
-                                     wcycle,
-                                     mdatoms,
-                                     x.unpaddedConstArrayRef(),
-                                     &forceOutMtsLevel1->forceWithVirial(),
-                                     enerd,
-                                     box,
-                                     lambda,
-                                     dipoleData.muStateAB,
-                                     stepWork,
-                                     ddBalanceRegionHandler);
+        longRangeNonbondeds->calculate(fr->pmedata,
+                                       cr,
+                                       x.unpaddedConstArrayRef(),
+                                       &forceOutMtsLevel1->forceWithVirial(),
+                                       enerd,
+                                       box,
+                                       lambda,
+                                       dipoleData.muStateAB,
+                                       stepWork,
+                                       ddBalanceRegionHandler);
      }
  
      wallcycle_stop(wcycle, WallCycleCounter::Force);
@@ -1963,7 +2012,7 @@ void do_force(FILE*                               fplog,
                           ed,
                           stepWork.doNeighborSearch);
  
-    if (havePPDomainDecomposition(cr) && stepWork.computeForces && stepWork.useGpuFHalo
+    if (simulationWork.havePpDomainDecomposition && stepWork.computeForces && stepWork.useGpuFHalo
          && domainWork.haveCpuLocalForceWork)
      {
          stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(), AtomLocality::Local);
@@ -1981,7 +2030,7 @@ void do_force(FILE*                               fplog,
          auto& forceWithShiftForces = forceOutNonbonded->forceWithShiftForces();
  
          /* wait for non-local forces (or calculate in emulation mode) */
-        if (havePPDomainDecomposition(cr))
+        if (simulationWork.havePpDomainDecomposition)
          {
              if (simulationWork.useGpuNonbonded)
              {
@@ -2004,13 +2053,7 @@ void do_force(FILE*                               fplog,
  
              if (stepWork.useGpuFBufferOps)
              {
-                // TODO: move this into DomainLifetimeWorkload, including the second part of the
-                // condition The bonded and free energy CPU tasks can have non-local force
-                // contributions which are a dependency for the GPU force reduction.
-                bool haveNonLocalForceContribInCpuBuffer =
-                        domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork;
-
-                if (haveNonLocalForceContribInCpuBuffer)
+                if (domainWork.haveNonLocalForceContribInCpuBuffer)
                  {
                      stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
                                                AtomLocality::NonLocal);
@@ -2041,20 +2084,15 @@ void do_force(FILE*                               fplog,
      /* Combining the forces for multiple time stepping before the halo exchange, when possible,
       * avoids an extra halo exchange (when DD is used) and post-processing step.
       */
-    const bool combineMtsForcesBeforeHaloExchange =
-            (stepWork.computeForces && fr->useMts && stepWork.computeSlowForces
-             && (legacyFlags & GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE) != 0
-             && !(stepWork.computeVirial || simulationWork.useGpuNonbonded || useGpuPmeOnThisRank));
-    if (combineMtsForcesBeforeHaloExchange)
-    {
-        const int numAtoms = havePPDomainDecomposition(cr) ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr;
-        combineMtsForces(numAtoms,
+    if (stepWork.combineMtsForcesBeforeHaloExchange)
+    {
+        combineMtsForces(getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition),
                           force.unpaddedArrayRef(),
                           forceView->forceMtsCombined(),
                           inputrec.mtsLevels[1].stepFactor);
      }
  
-    if (havePPDomainDecomposition(cr))
+    if (simulationWork.havePpDomainDecomposition)
      {
          /* We are done with the CPU compute.
           * We will now communicate the non-local forces.
@@ -2070,13 +2108,11 @@ void do_force(FILE*                               fplog,
              {
                  // If there exist CPU forces, data from halo exchange should accumulate into these
                  bool accumulateForces = domainWork.haveCpuLocalForceWork;
-                if (!accumulateForces)
-                {
-                    // Force halo exchange will set a subset of local atoms with remote non-local data
-                    // First clear local portion of force array, so that untouched atoms are zero
-                    stateGpu->clearForcesOnGpu(AtomLocality::Local);
-                }
-                communicateGpuHaloForces(*cr, accumulateForces);
+                gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> gpuForceHaloDependencies;
+                gpuForceHaloDependencies.push_back(stateGpu->fReadyOnDevice(AtomLocality::Local));
+                gpuForceHaloDependencies.push_back(stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
+
+                communicateGpuHaloForces(*cr, accumulateForces, &gpuForceHaloDependencies);
              }
              else
              {
@@ -2087,12 +2123,12 @@ void do_force(FILE*                               fplog,
  
                  // Without MTS or with MTS at slow steps with uncombined forces we need to
                  // communicate the fast forces
-                if (!fr->useMts || !combineMtsForcesBeforeHaloExchange)
+                if (!simulationWork.useMts || !stepWork.combineMtsForcesBeforeHaloExchange)
                  {
                      dd_move_f(cr->dd, &forceOutMtsLevel0.forceWithShiftForces(), wcycle);
                  }
                  // With MTS we need to communicate the slow or combined (in forceOutMtsLevel1) forces
-                if (fr->useMts && stepWork.computeSlowForces)
+                if (simulationWork.useMts && stepWork.computeSlowForces)
                  {
                      dd_move_f(cr->dd, &forceOutMtsLevel1->forceWithShiftForces(), wcycle);
                  }
@@ -2102,8 +2138,10 @@ void do_force(FILE*                               fplog,
  
      // With both nonbonded and PME offloaded to a GPU on the same rank, we use
      // an alternating wait/reduction scheme.
-    bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPmeOnThisRank && simulationWork.useGpuNonbonded
-                             && !DOMAINDECOMP(cr) && !stepWork.useGpuFBufferOps);
+    bool alternateGpuWait =
+            (!c_disableAlternatingWait && stepWork.haveGpuPmeOnThisRank && simulationWork.useGpuNonbonded
+             && !simulationWork.havePpDomainDecomposition && !stepWork.useGpuFBufferOps);
+
      if (alternateGpuWait)
      {
          alternatePmeNbGpuWaitReduce(fr->nbv.get(),
@@ -2116,7 +2154,7 @@ void do_force(FILE*                               fplog,
                                      wcycle);
      }
  
-    if (!alternateGpuWait && useGpuPmeOnThisRank)
+    if (!alternateGpuWait && stepWork.haveGpuPmeOnThisRank)
      {
          pme_gpu_wait_and_reduce(fr->pmedata,
                                  stepWork,
@@ -2171,7 +2209,7 @@ void do_force(FILE*                               fplog,
                       enerd,
                       stepWork,
                       InteractionLocality::Local,
-                     DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
+                     haveDDAtomOrdering(*cr) ? enbvClearFNo : enbvClearFYes,
                       step,
                       nrnb,
                       wcycle);
@@ -2180,8 +2218,8 @@ void do_force(FILE*                               fplog,
  
      // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
      // TODO refactor this and unify with below default-path call to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces
-        && simulationWork.useGpuPmePpCommunication)
+    if (PAR(cr) && simulationWork.haveSeparatePmeRank && simulationWork.useGpuPmePpCommunication
+        && stepWork.computeSlowForces)
      {
          /* In case of node-splitting, the PP nodes receive the long-range
           * forces, virial and energy from the PME nodes here.
@@ -2206,13 +2244,6 @@ void do_force(FILE*                               fplog,
          {
              ArrayRef<gmx::RVec> forceWithShift = forceOutNonbonded->forceWithShiftForces().force();
  
-            // Flag to specify whether the CPU force buffer has contributions to
-            // local atoms. This depends on whether there are CPU-based force tasks
-            // or when DD is active the halo exchange has resulted in contributions
-            // from the non-local part.
-            const bool haveLocalForceContribInCpuBuffer =
-                    (domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr));
-
              // TODO: move these steps as early as possible:
              // - CPU f H2D should be as soon as all CPU-side forces are done
              // - wait for force reduction does not need to block host (at least not here, it's sufficient to wait
@@ -2220,16 +2251,9 @@ void do_force(FILE*                               fplog,
              // - copy is not performed if GPU force halo exchange is active, because it would overwrite the result
              //   of the halo exchange. In that case the copy is instead performed above, before the exchange.
              //   These should be unified.
-            if (haveLocalForceContribInCpuBuffer && !stepWork.useGpuFHalo)
+            if (domainWork.haveLocalForceContribInCpuBuffer && !stepWork.useGpuFHalo)
              {
-                // Note: AtomLocality::All is used for the non-DD case because, as in this
-                // case copyForcesToGpu() uses a separate stream, it allows overlap of
-                // CPU force H2D with GPU force tasks on all streams including those in the
-                // local stream which would otherwise be implicit dependencies for the
-                // transfer and would not overlap.
-                auto locality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All;
-
-                stateGpu->copyForcesToGpu(forceWithShift, locality);
+                stateGpu->copyForcesToGpu(forceWithShift, AtomLocality::Local);
              }
  
              if (stepWork.computeNonbondedForces)
@@ -2244,7 +2268,8 @@ void do_force(FILE*                               fplog,
              // NOTE: If there are virtual sites, the forces are modified on host after this D2H copy. Hence,
              //       they should not be copied in do_md(...) for the output.
              if (!simulationWork.useGpuUpdate
-                || (simulationWork.useGpuUpdate && DOMAINDECOMP(cr) && haveHostPmePpComms) || vsite)
+                || (simulationWork.useGpuUpdate && haveDDAtomOrdering(*cr) && simulationWork.useCpuPmePpCommunication)
+                || vsite)
              {
                  stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
                  stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
@@ -2258,21 +2283,21 @@ void do_force(FILE*                               fplog,
      }
  
      launchGpuEndOfStepTasks(
-            nbv, fr->gpuBonded, fr->pmedata, enerd, *runScheduleWork, useGpuPmeOnThisRank, step, wcycle);
+            nbv, fr->listedForcesGpu.get(), fr->pmedata, enerd, *runScheduleWork, step, wcycle);
  
-    if (DOMAINDECOMP(cr))
+    if (haveDDAtomOrdering(*cr))
      {
          dd_force_flop_stop(cr->dd, nrnb);
      }
  
-    const bool haveCombinedMtsForces = (stepWork.computeForces && fr->useMts && stepWork.computeSlowForces
-                                        && combineMtsForcesBeforeHaloExchange);
+    const bool haveCombinedMtsForces = (stepWork.computeForces && simulationWork.useMts && stepWork.computeSlowForces
+                                        && stepWork.combineMtsForcesBeforeHaloExchange);
      if (stepWork.computeForces)
      {
          postProcessForceWithShiftForces(
                  nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOutMtsLevel0, vir_force, *mdatoms, *fr, vsite, stepWork);
  
-        if (fr->useMts && stepWork.computeSlowForces && !haveCombinedMtsForces)
+        if (simulationWork.useMts && stepWork.computeSlowForces && !haveCombinedMtsForces)
          {
              postProcessForceWithShiftForces(
                      nrnb, wcycle, box, x.unpaddedArrayRef(), forceOutMtsLevel1, vir_force, *mdatoms, *fr, vsite, stepWork);
@@ -2280,7 +2305,7 @@ void do_force(FILE*                               fplog,
      }
  
      // TODO refactor this and unify with above GPU PME-PP / GPU update path call to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication
+    if (PAR(cr) && simulationWork.haveSeparatePmeRank && simulationWork.useCpuPmePpCommunication
          && stepWork.computeSlowForces)
      {
          /* In case of node-splitting, the PP nodes receive the long-range
@@ -2305,7 +2330,7 @@ void do_force(FILE*                               fplog,
          postProcessForces(
                  cr, step, nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOutCombined, vir_force, mdatoms, fr, vsite, stepWork);
  
-        if (fr->useMts && stepWork.computeSlowForces && !haveCombinedMtsForces)
+        if (simulationWork.useMts && stepWork.computeSlowForces && !haveCombinedMtsForces)
          {
              postProcessForces(
                      cr, step, nrnb, wcycle, box, x.unpaddedArrayRef(), forceOutMtsLevel1, vir_force, mdatoms, fr, vsite, stepWork);