Merge branch 'origin/release-2020' into master
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index f2528d78b43e643e34847d67e543ac819a8dcf02..179e3116e525860395178d1cd60ebde5df26bead 100644
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013-2020, by the GROMACS development team, led by
+ * Copyright (c) 2013-2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -53,6 +53,7 @@
 #include "gromacs/domdec/partition.h"
 #include "gromacs/essentialdynamics/edsam.h"
 #include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme_pp.h"
 #include "gromacs/ewald/pme_pp_comm_gpu.h"
 #include "gromacs/gmxlib/network.h"
 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
 #include "gromacs/mdlib/constr.h"
 #include "gromacs/mdlib/enerdata_utils.h"
 #include "gromacs/mdlib/force.h"
+#include "gromacs/mdlib/force_flags.h"
 #include "gromacs/mdlib/forcerec.h"
 #include "gromacs/mdlib/gmx_omp_nthreads.h"
-#include "gromacs/mdlib/qmmm.h"
 #include "gromacs/mdlib/update.h"
+#include "gromacs/mdlib/vsite.h"
+#include "gromacs/mdlib/wholemoleculetransform.h"
 #include "gromacs/mdtypes/commrec.h"
 #include "gromacs/mdtypes/enerdata.h"
 #include "gromacs/mdtypes/forceoutput.h"
+#include "gromacs/mdtypes/forcerec.h"
 #include "gromacs/mdtypes/iforceprovider.h"
 #include "gromacs/mdtypes/inputrec.h"
 #include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/mdatom.h"
 #include "gromacs/mdtypes/simulation_workload.h"
 #include "gromacs/mdtypes/state.h"
 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 #include "gromacs/nbnxm/nbnxm.h"
+#include "gromacs/nbnxm/nbnxm_gpu.h"
 #include "gromacs/pbcutil/ishift.h"
-#include "gromacs/pbcutil/mshift.h"
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/pulling/pull.h"
 #include "gromacs/pulling/pull_rotation.h"
@@ -144,22 +149,21 @@ static void calc_virial(int                              start,
                         const rvec                       x[],
                         const gmx::ForceWithShiftForces& forceWithShiftForces,
                         tensor                           vir_part,
-                        const t_graph*                   graph,
                         const matrix                     box,
                         t_nrnb*                          nrnb,
                         const t_forcerec*                fr,
-                        int                              ePBC)
+                        PbcType                          pbcType)
 {
     /* The short-range virial from surrounding boxes */
     const rvec* fshift = as_rvec_array(forceWithShiftForces.shiftForces().data());
-    calc_vir(SHIFTS, fr->shift_vec, fshift, vir_part, ePBC == epbcSCREW, box);
+    calc_vir(SHIFTS, fr->shift_vec, fshift, vir_part, pbcType == PbcType::Screw, box);
     inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
 
     /* Calculate partial virial, for local atoms only, based on short range.
      * Total virial is computed in global_stat, called from do_md
      */
     const rvec* f = as_rvec_array(forceWithShiftForces.force().data());
-    f_calc_vir(start, start + homenr, x, f, vir_part, graph, box);
+    f_calc_vir(start, start + homenr, x, f, vir_part, box);
     inc_nrnb(nrnb, eNR_VIRIAL, homenr);
 
     if (debug)
@@ -187,11 +191,15 @@ static void pull_potential_wrapper(const t_commrec*               cr,
      * which is why pull_potential is called close to other communication.
      */
     wallcycle_start(wcycle, ewcPULLPOT);
-    set_pbc(&pbc, ir->ePBC, box);
+    set_pbc(&pbc, ir->pbcType, box);
     dvdl = 0;
     enerd->term[F_COM_PULL] += pull_potential(pull_work, mdatoms, &pbc, cr, t, lambda[efptRESTRAINT],
                                               as_rvec_array(x.data()), force, &dvdl);
     enerd->dvdl_lin[efptRESTRAINT] += dvdl;
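+    // The pull dV/dlambda contributes equally to the dH/dl sum at each lambda point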
+    for (auto& dhdl : enerd->dhdlLambda)
+    {
+        dhdl += dvdl;
+    }
     wallcycle_stop(wcycle, ewcPULLPOT);
 }
 
@@ -222,6 +230,11 @@ static void pme_receive_force_ener(t_forcerec*           fr,
     enerd->dvdl_lin[efptCOUL] += dvdl_q;
     enerd->dvdl_lin[efptVDW] += dvdl_lj;
 
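+    // Likewise, the PME Coulomb and LJ dV/dlambda contribute to the dH/dl sum at each lambda point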
+    for (auto& dhdl : enerd->dhdlLambda)
+    {
+        dhdl += dvdl_q + dvdl_lj;
+    }
+
     if (wcycle)
     {
         dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
@@ -273,7 +286,6 @@ static void post_process_forces(const t_commrec*      cr,
                                 ForceOutputs*         forceOutputs,
                                 tensor                vir_force,
                                 const t_mdatoms*      mdatoms,
-                                const t_graph*        graph,
                                 const t_forcerec*     fr,
                                 const gmx_vsite_t*    vsite,
                                 const StepWorkload&   stepWork)
@@ -293,7 +305,7 @@ static void post_process_forces(const t_commrec*      cr,
              */
             matrix virial = { { 0 } };
             spread_vsite_f(vsite, x, fDirectVir, nullptr, stepWork.computeVirial, virial, nrnb,
-                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
+                           top->idef, fr->pbcType, fr->bMolPBC, box, cr, wcycle);
             forceWithVirial.addVirialContribution(virial);
         }
 
@@ -568,7 +580,7 @@ static void computeSpecialForces(FILE*                          fplog,
         if (awh)
         {
             enerd->term[F_COM_PULL] += awh->applyBiasForcesAndUpdateBias(
-                    inputrec->ePBC, *mdatoms, box, forceWithVirial, t, step, wcycle, fplog);
+                    inputrec->pbcType, *mdatoms, box, forceWithVirial, t, step, wcycle, fplog);
         }
     }
 
@@ -599,36 +611,21 @@ static void computeSpecialForces(FILE*                          fplog,
     }
 }
 
-/*! \brief Makes PME flags from StepWorkload data.
- *
- * \param[in]  stepWork     Step schedule flags
- * \returns                 PME flags
- */
-static int makePmeFlags(const StepWorkload& stepWork)
-{
-    return GMX_PME_SPREAD | GMX_PME_SOLVE | (stepWork.computeVirial ? GMX_PME_CALC_ENER_VIR : 0)
-           | (stepWork.computeEnergy ? GMX_PME_CALC_ENER_VIR : 0)
-           | (stepWork.computeForces ? GMX_PME_CALC_F : 0);
-}
-
 /*! \brief Launch the prepare_step and spread stages of PME GPU.
  *
  * \param[in]  pmedata              The PME structure
  * \param[in]  box                  The box matrix
  * \param[in]  stepWork             Step schedule flags
- * \param[in]  pmeFlags             PME flags
  * \param[in]  xReadyOnDevice       Event synchronizer indicating that the coordinates are ready in
  *                                  the device memory.
  * \param[in]  wcycle               The wallcycle structure
  */
 static inline void launchPmeGpuSpread(gmx_pme_t*            pmedata,
                                       const matrix          box,
                                       const StepWorkload&   stepWork,
-                                      int                   pmeFlags,
                                       GpuEventSynchronizer* xReadyOnDevice,
                                       gmx_wallcycle_t       wcycle)
 {
-    pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags,
-                                stepWork.useGpuPmeFReduction);
+    pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
     pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle);
 }
 
@@ -638,11 +635,12 @@ static inline void launchPmeGpuSpread(gmx_pme_t*            pmedata,
  *
  * \param[in]  pmedata        The PME structure
  * \param[in]  wcycle         The wallcycle structure
+ * \param[in]  stepWork       Step schedule flags
  */
-static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata, gmx_wallcycle_t wcycle)
+static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata, gmx_wallcycle_t wcycle, const gmx::StepWorkload& stepWork)
 {
-    pme_gpu_launch_complex_transforms(pmedata, wcycle);
-    pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
+    pme_gpu_launch_complex_transforms(pmedata, wcycle, stepWork);
+    pme_gpu_launch_gather(pmedata, wcycle);
 }
 
 /*! \brief
@@ -659,7 +657,6 @@ static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata, gmx_wallcycle_t wcycle)
  * \param[in,out] forceOutputs     Output buffer for the forces and virial
  * \param[in,out] enerd            Energy data structure results are reduced into
  * \param[in]     stepWork         Step schedule flags
- * \param[in]     pmeFlags         PME flags
  * \param[in]     wcycle           The wallcycle structure
  */
 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv,
@@ -667,7 +664,6 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv,
                                         gmx::ForceOutputs*  forceOutputs,
                                         gmx_enerdata_t*     enerd,
                                         const StepWorkload& stepWork,
-                                        int                 pmeFlags,
                                         gmx_wallcycle_t     wcycle)
 {
     bool isPmeGpuDone = false;
@@ -685,7 +681,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv,
         {
             GpuTaskCompletion completionType =
                     (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
-            isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, &forceWithVirial,
+            isPmeGpuDone = pme_gpu_try_finish_task(pmedata, stepWork, wcycle, &forceWithVirial,
                                                    enerd, completionType);
         }
 
@@ -770,13 +766,13 @@ static ForceOutputs setupForceOutputs(t_forcerec*                         fr,
 
 /*! \brief Set up flags that have the lifetime of the domain indicating what type of work there is to compute.
  */
-static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec&         inputrec,
-                                                          const t_forcerec&         fr,
-                                                          const pull_t*             pull_work,
-                                                          const gmx_edsam*          ed,
-                                                          const t_idef&             idef,
-                                                          const t_fcdata&           fcd,
-                                                          const t_mdatoms&          mdatoms,
+static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec&             inputrec,
+                                                          const t_forcerec&             fr,
+                                                          const pull_t*                 pull_work,
+                                                          const gmx_edsam*              ed,
+                                                          const InteractionDefinitions& idef,
+                                                          const t_fcdata&               fcd,
+                                                          const t_mdatoms&              mdatoms,
                                                           const SimulationWorkload& simulationWork,
                                                           const StepWorkload&       stepWork)
 {
@@ -843,7 +839,7 @@ static StepWorkload setupStepWorkload(const int                 legacyFlags,
 
 /*! \brief Launch end-of-step GPU tasks: buffer clearing and rolling pruning.
  *
- * TODO: eliminate the \p useGpuNonbonded and \p useGpuNonbonded when these are
+ * TODO: eliminate \p useGpuPmeOnThisRank when this is
  * incorporated in DomainLifetimeWorkload.
  */
 static void launchGpuEndOfStepTasks(nonbonded_verlet_t*               nbv,
@@ -851,12 +847,11 @@ static void launchGpuEndOfStepTasks(nonbonded_verlet_t*               nbv,
                                     gmx_pme_t*                        pmedata,
                                     gmx_enerdata_t*                   enerd,
                                     const gmx::MdrunScheduleWorkload& runScheduleWork,
-                                    bool                              useGpuNonbonded,
-                                    bool                              useGpuPme,
+                                    bool                              useGpuPmeOnThisRank,
                                     int64_t                           step,
                                     gmx_wallcycle_t                   wcycle)
 {
-    if (useGpuNonbonded)
+    if (runScheduleWork.simulationWork.useGpuNonbonded)
     {
         /* Launch pruning before buffer clearing because the API overhead of the
          * clear kernel launches can leave the GPU idle while it could be running
@@ -875,7 +870,7 @@ static void launchGpuEndOfStepTasks(nonbonded_verlet_t*               nbv,
         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
     }
 
-    if (useGpuPme)
+    if (useGpuPmeOnThisRank)
     {
         pme_gpu_reinit_computation(pmedata, wcycle);
     }
@@ -891,6 +886,49 @@ static void launchGpuEndOfStepTasks(nonbonded_verlet_t*               nbv,
     }
 }
 
+//! \brief Data structure to hold dipole-related data and staging arrays
+struct DipoleData
+{
+    //! Dipole staging for fast summing over MPI
+    gmx::DVec muStaging[2] = { { 0.0, 0.0, 0.0 } };
+    //! Dipole staging for states A and B (index 0 and 1 resp.)
+    gmx::RVec muStateAB[2] = { { 0.0_real, 0.0_real, 0.0_real } };
+};
+
+
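+/*! \brief Combines the A and B state dipole staging buffers over MPI ranks and computes the total dipole
+ *
+ * With free-energy perturbation, the total dipole is interpolated linearly between
+ * the A and B states using the Coulomb lambda.
+ *
+ * \param[in,out] dipoleData             The dipole staging and per-state dipole buffers
+ * \param[in]     cr                     Communication record
+ * \param[in]     haveFreeEnergy         Whether free-energy perturbation is active
+ * \param[in]     lambda                 The free-energy lambda values
+ * \param[out]    muTotal                The total dipole moment
+ * \param[in]     ddBalanceRegionHandler Handler for the dynamic load balancing region
+ */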
+static void reduceAndUpdateMuTot(DipoleData*                   dipoleData,
+                                 const t_commrec*              cr,
+                                 const bool                    haveFreeEnergy,
+                                 gmx::ArrayRef<const real>     lambda,
+                                 rvec                          muTotal,
+                                 const DDBalanceRegionHandler& ddBalanceRegionHandler)
+{
+    if (PAR(cr))
+    {
+        gmx_sumd(2 * DIM, dipoleData->muStaging[0], cr);
+        ddBalanceRegionHandler.reopenRegionCpu();
+    }
+    for (int i = 0; i < 2; i++)
+    {
+        for (int j = 0; j < DIM; j++)
+        {
+            dipoleData->muStateAB[i][j] = dipoleData->muStaging[i][j];
+        }
+    }
+
+    if (!haveFreeEnergy)
+    {
+        copy_rvec(dipoleData->muStateAB[0], muTotal);
+    }
+    else
+    {
+        for (int j = 0; j < DIM; j++)
+        {
+            muTotal[j] = (1.0 - lambda[efptCOUL]) * dipoleData->muStateAB[0][j]
+                         + lambda[efptCOUL] * dipoleData->muStateAB[1][j];
+        }
+    }
+}
 
 void do_force(FILE*                               fplog,
               const t_commrec*                    cr,
@@ -913,29 +951,23 @@ void do_force(FILE*                               fplog,
               gmx_enerdata_t*                     enerd,
               t_fcdata*                           fcd,
               gmx::ArrayRef<real>                 lambda,
-              t_graph*                            graph,
               t_forcerec*                         fr,
               gmx::MdrunScheduleWorkload*         runScheduleWork,
               const gmx_vsite_t*                  vsite,
-              rvec                                mu_tot,
+              rvec                                muTotal,
               double                              t,
               gmx_edsam*                          ed,
               int                                 legacyFlags,
               const DDBalanceRegionHandler&       ddBalanceRegionHandler)
 {
-    int                          i, j;
-    double                       mu[2 * DIM];
+    GMX_ASSERT(force.unpaddedArrayRef().ssize() >= fr->natoms_force_constr,
+               "The size of the force buffer should be at least the number of atoms to compute "
+               "forces for");
+
     nonbonded_verlet_t*          nbv      = fr->nbv.get();
     interaction_const_t*         ic       = fr->ic;
     gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 
-    // TODO remove the code below when the legacy flags are not in use anymore
-    /* modify force flag if not doing nonbonded */
-    if (!fr->bNonbonded)
-    {
-        legacyFlags &= ~GMX_FORCE_NONBONDED;
-    }
-
     const SimulationWorkload& simulationWork = runScheduleWork->simulationWork;
 
 
@@ -945,15 +977,6 @@ void do_force(FILE*                               fplog,
 
 
     const bool useGpuPmeOnThisRank = simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME);
-    const int  pmeFlags            = makePmeFlags(stepWork);
-
-    // Switches on whether to use GPU for position and force buffer operations
-    // TODO consider all possible combinations of triggers, and how to combine optimally in each case.
-    const BufferOpsUseGpu useGpuXBufOps =
-            stepWork.useGpuXBufferOps ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
-    // GPU Force buffer ops are disabled on virial steps, because the virial calc is not yet ported to GPU
-    const BufferOpsUseGpu useGpuFBufOps =
-            stepWork.useGpuFBufferOps ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
 
     /* At a search step we need to start the first balancing region
      * somewhere early inside the step after communication during domain
@@ -964,24 +987,9 @@ void do_force(FILE*                               fplog,
         ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);
     }
 
-    const int start  = 0;
-    const int homenr = mdatoms->homenr;
-
     clear_mat(vir_force);
 
-    if (stepWork.stateChanged)
-    {
-        if (inputrecNeedMutot(inputrec))
-        {
-            /* Calculate total (local) dipole moment in a temporary common array.
-             * This makes it possible to sum them over nodes faster.
-             */
-            calc_mu(start, homenr, x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB,
-                    mdatoms->nChargePerturbed, mu, mu + DIM);
-        }
-    }
-
-    if (fr->ePBC != epbcNONE)
+    if (fr->pbcType != PbcType::No)
     {
         /* Compute shift vectors every step,
          * because of pressure coupling or box deformation!
@@ -995,40 +1003,29 @@ void do_force(FILE*                               fplog,
         const bool calcCGCM = (fillGrid && !DOMAINDECOMP(cr));
         if (calcCGCM)
         {
-            put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr),
+            put_atoms_in_box_omp(fr->pbcType, box, x.unpaddedArrayRef().subArray(0, mdatoms->homenr),
                                  gmx_omp_nthreads_get(emntDefault));
-            inc_nrnb(nrnb, eNR_SHIFTX, homenr);
-        }
-        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
-        {
-            unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
+            inc_nrnb(nrnb, eNR_SHIFTX, mdatoms->homenr);
         }
     }
 
     nbnxn_atomdata_copy_shiftvec(stepWork.haveDynamicBox, fr->shift_vec, nbv->nbat.get());
 
-#if GMX_MPI
     const bool pmeSendCoordinatesFromGpu =
-            simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
+            GMX_MPI && simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
     const bool reinitGpuPmePpComms =
-            simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
-#endif
+            GMX_MPI && simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
 
-    const auto localXReadyOnDevice = (stateGpu != nullptr)
+    const auto localXReadyOnDevice = (useGpuPmeOnThisRank || simulationWork.useGpuBufferOps)
                                              ? stateGpu->getCoordinatesReadyOnDeviceEvent(
                                                        AtomLocality::Local, simulationWork, stepWork)
                                              : nullptr;
 
-#if GMX_MPI
     // If coordinates are to be sent to PME task from CPU memory, perform that send here.
     // Otherwise the send will occur after H2D coordinate transfer.
-    if (!thisRankHasDuty(cr, DUTY_PME) && !pmeSendCoordinatesFromGpu)
+    if (GMX_MPI && !thisRankHasDuty(cr, DUTY_PME) && !pmeSendCoordinatesFromGpu)
     {
-        /* Send particle coordinates to the pme nodes.
-         * Since this is only implemented for domain decomposition
-         * and domain decomposition does not use the graph,
-         * we do not need to worry about shifting.
-         */
+        /* Send particle coordinates to the pme nodes */
         if (!stepWork.doNeighborSearch && simulationWork.useGpuUpdate)
         {
             GMX_RELEASE_ASSERT(false,
@@ -1043,16 +1040,16 @@ void do_force(FILE*                               fplog,
                                  step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
                                  pmeSendCoordinatesFromGpu, localXReadyOnDevice, wcycle);
     }
-#endif /* GMX_MPI */
 
     // Coordinates on the device are needed if PME or BufferOps are offloaded.
     // The local coordinates can be copied right away.
     // NOTE: Consider moving this copy to right after they are updated and constrained,
     //       if the latter is not offloaded.
-    if (useGpuPmeOnThisRank || useGpuXBufOps == BufferOpsUseGpu::True)
+    if (useGpuPmeOnThisRank || stepWork.useGpuXBufferOps)
     {
         if (stepWork.doNeighborSearch)
         {
+            // TODO refactor this to do_md, after partitioning.
             stateGpu->reinit(mdatoms->homenr,
                              cr->dd != nullptr ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr);
             if (useGpuPmeOnThisRank)
@@ -1076,13 +1073,11 @@ void do_force(FILE*                               fplog,
     // The conditions for gpuHaloExchange, e.g. using GPU buffer
     // operations, were checked before construction, so here we can
     // just use it and assert on those conditions.
-    gmx::GpuHaloExchange* gpuHaloExchange =
-            (havePPDomainDecomposition(cr) ? cr->dd->gpuHaloExchange.get() : nullptr);
-    const bool ddUsesGpuDirectCommunication = (gpuHaloExchange != nullptr);
-    GMX_ASSERT(!ddUsesGpuDirectCommunication || (useGpuXBufOps == BufferOpsUseGpu::True),
+    const bool ddUsesGpuDirectCommunication =
+            ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange.empty()));
+    GMX_ASSERT(!ddUsesGpuDirectCommunication || stepWork.useGpuXBufferOps,
                "Must use coordinate buffer ops with GPU halo exchange");
-    const bool useGpuForcesHaloExchange =
-            ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True);
+    const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && stepWork.useGpuFBufferOps;
 
     // Copy coordinate from the GPU if update is on the GPU and there
     // are forces to be computed on the CPU, or for the computation of
@@ -1104,35 +1099,28 @@ void do_force(FILE*                               fplog,
         haveCopiedXFromGpu = true;
     }
 
-#if GMX_MPI
     // If coordinates are to be sent to PME task from GPU memory, perform that send here.
     // Otherwise the send will occur before the H2D coordinate transfer.
     if (!thisRankHasDuty(cr, DUTY_PME) && pmeSendCoordinatesFromGpu)
     {
-        /* Send particle coordinates to the pme nodes.
-         * Since this is only implemented for domain decomposition
-         * and domain decomposition does not use the graph,
-         * we do not need to worry about shifting.
-         */
+        /* Send particle coordinates to the pme nodes */
         gmx_pme_send_coordinates(fr, cr, box, as_rvec_array(x.unpaddedArrayRef().data()), lambda[efptCOUL],
                                  lambda[efptVDW], (stepWork.computeVirial || stepWork.computeEnergy),
                                  step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
                                  pmeSendCoordinatesFromGpu, localXReadyOnDevice, wcycle);
     }
-#endif /* GMX_MPI */
 
     if (useGpuPmeOnThisRank)
     {
-        launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, localXReadyOnDevice, wcycle);
+        launchPmeGpuSpread(fr->pmedata, box, stepWork, localXReadyOnDevice, wcycle);
     }
 
     /* do gridding for pair search */
     if (stepWork.doNeighborSearch)
     {
-        if (graph && stepWork.stateChanged)
+        if (fr->wholeMoleculeTransform && stepWork.stateChanged)
         {
-            /* Calculate intramolecular shift vectors to make molecules whole */
-            mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
+            fr->wholeMoleculeTransform->updateForAtomPbcJumps(x.unpaddedArrayRef(), box);
         }
 
         // TODO
@@ -1191,10 +1179,7 @@ void do_force(FILE*                               fplog,
             }
             wallcycle_stop(wcycle, ewcLAUNCH_GPU);
         }
-    }
 
-    if (stepWork.doNeighborSearch)
-    {
         // Need to run after the GPU-offload bonded interaction lists
         // are set up to be able to determine whether there is bonded work.
         runScheduleWork->domainWork = setupDomainLifetimeWorkload(
@@ -1203,19 +1188,19 @@ void do_force(FILE*                               fplog,
         wallcycle_start_nocount(wcycle, ewcNS);
         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
         /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
-        nbv->constructPairlist(InteractionLocality::Local, &top->excls, step, nrnb);
+        nbv->constructPairlist(InteractionLocality::Local, top->excls, step, nrnb);
 
         nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::Local);
 
         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
         wallcycle_stop(wcycle, ewcNS);
 
-        if (useGpuXBufOps == BufferOpsUseGpu::True)
+        if (stepWork.useGpuXBufferOps)
         {
             nbv->atomdata_init_copy_x_to_nbat_x_gpu();
         }
         // For force buffer ops, we use the condition below rather than
-        // useGpuFBufOps to ensure that init is performed even if this
+        // useGpuFBufferOps to ensure that init is performed even if this
         // NS step is also a virial step (on which f buf ops are deactivated).
         if (simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA))
         {
@@ -1225,7 +1210,7 @@ void do_force(FILE*                               fplog,
     }
     else if (!EI_TPI(inputrec->eI))
     {
-        if (useGpuXBufOps == BufferOpsUseGpu::True)
+        if (stepWork.useGpuXBufferOps)
         {
             GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
             nbv->convertCoordinatesGpu(AtomLocality::Local, false, stateGpu->getCoordinates(),
@@ -1254,7 +1239,7 @@ void do_force(FILE*                               fplog,
 
         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
-        if (stepWork.doNeighborSearch || (useGpuXBufOps == BufferOpsUseGpu::False))
+        if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
         {
             Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::Local);
         }
@@ -1266,7 +1251,7 @@ void do_force(FILE*                               fplog,
         if (domainWork.haveGpuBondedWork && !havePPDomainDecomposition(cr))
         {
             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
-            fr->gpuBonded->launchKernel(fr, stepWork, box);
+            fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
         }
 
@@ -1283,7 +1268,7 @@ void do_force(FILE*                               fplog,
         // X copy/transform to allow overlap as well as after the GPU NB
         // launch to avoid FFT launch overhead hijacking the CPU and delaying
         // the nonbonded kernel.
-        launchPmeGpuFftAndGather(fr->pmedata, wcycle);
+        launchPmeGpuFftAndGather(fr->pmedata, wcycle, stepWork);
     }
 
     /* Communicate coordinates and sum dipole if necessary +
@@ -1296,14 +1281,18 @@ void do_force(FILE*                               fplog,
             wallcycle_start_nocount(wcycle, ewcNS);
             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
             /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
-            nbv->constructPairlist(InteractionLocality::NonLocal, &top->excls, step, nrnb);
+            nbv->constructPairlist(InteractionLocality::NonLocal, top->excls, step, nrnb);
 
             nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::NonLocal);
             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
             wallcycle_stop(wcycle, ewcNS);
+            // TODO refactor this GPU halo exchange re-initialisation
+            // to location in do_md where GPU halo exchange is
+            // constructed at partitioning, after above stateGpu
+            // re-initialization has similarly been refactored
             if (ddUsesGpuDirectCommunication)
             {
-                gpuHaloExchange->reinitHalo(stateGpu->getCoordinates(), stateGpu->getForces());
+                reinitGpuHaloExchange(*cr, stateGpu->getCoordinates(), stateGpu->getForces());
             }
         }
         else
@@ -1312,7 +1301,7 @@ void do_force(FILE*                               fplog,
             {
                 // The following must be called after local setCoordinates (which records an event
                 // when the coordinate data has been copied to the device).
-                gpuHaloExchange->communicateHaloCoordinates(box, localXReadyOnDevice);
+                communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
 
                 if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
                 {
@@ -1329,9 +1318,8 @@ void do_force(FILE*                               fplog,
                 dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
             }
 
-            if (useGpuXBufOps == BufferOpsUseGpu::True)
+            if (stepWork.useGpuXBufferOps)
             {
-                // The condition here was (pme != nullptr && pme_gpu_get_device_x(fr->pmedata) != nullptr)
                 if (!useGpuPmeOnThisRank && !ddUsesGpuDirectCommunication)
                 {
                     stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
@@ -1350,7 +1338,7 @@ void do_force(FILE*                               fplog,
         {
             wallcycle_start(wcycle, ewcLAUNCH_GPU);
 
-            if (stepWork.doNeighborSearch || (useGpuXBufOps == BufferOpsUseGpu::False))
+            if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
             {
                 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
                 Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::NonLocal);
@@ -1360,7 +1348,7 @@ void do_force(FILE*                               fplog,
             if (domainWork.haveGpuBondedWork)
             {
                 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
-                fr->gpuBonded->launchKernel(fr, stepWork, box);
+                fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
             }
 
@@ -1394,33 +1382,27 @@ void do_force(FILE*                               fplog,
         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
     }
 
-    if (stepWork.stateChanged && inputrecNeedMutot(inputrec))
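+    // Where molecules can span PBC boundaries, make whole-molecule coordinates
+    // available for the dipole calculation and do_force_lowlevel below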
+    gmx::ArrayRef<const gmx::RVec> xWholeMolecules;
+    if (fr->wholeMoleculeTransform)
     {
-        if (PAR(cr))
-        {
-            gmx_sumd(2 * DIM, mu, cr);
+        xWholeMolecules = fr->wholeMoleculeTransform->wholeMoleculeCoordinates(x.unpaddedArrayRef(), box);
+    }
 
-            ddBalanceRegionHandler.reopenRegionCpu();
-        }
+    DipoleData dipoleData;
 
-        for (i = 0; i < 2; i++)
-        {
-            for (j = 0; j < DIM; j++)
-            {
-                fr->mu_tot[i][j] = mu[i * DIM + j];
-            }
-        }
-    }
-    if (mdatoms->nChargePerturbed == 0)
-    {
-        copy_rvec(fr->mu_tot[0], mu_tot);
-    }
-    else
+    if (simulationWork.computeMuTot)
     {
-        for (j = 0; j < DIM; j++)
-        {
-            mu_tot[j] = (1.0 - lambda[efptCOUL]) * fr->mu_tot[0][j] + lambda[efptCOUL] * fr->mu_tot[1][j];
-        }
+        const int start = 0;
+
+        /* Calculate total (local) dipole moment in a temporary common array.
+         * This makes it possible to sum them over nodes faster.
+         */
+        gmx::ArrayRef<const gmx::RVec> xRef =
+                (xWholeMolecules.empty() ? x.unpaddedArrayRef() : xWholeMolecules);
+        calc_mu(start, mdatoms->homenr, xRef, mdatoms->chargeA, mdatoms->chargeB,
+                mdatoms->nChargePerturbed, dipoleData.muStaging[0], dipoleData.muStaging[1]);
+
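+        // Sum the local dipole contributions over all ranks and, with free energy,
+        // interpolate the total dipole between the A and B states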
+        reduceAndUpdateMuTot(&dipoleData, cr, (fr->efep != efepNO), lambda, muTotal, ddBalanceRegionHandler);
     }
 
     /* Reset energies */
@@ -1527,12 +1509,6 @@ void do_force(FILE*                               fplog,
         }
     }
 
-    /* update QMMMrec, if necessary */
-    if (fr->bQMMM)
-    {
-        update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
-    }
-
     // TODO Force flags should include haveFreeEnergyWork for this domain
     if (ddUsesGpuDirectCommunication && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
     {
@@ -1540,8 +1516,9 @@ void do_force(FILE*                               fplog,
         stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
     }
     /* Compute the bonded and non-bonded energies and optionally forces */
-    do_force_lowlevel(fr, inputrec, &(top->idef), cr, ms, nrnb, wcycle, mdatoms, x, hist, &forceOut, enerd,
-                      fcd, box, lambda.data(), graph, fr->mu_tot, stepWork, ddBalanceRegionHandler);
+    do_force_lowlevel(fr, inputrec, top->idef, cr, ms, nrnb, wcycle, mdatoms, x, xWholeMolecules,
+                      hist, &forceOut, enerd, fcd, box, lambda.data(),
+                      as_rvec_array(dipoleData.muStateAB), stepWork, ddBalanceRegionHandler);
 
     wallcycle_stop(wcycle, ewcFORCE);
 
@@ -1574,7 +1551,7 @@ void do_force(FILE*                               fplog,
                 wallcycle_stop(wcycle, ewcFORCE);
             }
 
-            if (useGpuFBufOps == BufferOpsUseGpu::True)
+            if (stepWork.useGpuFBufferOps)
             {
                 gmx::FixedCapacityVector<GpuEventSynchronizer*, 1> dependencyList;
 
@@ -1589,7 +1566,7 @@ void do_force(FILE*                               fplog,
                     stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(),
                                               AtomLocality::NonLocal);
                     dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(
-                            AtomLocality::NonLocal, useGpuFBufOps == BufferOpsUseGpu::True));
+                            AtomLocality::NonLocal, stepWork.useGpuFBufferOps));
                 }
 
                 nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::NonLocal, stateGpu->getForces(),
@@ -1633,11 +1610,11 @@ void do_force(FILE*                               fplog,
                 {
                     stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), AtomLocality::Local);
                 }
-                gpuHaloExchange->communicateHaloForces(domainWork.haveCpuLocalForceWork);
+                communicateGpuHaloForces(*cr, domainWork.haveCpuLocalForceWork);
             }
             else
             {
-                if (useGpuFBufOps == BufferOpsUseGpu::True)
+                if (stepWork.useGpuFBufferOps)
                 {
                     stateGpu->waitForcesReadyOnHost(AtomLocality::NonLocal);
                 }
@@ -1649,15 +1626,15 @@ void do_force(FILE*                               fplog,
     // With both nonbonded and PME offloaded a GPU on the same rank, we use
     // an alternating wait/reduction scheme.
     bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPmeOnThisRank && simulationWork.useGpuNonbonded
-                             && !DOMAINDECOMP(cr) && (useGpuFBufOps == BufferOpsUseGpu::False));
+                             && !DOMAINDECOMP(cr) && !stepWork.useGpuFBufferOps);
     if (alternateGpuWait)
     {
-        alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd, stepWork, pmeFlags, wcycle);
+        alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd, stepWork, wcycle);
     }
 
     if (!alternateGpuWait && useGpuPmeOnThisRank)
     {
-        pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceOut.forceWithVirial(), enerd);
+        pme_gpu_wait_and_reduce(fr->pmedata, stepWork, wcycle, &forceOut.forceWithVirial(), enerd);
     }
 
     /* Wait for local GPU NB outputs on the non-alternating wait path */
@@ -1743,7 +1720,7 @@ void do_force(FILE*                               fplog,
 
         gmx::ArrayRef<gmx::RVec> forceWithShift = forceOut.forceWithShiftForces().force();
 
-        if (useGpuFBufOps == BufferOpsUseGpu::True)
+        if (stepWork.useGpuFBufferOps)
         {
             // Flag to specify whether the CPU force buffer has contributions to
             // local atoms. This depends on whether there are CPU-based force tasks
@@ -1769,12 +1746,12 @@ void do_force(FILE*                               fplog,
                 auto locality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All;
 
                 stateGpu->copyForcesToGpu(forceWithShift, locality);
-                dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(
-                        locality, useGpuFBufOps == BufferOpsUseGpu::True));
+                dependencyList.push_back(
+                        stateGpu->getForcesReadyOnDeviceEvent(locality, stepWork.useGpuFBufferOps));
             }
             if (useGpuForcesHaloExchange)
             {
-                dependencyList.push_back(gpuHaloExchange->getForcesReadyOnDeviceEvent());
+                dependencyList.push_back(cr->dd->gpuHaloExchange[0]->getForcesReadyOnDeviceEvent());
             }
             nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr,
                                               dependencyList, stepWork.useGpuPmeFReduction,
@@ -1798,7 +1775,7 @@ void do_force(FILE*                               fplog,
     }
 
     launchGpuEndOfStepTasks(nbv, fr->gpuBonded, fr->pmedata, enerd, *runScheduleWork,
-                            simulationWork.useGpuNonbonded, useGpuPmeOnThisRank, step, wcycle);
+                            useGpuPmeOnThisRank, step, wcycle);
 
     if (DOMAINDECOMP(cr))
     {
@@ -1816,14 +1793,14 @@ void do_force(FILE*                               fplog,
         {
             rvec* fshift = as_rvec_array(forceOut.forceWithShiftForces().shiftForces().data());
             spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fshift, FALSE,
-                           nullptr, nrnb, &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
+                           nullptr, nrnb, top->idef, fr->pbcType, fr->bMolPBC, box, cr, wcycle);
         }
 
         if (stepWork.computeVirial)
         {
             /* Calculation of the virial must be done after vsites! */
             calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()),
-                        forceOut.forceWithShiftForces(), vir_force, graph, box, nrnb, fr, inputrec->ePBC);
+                        forceOut.forceWithShiftForces(), vir_force, box, nrnb, fr, inputrec->pbcType);
         }
     }
 
@@ -1841,7 +1818,7 @@ void do_force(FILE*                               fplog,
     if (stepWork.computeForces)
     {
         post_process_forces(cr, step, nrnb, wcycle, top, box, as_rvec_array(x.unpaddedArrayRef().data()),
-                            &forceOut, vir_force, mdatoms, graph, fr, vsite, stepWork);
+                            &forceOut, vir_force, mdatoms, fr, vsite, stepWork);
     }
 
     if (stepWork.computeEnergy)