Merge branch release-2016 into release-2018
[alexxy/gromacs.git] / src / gromacs / mdlib / sim_util.cpp
index 7c71829401c803004a47432df9b579b7c351d196..ee757e41813383d99063be53eb3a16f466be1a9e 100644 (file)
 
 #include "config.h"
 
-#include <assert.h>
-#include <math.h>
 #include <stdio.h>
 #include <string.h>
 
+#include <cmath>
+#include <cstdint>
+
 #include <array>
 
+#include "gromacs/awh/awh.h"
+#include "gromacs/domdec/dlbtiming.h"
 #include "gromacs/domdec/domdec.h"
 #include "gromacs/domdec/domdec_struct.h"
 #include "gromacs/essentialdynamics/edsam.h"
 #include "gromacs/ewald/pme.h"
 #include "gromacs/gmxlib/chargegroup.h"
-#include "gromacs/gmxlib/md_logging.h"
 #include "gromacs/gmxlib/network.h"
 #include "gromacs/gmxlib/nrnb.h"
 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/imd/imd.h"
 #include "gromacs/listed-forces/bonded.h"
 #include "gromacs/listed-forces/disre.h"
 #include "gromacs/mdlib/qmmm.h"
 #include "gromacs/mdlib/update.h"
 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
-#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h"
-#include "gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
-#include "gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
 #include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/forceoutput.h"
+#include "gromacs/mdtypes/iforceprovider.h"
 #include "gromacs/mdtypes/inputrec.h"
 #include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/state.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/pbcutil/mshift.h"
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/timing/wallcycle.h"
 #include "gromacs/timing/wallcyclereporting.h"
 #include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/basedefinitions.h"
 #include "gromacs/utility/cstringutil.h"
 #include "gromacs/utility/exceptions.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/gmxassert.h"
 #include "gromacs/utility/gmxmpi.h"
+#include "gromacs/utility/logger.h"
 #include "gromacs/utility/pleasecite.h"
 #include "gromacs/utility/smalloc.h"
 #include "gromacs/utility/sysinfo.h"
 
 #include "nbnxn_gpu.h"
+#include "nbnxn_kernels/nbnxn_kernel_cpu.h"
+#include "nbnxn_kernels/nbnxn_kernel_prune.h"
+
+// TODO: this environment variable allows us to verify before release
+// that on less common architectures the total cost of polling is not larger
+// than that of a blocking wait (so polling does not introduce overhead when
+// the static PME-first ordering would suffice).
+static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
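+// (Setting e.g. GMX_DISABLE_ALTERNATING_GPU_WAIT=1 in the environment thus
+// falls back to the static wait order instead of the polling alternation.)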
+
 
 void print_time(FILE                     *out,
                 gmx_walltime_accounting_t walltime_accounting,
@@ -204,90 +218,32 @@ void print_start(FILE *fplog, t_commrec *cr,
                         walltime_accounting_get_start_time_stamp(walltime_accounting));
 }
 
-static void sum_forces(int start, int end, rvec f[], rvec flr[])
+static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
 {
-    int i;
+    const int end = forceToAdd.size();
 
-    if (gmx_debug_at)
-    {
-        pr_rvecs(debug, 0, "fsr", f+start, end-start);
-        pr_rvecs(debug, 0, "flr", flr+start, end-start);
-    }
     // cppcheck-suppress unreadVariable
     int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
 #pragma omp parallel for num_threads(nt) schedule(static)
-    for (i = start; i < end; i++)
+    for (int i = 0; i < end; i++)
     {
-        rvec_inc(f[i], flr[i]);
+        rvec_inc(f[i], forceToAdd[i]);
     }
 }
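+
+/* Illustrative only: the same element-wise OpenMP reduction pattern as
+ * sum_forces() above, written as a self-contained sketch with plain arrays
+ * (all names hypothetical). Each atom is written by exactly one thread,
+ * so no atomics or critical sections are needed.
+ */
+#if 0
+static void sumForcesSketch(int n, float (*f)[3], const float (*forceToAdd)[3])
+{
+#pragma omp parallel for schedule(static)
+    for (int i = 0; i < n; i++)
+    {
+        for (int d = 0; d < 3; d++)
+        {
+            f[i][d] += forceToAdd[i][d];
+        }
+    }
+}
+#endif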
 
-/*
- * calc_f_el calculates forces due to an electric field.
- *
- * force is kJ mol^-1 nm^-1 = e * kJ mol^-1 nm^-1 / e
- *
- * Et[] contains the parameters for the time dependent
- * part of the field.
- * Ex[] contains the parameters for
- * the spatial dependent part of the field.
- * The function should return the energy due to the electric field
- * (if any) but for now returns 0.
- *
- * WARNING:
- * There can be problems with the virial.
- * Since the field is not self-consistent this is unavoidable.
- * For neutral molecules the virial is correct within this approximation.
- * For neutral systems with many charged molecules the error is small.
- * But for systems with a net charge or a few charged molecules
- * the error can be significant when the field is high.
- * Solution: implement a self-consistent electric field into PME.
- */
-static void calc_f_el(FILE *fp, int  start, int homenr,
-                      real charge[], rvec f[],
-                      t_cosines Ex[], t_cosines Et[], double t)
+static void pme_gpu_reduce_outputs(gmx_wallcycle_t            wcycle,
+                                   ForceWithVirial           *forceWithVirial,
+                                   gmx::ArrayRef<const gmx::RVec> pmeForces,
+                                   gmx_enerdata_t            *enerd,
+                                   const tensor               vir_Q,
+                                   real                       Vlr_q)
 {
-    rvec Ext;
-    real t0;
-    int  i, m;
-
-    for (m = 0; (m < DIM); m++)
-    {
-        if (Et[m].n > 0)
-        {
-            if (Et[m].n == 3)
-            {
-                t0     = Et[m].a[1];
-                Ext[m] = std::cos(Et[m].a[0]*(t-t0))*std::exp(-gmx::square(t-t0)/(2.0*gmx::square(Et[m].a[2])));
-            }
-            else
-            {
-                Ext[m] = std::cos(Et[m].a[0]*t);
-            }
-        }
-        else
-        {
-            Ext[m] = 1.0;
-        }
-        if (Ex[m].n > 0)
-        {
-            /* Convert the field strength from V/nm to MD-units */
-            Ext[m] *= Ex[m].a[0]*FIELDFAC;
-            for (i = start; (i < start+homenr); i++)
-            {
-                f[i][m] += charge[i]*Ext[m];
-            }
-        }
-        else
-        {
-            Ext[m] = 0;
-        }
-    }
-    if (fp != NULL)
-    {
-        fprintf(fp, "%10g  %10g  %10g  %10g #FIELD\n", t,
-                Ext[XX]/FIELDFAC, Ext[YY]/FIELDFAC, Ext[ZZ]/FIELDFAC);
-    }
+    wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
+    GMX_ASSERT(forceWithVirial, "Invalid force pointer");
+    forceWithVirial->addVirialContribution(vir_Q);
+    enerd->term[F_COUL_RECIP] += Vlr_q;
+    sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
+    wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
 }
 
 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
@@ -297,7 +253,6 @@ static void calc_virial(int start, int homenr, rvec x[], rvec f[],
     int    i;
 
     /* The short-range virial from surrounding boxes */
-    clear_mat(vir_part);
     calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
     inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
 
@@ -307,12 +262,6 @@ static void calc_virial(int start, int homenr, rvec x[], rvec f[],
     f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
     inc_nrnb(nrnb, eNR_VIRIAL, homenr);
 
-    /* Add position restraint contribution */
-    for (i = 0; i < DIM; i++)
-    {
-        vir_part[i][i] += fr->vir_diag_posres[i];
-    }
-
     /* Add wall contribution */
     for (i = 0; i < DIM; i++)
     {
@@ -328,8 +277,7 @@ static void calc_virial(int start, int homenr, rvec x[], rvec f[],
 static void pull_potential_wrapper(t_commrec *cr,
                                    t_inputrec *ir,
                                    matrix box, rvec x[],
-                                   rvec f[],
-                                   tensor vir_force,
+                                   ForceWithVirial *force,
                                    t_mdatoms *mdatoms,
                                    gmx_enerdata_t *enerd,
                                    real *lambda,
@@ -341,23 +289,21 @@ static void pull_potential_wrapper(t_commrec *cr,
 
     /* Calculate the center of mass forces; this requires communication,
      * which is why pull_potential is called close to other communication.
-     * The virial contribution is calculated directly,
-     * which is why we call pull_potential after calc_virial.
      */
     wallcycle_start(wcycle, ewcPULLPOT);
     set_pbc(&pbc, ir->ePBC, box);
     dvdl                     = 0;
     enerd->term[F_COM_PULL] +=
         pull_potential(ir->pull_work, mdatoms, &pbc,
-                       cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
+                       cr, t, lambda[efptRESTRAINT], x, force, &dvdl);
     enerd->dvdl_lin[efptRESTRAINT] += dvdl;
     wallcycle_stop(wcycle, ewcPULLPOT);
 }
 
-static void pme_receive_force_ener(t_commrec      *cr,
-                                   gmx_wallcycle_t wcycle,
-                                   gmx_enerdata_t *enerd,
-                                   t_forcerec     *fr)
+static void pme_receive_force_ener(t_commrec       *cr,
+                                   ForceWithVirial *forceWithVirial,
+                                   gmx_enerdata_t  *enerd,
+                                   gmx_wallcycle_t  wcycle)
 {
     real   e_q, e_lj, dvdl_q, dvdl_lj;
     float  cycles_ppdpme, cycles_seppme;
@@ -371,8 +317,7 @@ static void pme_receive_force_ener(t_commrec      *cr,
     wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
     dvdl_q  = 0;
     dvdl_lj = 0;
-    gmx_pme_receive_f(cr, fr->f_novirsum, fr->vir_el_recip, &e_q,
-                      fr->vir_lj_recip, &e_lj, &dvdl_q, &dvdl_lj,
+    gmx_pme_receive_f(cr, forceWithVirial, &e_q, &e_lj, &dvdl_q, &dvdl_lj,
                       &cycles_seppme);
     enerd->term[F_COUL_RECIP] += e_q;
     enerd->term[F_LJ_RECIP]   += e_lj;
@@ -387,24 +332,34 @@ static void pme_receive_force_ener(t_commrec      *cr,
 }
 
 static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
-                               gmx_int64_t step, real pforce, rvec *x, rvec *f)
+                               gmx_int64_t step, real forceTolerance,
+                               const rvec *x, const rvec *f)
 {
-    int  i;
-    real pf2, fn2;
-    char buf[STEPSTRSIZE];
-
-    pf2 = gmx::square(pforce);
-    for (i = 0; i < md->homenr; i++)
+    real           force2Tolerance = gmx::square(forceTolerance);
+    std::uintmax_t numNonFinite    = 0;
+    for (int i = 0; i < md->homenr; i++)
     {
-        fn2 = norm2(f[i]);
-        /* We also catch NAN, if the compiler does not optimize this away. */
-        if (fn2 >= pf2 || fn2 != fn2)
+        real force2    = norm2(f[i]);
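+        /* NaN or Inf in any force component propagates into the squared
+         * norm, so a single std::isfinite() check covers all components. */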
+        bool nonFinite = !std::isfinite(force2);
+        if (force2 >= force2Tolerance || nonFinite)
+        {
+            fprintf(fp, "step %" GMX_PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
+                    step,
+                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
+        }
+        if (nonFinite)
         {
-            fprintf(fp, "step %s  atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
-                    gmx_step_str(step, buf),
-                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(fn2));
+            numNonFinite++;
         }
     }
+    if (numNonFinite > 0)
+    {
+        /* Note that with MPI this fatal call on one rank might interrupt
+         * the printing on other ranks. But we can only avoid that with
+         * an expensive MPI barrier that we would need at each step.
+         */
+        gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
+    }
 }
 
 static void post_process_forces(t_commrec *cr,
@@ -413,14 +368,17 @@ static void post_process_forces(t_commrec *cr,
                                 gmx_localtop_t *top,
                                 matrix box, rvec x[],
                                 rvec f[],
+                                ForceWithVirial *forceWithVirial,
                                 tensor vir_force,
                                 t_mdatoms *mdatoms,
                                 t_graph *graph,
                                 t_forcerec *fr, gmx_vsite_t *vsite,
                                 int flags)
 {
-    if (fr->bF_NoVirSum)
+    if (fr->haveDirectVirialContributions)
     {
+        rvec *fDirectVir = as_rvec_array(forceWithVirial->force_.data());
+
         if (vsite)
         {
             /* Spread the mesh force on virtual sites to the other particles...
@@ -428,34 +386,24 @@ static void post_process_forces(t_commrec *cr,
              * if the constructing atoms aren't local.
              */
             wallcycle_start(wcycle, ewcVSITESPREAD);
-            spread_vsite_f(vsite, x, fr->f_novirsum, NULL,
-                           (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
+            matrix virial = { { 0 } };
+            spread_vsite_f(vsite, x, fDirectVir, nullptr,
+                           (flags & GMX_FORCE_VIRIAL), virial,
                            nrnb,
                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+            forceWithVirial->addVirialContribution(virial);
             wallcycle_stop(wcycle, ewcVSITESPREAD);
         }
+
         if (flags & GMX_FORCE_VIRIAL)
         {
             /* Now add the forces, this is local */
-            if (fr->bDomDec)
-            {
-                sum_forces(0, fr->f_novirsum_n, f, fr->f_novirsum);
-            }
-            else
-            {
-                sum_forces(0, mdatoms->homenr,
-                           f, fr->f_novirsum);
-            }
-            if (EEL_FULL(fr->eeltype))
-            {
-                /* Add the mesh contribution to the virial */
-                m_add(vir_force, fr->vir_el_recip, vir_force);
-            }
-            if (EVDW_PME(fr->vdwtype))
-            {
-                /* Add the mesh contribution to the virial */
-                m_add(vir_force, fr->vir_lj_recip, vir_force);
-            }
+            sum_forces(f, forceWithVirial->force_);
+
+            /* Add the direct virial contributions */
+            GMX_ASSERT(forceWithVirial->computeVirial_, "forceWithVirial should request virial computation when we request the virial");
+            m_add(vir_force, forceWithVirial->getVirial(), vir_force);
+
             if (debug)
             {
                 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
@@ -474,20 +422,18 @@ static void do_nb_verlet(t_forcerec *fr,
                          gmx_enerdata_t *enerd,
                          int flags, int ilocality,
                          int clearF,
+                         gmx_int64_t step,
                          t_nrnb *nrnb,
                          gmx_wallcycle_t wcycle)
 {
-    int                        enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
-    nonbonded_verlet_group_t  *nbvg;
-    gmx_bool                   bUsingGpuKernels;
-
     if (!(flags & GMX_FORCE_NONBONDED))
     {
         /* skip non-bonded calculation */
         return;
     }
 
-    nbvg = &fr->nbv->grp[ilocality];
+    nonbonded_verlet_t       *nbv  = fr->nbv;
+    nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
 
     /* GPU kernel launch overhead is already timed separately */
     if (fr->cutoff_scheme != ecutsVERLET)
@@ -495,17 +441,35 @@ static void do_nb_verlet(t_forcerec *fr,
         gmx_incons("Invalid cut-off scheme passed!");
     }
 
-    bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
+    bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
 
     if (!bUsingGpuKernels)
     {
+        /* When dynamic pair-list pruning is requested, we need to prune
+         * every nstlistPrune steps.
+         */
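+        /* E.g. with nstlistPrune = 10, a pair list created at step s is
+         * pruned at steps s, s+10, s+20, ... for as long as that outer
+         * list is in use.
+         */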
+        if (nbv->listParams->useDynamicPruning &&
+            (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
+        {
+            /* Prune the pair-list beyond fr->ic->rlistPrune using
+             * the current coordinates of the atoms.
+             */
+            wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
+            nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
+            wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
+        }
+
         wallcycle_sub_start(wcycle, ewcsNONBONDED);
     }
+
     switch (nbvg->kernel_type)
     {
         case nbnxnk4x4_PlainC:
-            nbnxn_kernel_ref(&nbvg->nbl_lists,
-                             nbvg->nbat, ic,
+        case nbnxnk4xN_SIMD_4xN:
+        case nbnxnk4xN_SIMD_2xNN:
+            nbnxn_kernel_cpu(nbvg,
+                             nbv->nbat,
+                             ic,
                              fr->shift_vec,
                              flags,
                              clearF,
@@ -516,44 +480,17 @@ static void do_nb_verlet(t_forcerec *fr,
                              enerd->grpp.ener[egLJSR]);
             break;
 
-        case nbnxnk4xN_SIMD_4xN:
-            nbnxn_kernel_simd_4xn(&nbvg->nbl_lists,
-                                  nbvg->nbat, ic,
-                                  nbvg->ewald_excl,
-                                  fr->shift_vec,
-                                  flags,
-                                  clearF,
-                                  fr->fshift[0],
-                                  enerd->grpp.ener[egCOULSR],
-                                  fr->bBHAM ?
-                                  enerd->grpp.ener[egBHAMSR] :
-                                  enerd->grpp.ener[egLJSR]);
-            break;
-        case nbnxnk4xN_SIMD_2xNN:
-            nbnxn_kernel_simd_2xnn(&nbvg->nbl_lists,
-                                   nbvg->nbat, ic,
-                                   nbvg->ewald_excl,
-                                   fr->shift_vec,
-                                   flags,
-                                   clearF,
-                                   fr->fshift[0],
-                                   enerd->grpp.ener[egCOULSR],
-                                   fr->bBHAM ?
-                                   enerd->grpp.ener[egBHAMSR] :
-                                   enerd->grpp.ener[egLJSR]);
-            break;
-
         case nbnxnk8x8x8_GPU:
-            nbnxn_gpu_launch_kernel(fr->nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
+            nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
             break;
 
         case nbnxnk8x8x8_PlainC:
             nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
-                                 nbvg->nbat, ic,
+                                 nbv->nbat, ic,
                                  fr->shift_vec,
                                  flags,
                                  clearF,
-                                 nbvg->nbat->out[0].f,
+                                 nbv->nbat->out[0].f,
                                  fr->fshift[0],
                                  enerd->grpp.ener[egCOULSR],
                                  fr->bBHAM ?
@@ -562,7 +499,7 @@ static void do_nb_verlet(t_forcerec *fr,
             break;
 
         default:
-            gmx_incons("Invalid nonbonded kernel type passed!");
+            GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
 
     }
     if (!bUsingGpuKernels)
@@ -570,12 +507,13 @@ static void do_nb_verlet(t_forcerec *fr,
         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
     }
 
+    int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
     if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
     {
         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
     }
     else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
-             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(fr->nbv->gpu_nbv)))
+             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
     {
         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
     }
@@ -669,7 +607,7 @@ static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
         dvdl_nb[i]  = 0;
     }
 
-    assert(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl);
+    GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "The number of pair lists should be equal to the number of non-bonded threads");
 
     wallcycle_sub_start(wcycle, ewcsNONBONDED);
 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
@@ -733,7 +671,7 @@ static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
 
 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
 {
-    return nbv != NULL && nbv->bUseGPU;
+    return nbv != nullptr && nbv->bUseGPU;
 }
 
 static gmx_inline void clear_rvecs_omp(int n, rvec v[])
@@ -761,63 +699,400 @@ static gmx_inline void clear_rvecs_omp(int n, rvec v[])
     }
 }
 
-/*! \brief  This routine checks if the potential energy is finite.
+/*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
+ *
+ * \param groupOptions  Group options, containing T-coupling options
+ */
+static real averageKineticEnergyEstimate(const t_grpopts &groupOptions)
+{
+    real nrdfCoupled   = 0;
+    real nrdfUncoupled = 0;
+    real kineticEnergy = 0;
+    for (int g = 0; g < groupOptions.ngtc; g++)
+    {
+        if (groupOptions.tau_t[g] >= 0)
+        {
+            nrdfCoupled   += groupOptions.nrdf[g];
+            kineticEnergy += groupOptions.nrdf[g]*0.5*groupOptions.ref_t[g]*BOLTZ;
+        }
+        else
+        {
+            nrdfUncoupled += groupOptions.nrdf[g];
+        }
+    }
+
+    /* This conditional with > also catches nrdf=0 */
+    if (nrdfCoupled > nrdfUncoupled)
+    {
+        return kineticEnergy*(nrdfCoupled + nrdfUncoupled)/nrdfCoupled;
+    }
+    else
+    {
+        return 0;
+    }
+}
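+
+/* Worked example (illustrative numbers): a single T-coupled group with
+ * nrdf = 3000 degrees of freedom at ref_t = 300 K gives
+ *   kineticEnergy = 0.5*3000*BOLTZ*300 ~= 3.7e3 kJ/mol,
+ * and with nrdfUncoupled = 0 the scaling factor
+ * (nrdfCoupled + nrdfUncoupled)/nrdfCoupled equals 1.
+ */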
+
+/*! \brief This routine checks that the potential energy is finite.
  *
+ * Always checks that the potential energy is finite. If step equals
+ * inputrec.init_step, it also checks that the magnitude of the potential
+ * energy is reasonable. Terminates with a fatal error when a check fails.
  * Note that passing this check does not guarantee finite forces,
 * since those use slightly different arithmetic. But in most cases
  * there is just a narrow coordinate range where forces are not finite
  * and energies are finite.
  *
- * \param[in] enerd  The energy data; the non-bonded group energies need to be added in here before calling this routine
+ * \param[in] step      The step number, used for checking and printing
+ * \param[in] enerd     The energy data; the non-bonded group energies need to be added to enerd.term[F_EPOT] before calling this routine
+ * \param[in] inputrec  The input record
+ */
+static void checkPotentialEnergyValidity(gmx_int64_t           step,
+                                         const gmx_enerdata_t &enerd,
+                                         const t_inputrec     &inputrec)
+{
+    /* Threshold valid for comparing absolute potential energy against
+     * the kinetic energy. Normally one should not consider absolute
+     * potential energy values, but with a factor of one million
+     * we should never get false positives.
+     */
+    constexpr real c_thresholdFactor = 1e6;
+
+    bool           energyIsNotFinite    = !std::isfinite(enerd.term[F_EPOT]);
+    real           averageKineticEnergy = 0;
+    /* We only check for large potential energy at the initial step,
+     * because that is by far the most likely step for this to occur
+     * and because computing the average kinetic energy is not free.
+     * Note: nstcalcenergy >> 1 often does not allow us to catch large energies
+     * before they become NaN.
+     */
+    if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
+    {
+        averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
+    }
+
+    if (energyIsNotFinite || (averageKineticEnergy > 0 &&
+                              enerd.term[F_EPOT] > c_thresholdFactor*averageKineticEnergy))
+    {
+        gmx_fatal(FARGS, "Step %" GMX_PRId64 ": The total potential energy is %g, which is %s. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A %s potential energy can be caused by overlapping interactions in bonded interactions or very large%s coordinate values. Usually this is caused by a badly or non-equilibrated initial configuration, or by incorrect interactions or parameters in the topology.",
+                  step,
+                  enerd.term[F_EPOT],
+                  energyIsNotFinite ? "not finite" : "extremely high",
+                  enerd.term[F_LJ],
+                  enerd.term[F_COUL_SR],
+                  energyIsNotFinite ? "non-finite" : "very high",
+                  energyIsNotFinite ? " or NaN" : "");
+    }
+}
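+
+/* Continuing the worked example above: with an average kinetic energy
+ * estimate of ~3.7e3 kJ/mol, the initial-step check only triggers for a
+ * potential energy above c_thresholdFactor*3.7e3 ~= 3.7e9 kJ/mol, so
+ * ordinary, well-behaved energies can never trip it.
+ */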
+
+/*! \brief Compute forces and/or energies for special algorithms
+ *
+ * The intention is to collect all calls to algorithms that compute
+ * forces on local atoms only and that do not contribute to the local
+ * virial sum (but add their virial contribution separately).
+ * Eventually these should likely all become ForceProviders.
+ * Within this function, algorithms that do global communication are placed
+ * at the end, so that global barriers within the MD loop are as close
+ * together as possible.
+ *
+ * \param[in]     fplog            The log file
+ * \param[in]     cr               The communication record
+ * \param[in]     inputrec         The input record
+ * \param[in]     step             The current MD step
+ * \param[in]     t                The current time
+ * \param[in,out] wcycle           Wallcycle accounting struct
+ * \param[in,out] forceProviders   Pointer to a list of force providers
+ * \param[in]     box              The unit cell
+ * \param[in]     x                The coordinates
+ * \param[in]     mdatoms          Per atom properties
+ * \param[in]     lambda           Array of free-energy lambda values
+ * \param[in]     forceFlags       Flags that tell whether we should compute forces/energies/virial
+ * \param[in,out] forceWithVirial  Force and virial buffers
+ * \param[in,out] enerd            Energy buffer
+ * \param[in,out] ed               Essential dynamics pointer
+ * \param[in]     bNS              Tells if we did neighbor searching this step, used for ED sampling
+ *
+ * \todo Remove bNS, which is used incorrectly.
+ * \todo Convert all other algorithms called here to ForceProviders.
+ */
+static void
+computeSpecialForces(FILE             *fplog,
+                     t_commrec        *cr,
+                     t_inputrec       *inputrec,
+                     gmx_int64_t       step,
+                     double            t,
+                     gmx_wallcycle_t   wcycle,
+                     ForceProviders   *forceProviders,
+                     matrix            box,
+                     rvec             *x,
+                     t_mdatoms        *mdatoms,
+                     real             *lambda,
+                     int               forceFlags,
+                     ForceWithVirial  *forceWithVirial,
+                     gmx_enerdata_t   *enerd,
+                     gmx_edsam_t       ed,
+                     gmx_bool          bNS)
+{
+    const bool computeForces = (forceFlags & GMX_FORCE_FORCES);
+
+    /* NOTE: Currently all ForceProviders only provide forces.
+     *       When they also provide energies, remove this conditional.
+     */
+    if (computeForces)
+    {
+        /* Collect forces from modules */
+        forceProviders->calculateForces(cr, mdatoms, box, t, x, forceWithVirial);
+    }
+
+    if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
+    {
+        pull_potential_wrapper(cr, inputrec, box, x,
+                               forceWithVirial,
+                               mdatoms, enerd, lambda, t,
+                               wcycle);
+
+        if (inputrec->bDoAwh)
+        {
+            Awh &awh = *inputrec->awh;
+            enerd->term[F_COM_PULL] +=
+                awh.applyBiasForcesAndUpdateBias(inputrec->ePBC, *mdatoms, box,
+                                                 forceWithVirial,
+                                                 t, step, wcycle, fplog);
+        }
+    }
+
+    rvec *f = as_rvec_array(forceWithVirial->force_.data());
+
+    /* Add the forces from enforced rotation potentials (if any) */
+    if (inputrec->bRot)
+    {
+        wallcycle_start(wcycle, ewcROTadd);
+        enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
+        wallcycle_stop(wcycle, ewcROTadd);
+    }
+
+    if (ed)
+    {
+        /* Note that since init_edsam() is called after the initialization
+         * of forcerec, edsam doesn't request the noVirSum force buffer.
+         * Thus if no other algorithm (e.g. PME) requires it, the forces
+         * here will contribute to the virial.
+         */
+        do_flood(cr, inputrec, x, f, ed, box, step, bNS);
+    }
+
+    /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
+    if (inputrec->bIMD && computeForces)
+    {
+        IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
+    }
+}
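+
+/* Illustrative only: a minimal hypothetical ForceProvider, sketching how one
+ * of the algorithms called above could instead plug into the forceProviders
+ * list (see the \todo above). The interface name and signature are inferred
+ * from the calculateForces() call in computeSpecialForces(); treat this as a
+ * sketch, not as the module API.
+ */
+#if 0
+class ConstantPushProvider : public IForceProvider
+{
+    public:
+        //! Adds a constant, hypothetical force along z to each local atom.
+        void calculateForces(const t_commrec *cr, const t_mdatoms *mdatoms,
+                             const matrix box, double t, const rvec *x,
+                             ForceWithVirial *forceWithVirial) override
+        {
+            rvec *f = as_rvec_array(forceWithVirial->force_.data());
+            for (int i = 0; i < mdatoms->homenr; i++)
+            {
+                f[i][ZZ] += 1.0; /* kJ mol^-1 nm^-1; hypothetical magnitude */
+            }
+        }
+};
+#endif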
+
+/*! \brief Launch the prepare_step and spread stages of PME GPU.
+ *
+ * \param[in]  pmedata       The PME structure
+ * \param[in]  box           The box matrix
+ * \param[in]  x             Coordinate array
+ * \param[in]  flags         Force flags
+ * \param[in]  wcycle        The wallcycle structure
+ */
+static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
+                                      matrix          box,
+                                      rvec            x[],
+                                      int             flags,
+                                      gmx_wallcycle_t wcycle)
+{
+    int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE;
+    pmeFlags |= (flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0;
+    pmeFlags |= (flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0;
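+    /* E.g. flags == (GMX_FORCE_FORCES | GMX_FORCE_VIRIAL) composes pmeFlags ==
+     * (GMX_PME_SPREAD | GMX_PME_SOLVE | GMX_PME_CALC_F | GMX_PME_CALC_ENER_VIR). */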
+
+    pme_gpu_prepare_computation(pmedata, flags & GMX_FORCE_DYNAMICBOX, box, wcycle, pmeFlags);
+    pme_gpu_launch_spread(pmedata, x, wcycle);
+}
+
+/*! \brief Launch the FFT and gather stages of PME GPU
+ *
+ * This function only implements setting the output forces (no accumulation).
+ *
+ * \param[in]  pmedata        The PME structure
+ * \param[in]  wcycle         The wallcycle structure
  */
-static void checkPotentialEnergyValidity(const gmx_enerdata_t *enerd)
+static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
+                                     gmx_wallcycle_t   wcycle)
 {
-    if (!std::isfinite(enerd->term[F_EPOT]))
+    pme_gpu_launch_complex_transforms(pmedata, wcycle);
+    pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
+}
+
+/*! \brief
+ *  Polling wait for either of the PME or nonbonded GPU tasks.
+ *
+ * Instead of a static order in waiting for GPU tasks, this function
+ * polls to check which of the two tasks completes first, and performs the
+ * associated force-buffer reduction overlapped with the other task.
+ * By doing so, unlike a static scheduling order, it can always overlap
+ * one of the reductions, regardless of the GPU task completion order.
+ *
+ * \param[in]     nbv              Nonbonded verlet structure
+ * \param[in]     pmedata          PME module data
+ * \param[in,out] force            Force array to reduce task outputs into.
+ * \param[in,out] forceWithVirial  Force and virial buffers
+ * \param[in,out] fshift           Shift force output vector results are reduced into
+ * \param[in,out] enerd            Energy data structure results are reduced into
+ * \param[in]     flags            Force flags
+ * \param[in]     wcycle           The wallcycle structure
+ */
+static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t             *nbv,
+                                        const gmx_pme_t                *pmedata,
+                                        gmx::PaddedArrayRef<gmx::RVec> *force,
+                                        ForceWithVirial                *forceWithVirial,
+                                        rvec                            fshift[],
+                                        gmx_enerdata_t                 *enerd,
+                                        int                             flags,
+                                        gmx_wallcycle_t                 wcycle)
+{
+    bool isPmeGpuDone = false;
+    bool isNbGpuDone  = false;
+
+    gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
+
+    while (!isPmeGpuDone || !isNbGpuDone)
     {
-        gmx_fatal(FARGS, "The total potential energy is %g, which is not finite. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A non-finite potential energy can be caused by overlapping interactions in bonded interactions or very large or NaN coordinate values. Usually this is caused by a badly or non-equilibrated initial configuration or incorrect interactions or parameters in the topology.",
-                  enerd->term[F_EPOT],
-                  enerd->term[F_LJ],
-                  enerd->term[F_COUL_SR]);
+        if (!isPmeGpuDone)
+        {
+            matrix            vir_Q;
+            real              Vlr_q;
+
+            GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
+            isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
+                                                   vir_Q, &Vlr_q, completionType);
+
+            if (isPmeGpuDone)
+            {
+                pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
+                                       enerd, vir_Q, Vlr_q);
+            }
+        }
+
+        if (!isNbGpuDone)
+        {
+            GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
+            wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
+            isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
+                                                    flags, eatLocal,
+                                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+                                                    fshift, completionType);
+            wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+            // To get the call count right, when the task finishes we
+            // issue a start/stop pair.
+            // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
+            // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
+            if (isNbGpuDone)
+            {
+                wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
+                wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+
+                wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+                wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
+                                               nbv->nbat, as_rvec_array(force->data()));
+                wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+                wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+            }
+        }
+    }
+}
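+
+/* Illustrative only: the alternating poll/wait pattern above, reduced to a
+ * self-contained sketch with two hypothetical task handles. Whichever task
+ * is still pending once the other has completed is waited on (blocking)
+ * rather than polled, so one of the two reductions always overlaps the
+ * remaining GPU task.
+ */
+#if 0
+enum class Completion { Check, Wait };
+
+template<typename TaskA, typename TaskB>
+void alternatingWaitSketch(TaskA &a, TaskB &b)
+{
+    bool aDone = false;
+    bool bDone = false;
+    while (!aDone || !bDone)
+    {
+        if (!aDone)
+        {
+            // Poll while b is still pending; block once b has completed.
+            aDone = a.tryFinish(bDone ? Completion::Wait : Completion::Check);
+            if (aDone)
+            {
+                a.reduceOutputs(); // overlaps with b if b is still running
+            }
+        }
+        if (!bDone)
+        {
+            bDone = b.tryFinish(aDone ? Completion::Wait : Completion::Check);
+            if (bDone)
+            {
+                b.reduceOutputs();
+            }
+        }
+    }
+}
+#endif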
+
+/*! \brief
+ *  Launch the dynamic rolling pruning GPU task.
+ *
+ *  We currently alternate local/non-local list pruning at even/odd steps
+ *  (only pruning every second step without DD).
+ *
+ * \param[in]     cr               The communication record
+ * \param[in]     nbv              Nonbonded verlet structure
+ * \param[in]     inputrec         The input record
+ * \param[in]     step             The current MD step
+ */
+static inline void launchGpuRollingPruning(const t_commrec          *cr,
+                                           const nonbonded_verlet_t *nbv,
+                                           const t_inputrec         *inputrec,
+                                           const gmx_int64_t         step)
+{
+    /* We should not launch the rolling pruning kernel at a search
+     * step or just before search steps, since that's useless.
+     * Without domain decomposition we prune at even steps.
+     * With domain decomposition we alternate local and non-local
+     * pruning at even and odd steps.
+     */
+    int  numRollingParts     = nbv->listParams->numRollingParts;
+    GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
+    int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
+    bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
+    if (stepWithCurrentList > 0 &&
+        stepWithCurrentList < inputrec->nstlist - 1 &&
+        (stepIsEven || DOMAINDECOMP(cr)))
+    {
+        nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
+                                          stepIsEven ? eintLocal : eintNonlocal,
+                                          numRollingParts);
     }
 }
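+
+/* Example schedule: with nstlist = 10 and domain decomposition, rolling
+ * pruning is launched for stepWithCurrentList = 1..8, pruning the local
+ * list on even steps and the non-local list on odd steps; without domain
+ * decomposition only the even (local) steps launch. The search step (0)
+ * and the step before the next search (nstlist - 1) are skipped. */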
 
-void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
-                         t_inputrec *inputrec,
-                         gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
-                         gmx_localtop_t *top,
-                         gmx_groups_t gmx_unused *groups,
-                         matrix box, rvec x[], history_t *hist,
-                         rvec f[],
-                         tensor vir_force,
-                         t_mdatoms *mdatoms,
-                         gmx_enerdata_t *enerd, t_fcdata *fcd,
-                         real *lambda, t_graph *graph,
-                         t_forcerec *fr, interaction_const_t *ic,
-                         gmx_vsite_t *vsite, rvec mu_tot,
-                         double t, FILE *field, gmx_edsam_t ed,
-                         gmx_bool bBornRadii,
-                         int flags)
+static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
+                                t_inputrec *inputrec,
+                                gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                                gmx_localtop_t *top,
+                                gmx_groups_t gmx_unused *groups,
+                                matrix box, gmx::PaddedArrayRef<gmx::RVec> x, history_t *hist,
+                                gmx::PaddedArrayRef<gmx::RVec> force,
+                                tensor vir_force,
+                                t_mdatoms *mdatoms,
+                                gmx_enerdata_t *enerd, t_fcdata *fcd,
+                                real *lambda, t_graph *graph,
+                                t_forcerec *fr, interaction_const_t *ic,
+                                gmx_vsite_t *vsite, rvec mu_tot,
+                                double t, gmx_edsam_t ed,
+                                gmx_bool bBornRadii,
+                                int flags,
+                                DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
+                                DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
 {
     int                 cg1, i, j;
-    int                 start, homenr;
     double              mu[2*DIM];
     gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
     gmx_bool            bDoForces, bUseGPU, bUseOrEmulGPU;
-    gmx_bool            bDiffKernels = FALSE;
     rvec                vzero, box_diag;
-    float               cycles_pme, cycles_force, cycles_wait_gpu;
-    /* TODO To avoid loss of precision, float can't be used for a
-     * cycle count. Build an object that can do this right and perhaps
-     * also be used by gmx_wallcycle_t */
-    gmx_cycles_t        cycleCountBeforeLocalWorkCompletes = 0;
-    nonbonded_verlet_t *nbv;
-
-    cycles_force    = 0;
+    float               cycles_pme, cycles_wait_gpu;
+    nonbonded_verlet_t *nbv = fr->nbv;
+
+    bStateChanged = (flags & GMX_FORCE_STATECHANGED);
+    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
+    bFillGrid     = (bNS && bStateChanged);
+    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
+    bDoForces     = (flags & GMX_FORCE_FORCES);
+    bUseGPU       = fr->nbv->bUseGPU;
+    bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
+
+    const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
+    // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
+    const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
+        ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
+
+    /* At a search step we need to start the first balancing region
+     * somewhere early inside the step after communication during domain
+     * decomposition (and not during the previous step as usual).
+     */
+    if (bNS &&
+        ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
+    {
+        ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
+    }
+
     cycles_wait_gpu = 0;
-    nbv             = fr->nbv;
 
-    start  = 0;
-    homenr = mdatoms->homenr;
+    const int start  = 0;
+    const int homenr = mdatoms->homenr;
 
     clear_mat(vir_force);
 
@@ -834,14 +1109,6 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         cg1--;
     }
 
-    bStateChanged = (flags & GMX_FORCE_STATECHANGED);
-    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
-    bFillGrid     = (bNS && bStateChanged);
-    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
-    bDoForces     = (flags & GMX_FORCE_FORCES);
-    bUseGPU       = fr->nbv->bUseGPU;
-    bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbnxnk8x8x8_PlainC);
-
     if (bStateChanged)
     {
         update_forcerec(fr, box);
@@ -869,67 +1136,47 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
 
         if (bCalcCGCM)
         {
-            put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
+            put_atoms_in_box_omp(fr->ePBC, box, x.subArray(0, homenr));
             inc_nrnb(nrnb, eNR_SHIFTX, homenr);
         }
         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
         {
-            unshift_self(graph, box, x);
+            unshift_self(graph, box, as_rvec_array(x.data()));
         }
     }
 
     nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
-                                 fr->shift_vec, nbv->grp[0].nbat);
+                                 fr->shift_vec, nbv->nbat);
 
 #if GMX_MPI
-    if (!(cr->duty & DUTY_PME))
+    if (!thisRankHasDuty(cr, DUTY_PME))
     {
-        gmx_bool bBS;
-        matrix   boxs;
-
         /* Send particle coordinates to the pme nodes.
          * Since this is only implemented for domain decomposition
          * and domain decomposition does not use the graph,
          * we do not need to worry about shifting.
          */
-
-        int pme_flags = 0;
-
         wallcycle_start(wcycle, ewcPP_PMESENDX);
-
-        bBS = (inputrec->nwall == 2);
-        if (bBS)
-        {
-            copy_mat(box, boxs);
-            svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
-        }
-
-        if (EEL_PME(fr->eeltype))
-        {
-            pme_flags |= GMX_PME_DO_COULOMB;
-        }
-
-        if (EVDW_PME(fr->vdwtype))
-        {
-            pme_flags |= GMX_PME_DO_LJ;
-        }
-
-        gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
-                                 mdatoms->nChargePerturbed, mdatoms->nTypePerturbed, lambda[efptCOUL], lambda[efptVDW],
+        gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
+                                 lambda[efptCOUL], lambda[efptVDW],
                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
-                                 pme_flags, step);
-
+                                 step);
         wallcycle_stop(wcycle, ewcPP_PMESENDX);
     }
 #endif /* GMX_MPI */
 
+    if (useGpuPme)
+    {
+        launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.data()), flags, wcycle);
+    }
+
     /* do gridding for pair search */
     if (bNS)
     {
         if (graph && bStateChanged)
         {
             /* Calculate intramolecular shift vectors to make molecules whole */
-            mk_mshift(fplog, graph, fr->ePBC, box, x);
+            mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
         }
 
         clear_rvec(vzero);
@@ -943,51 +1190,41 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
             nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
                               0, vzero, box_diag,
-                              0, mdatoms->homenr, -1, fr->cginfo, x,
-                              0, NULL,
+                              0, mdatoms->homenr, -1, fr->cginfo, as_rvec_array(x.data()),
+                              0, nullptr,
                               nbv->grp[eintLocal].kernel_type,
-                              nbv->grp[eintLocal].nbat);
+                              nbv->nbat);
             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
         }
         else
         {
             wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
             nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
-                                       fr->cginfo, x,
+                                       fr->cginfo, as_rvec_array(x.data()),
                                        nbv->grp[eintNonlocal].kernel_type,
-                                       nbv->grp[eintNonlocal].nbat);
+                                       nbv->nbat);
             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
         }
 
-        if (nbv->ngrp == 1 ||
-            nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
-        {
-            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll,
-                               nbv->nbs, mdatoms, fr->cginfo);
-        }
-        else
-        {
-            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal,
-                               nbv->nbs, mdatoms, fr->cginfo);
-            nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll,
-                               nbv->nbs, mdatoms, fr->cginfo);
-        }
+        nbnxn_atomdata_set(nbv->nbat, nbv->nbs, mdatoms, fr->cginfo);
         wallcycle_stop(wcycle, ewcNS);
     }
 
     /* initialize the GPU atom data and copy shift vector */
     if (bUseGPU)
     {
+        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+
         if (bNS)
         {
-            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
-            nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
-            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+            nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
         }
 
-        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
-        nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+        nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
+
+        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
     }
 
     /* do local pair search */
@@ -995,14 +1232,19 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
     {
         wallcycle_start_nocount(wcycle, ewcNS);
         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
-        nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
+        nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
                             &top->excls,
-                            ic->rlist,
+                            nbv->listParams->rlistOuter,
                             nbv->min_ci_balanced,
                             &nbv->grp[eintLocal].nbl_lists,
                             eintLocal,
                             nbv->grp[eintLocal].kernel_type,
                             nrnb);
+        nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
+        if (nbv->listParams->useDynamicPruning && !bUseGPU)
+        {
+            nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
+        }
         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
 
         if (bUseGPU)
@@ -1018,62 +1260,59 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
     {
         wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
         wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
-        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
-                                        nbv->grp[eintLocal].nbat);
+        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, as_rvec_array(x.data()),
+                                        nbv->nbat);
         wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
         wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
     }
 
     if (bUseGPU)
     {
-        wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
-        /* launch local nonbonded F on GPU */
+        if (DOMAINDECOMP(cr))
+        {
+            ddOpenBalanceRegionGpu(cr->dd);
+        }
+
+        wallcycle_start(wcycle, ewcLAUNCH_GPU);
+        wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+        /* launch local nonbonded work on GPU */
         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
-                     nrnb, wcycle);
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+                     step, nrnb, wcycle);
+        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+    }
+
+    if (useGpuPme)
+    {
+        // In PME GPU and mixed mode we launch FFT / gather after the
+        // X copy/transform to allow overlap as well as after the GPU NB
+        // launch to avoid FFT launch overhead hijacking the CPU and delaying
+        // the nonbonded kernel.
+        launchPmeGpuFftAndGather(fr->pmedata, wcycle);
     }
 
     /* Communicate coordinates and sum dipole if necessary +
        do non-local pair search */
     if (DOMAINDECOMP(cr))
     {
-        bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
-                        nbv->grp[eintLocal].kernel_type);
-
-        if (bDiffKernels)
-        {
-            /* With GPU+CPU non-bonded calculations we need to copy
-             * the local coordinates to the non-local nbat struct
-             * (in CPU format) as the non-local kernel call also
-             * calculates the local - non-local interactions.
-             */
-            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
-            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
-            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, TRUE, x,
-                                            nbv->grp[eintNonlocal].nbat);
-            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
-            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
-        }
-
         if (bNS)
         {
             wallcycle_start_nocount(wcycle, ewcNS);
             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
 
-            if (bDiffKernels)
-            {
-                nbnxn_grid_add_simple(nbv->nbs, nbv->grp[eintNonlocal].nbat);
-            }
-
-            nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
+            nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
                                 &top->excls,
-                                ic->rlist,
+                                nbv->listParams->rlistOuter,
                                 nbv->min_ci_balanced,
                                 &nbv->grp[eintNonlocal].nbl_lists,
                                 eintNonlocal,
                                 nbv->grp[eintNonlocal].kernel_type,
                                 nrnb);
-
+            nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
+            if (nbv->listParams->useDynamicPruning && !bUseGPU)
+            {
+                nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
+            }
             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
 
             if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
@@ -1088,39 +1327,43 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         else
         {
             wallcycle_start(wcycle, ewcMOVEX);
-            dd_move_x(cr->dd, box, x);
+            dd_move_x(cr->dd, box, as_rvec_array(x.data()));
             wallcycle_stop(wcycle, ewcMOVEX);
 
             wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
             wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
-            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
-                                            nbv->grp[eintNonlocal].nbat);
+            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, as_rvec_array(x.data()),
+                                            nbv->nbat);
             wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
-            cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
         }
 
-        if (bUseGPU && !bDiffKernels)
+        if (bUseGPU)
         {
-            wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
-            /* launch non-local nonbonded F on GPU */
+            wallcycle_start(wcycle, ewcLAUNCH_GPU);
+            wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+            /* launch non-local nonbonded tasks on GPU */
             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
-                         nrnb, wcycle);
-            cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+                         step, nrnb, wcycle);
+            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+            wallcycle_stop(wcycle, ewcLAUNCH_GPU);
         }
     }
 
     if (bUseGPU)
     {
         /* launch D2H copy-back F */
-        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
-        if (DOMAINDECOMP(cr) && !bDiffKernels)
+        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+        if (DOMAINDECOMP(cr))
         {
-            nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintNonlocal].nbat,
+            nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
                                      flags, eatNonlocal);
         }
-        nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintLocal].nbat,
+        nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
                                  flags, eatLocal);
-        cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
     }
 
     if (bStateChanged && inputrecNeedMutot(inputrec))
@@ -1128,6 +1371,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         if (PAR(cr))
         {
             gmx_sumd(2*DIM, mu, cr);
+            ddReopenBalanceRegionCpu(cr->dd);
         }
 
         for (i = 0; i < 2; i++)
@@ -1156,7 +1400,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
     reset_enerdata(enerd);
     clear_rvecs(SHIFTS, fr->fshift);
 
-    if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
+    if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
     {
         wallcycle_start(wcycle, ewcPPDURINGPME);
         dd_force_flop_start(cr->dd, nrnb);
@@ -1164,60 +1408,43 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
 
     if (inputrec->bRot)
     {
-        /* Enforced rotation has its own cycle counter that starts after the collective
-         * coordinates have been communicated. It is added to ddCyclF to allow
-         * for proper load-balancing */
         wallcycle_start(wcycle, ewcROT);
-        do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
+        do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
         wallcycle_stop(wcycle, ewcROT);
     }
 
+    /* Temporary solution until all routines take PaddedRVecVector */
+    rvec *f = as_rvec_array(force.data());
+
     /* Start the force cycle counter.
-     * This counter is stopped after do_force_lowlevel.
-     * No parallel communication should occur while this counter is running,
-     * since that will interfere with the dynamic load balancing.
+     * Note that a different counter is used for dynamic load balancing.
      */
     wallcycle_start(wcycle, ewcFORCE);
+
+    gmx::ArrayRef<gmx::RVec> forceRef = force;
     if (bDoForces)
     {
-        /* Reset forces for which the virial is calculated separately:
-         * PME/Ewald forces if necessary */
-        if (fr->bF_NoVirSum)
+        /* If we need to compute the virial, we might need a separate
+         * force buffer for algorithms for which the virial is calculated
+         * directly, such as PME.
+         */
+        if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
         {
-            if (flags & GMX_FORCE_VIRIAL)
-            {
-                fr->f_novirsum = fr->f_novirsum_alloc;
-            }
-            else
-            {
-                /* We are not calculating the pressure so we do not need
-                 * a separate array for forces that do not contribute
-                 * to the pressure.
-                 */
-                fr->f_novirsum = f;
-            }
-        }
+            forceRef = *fr->forceBufferForDirectVirialContributions;
 
-        if (fr->bF_NoVirSum)
-        {
-            if (flags & GMX_FORCE_VIRIAL)
-            {
-                if (fr->bDomDec)
-                {
-                    clear_rvecs_omp(fr->f_novirsum_n, fr->f_novirsum);
-                }
-                else
-                {
-                    clear_rvecs_omp(homenr, fr->f_novirsum+start);
-                }
-            }
+            /* We only compute forces on local atoms. Note that vsites can
+             * spread to non-local atoms, but that part of the buffer is
+             * cleared separately in the vsite spreading code.
+             */
+            clear_rvecs_omp(mdatoms->homenr, as_rvec_array(forceRef.data()));
         }
         /* Clear the short- and long-range forces */
         clear_rvecs_omp(fr->natoms_force_constr, f);
-
-        clear_rvec(fr->vir_diag_posres);
     }
 
+    /* forceWithVirial uses the local atom range only */
+    ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
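+    /* Contributions accumulated through forceWithVirial provide their
+     * virial directly; post_process_forces() below folds these forces
+     * and their virial into f and vir_force.
+     */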
+
     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
     {
         clear_pull_forces(inputrec->pull_work);
@@ -1233,9 +1460,8 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
 
     if (!bUseOrEmulGPU)
     {
-        /* Maybe we should move this into do_force_lowlevel */
         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
-                     nrnb, wcycle);
+                     step, nrnb, wcycle);
     }
 
     if (fr->efep != efepNO)
@@ -1246,7 +1472,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
         {
             do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
-                             fr, x, f, mdatoms,
+                             fr, as_rvec_array(x.data()), f, mdatoms,
                              inputrec->fepvals, lambda,
                              enerd, flags, nrnb, wcycle);
         }
@@ -1255,21 +1481,20 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
         {
             do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
-                             fr, x, f, mdatoms,
+                             fr, as_rvec_array(x.data()), f, mdatoms,
                              inputrec->fepvals, lambda,
                              enerd, flags, nrnb, wcycle);
         }
     }
 
-    if (!bUseOrEmulGPU || bDiffKernels)
+    if (!bUseOrEmulGPU)
     {
         int aloc;
 
         if (DOMAINDECOMP(cr))
         {
-            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
-                         bDiffKernels ? enbvClearFYes : enbvClearFNo,
-                         nrnb, wcycle);
+            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+                         step, nrnb, wcycle);
         }
 
         if (!bUseOrEmulGPU)
@@ -1285,12 +1510,12 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
          * This can be split into a local and a non-local part when overlapping
          * communication with calculation with domain decomposition.
          */
-        cycles_force += wallcycle_stop(wcycle, ewcFORCE);
+        wallcycle_stop(wcycle, ewcFORCE);
         wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
         wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
-        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f);
+        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->nbat, f);
         wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
-        cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
         wallcycle_start_nocount(wcycle, ewcFORCE);
 
         /* if there are multiple fshift output buffers reduce them */
@@ -1299,7 +1524,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         {
             /* This is not in a subcounter because it takes a
                negligible and constant-sized amount of time */
-            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
+            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
                                                      fr->fshift);
         }
     }
@@ -1307,48 +1532,44 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
     /* update QMMMrec, if necessary */
     if (fr->bQMMM)
     {
-        update_QMMMrec(cr, fr, x, mdatoms, box, top);
+        update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
     }
 
     /* Compute the bonded and non-bonded energies and optionally forces */
     do_force_lowlevel(fr, inputrec, &(top->idef),
                       cr, nrnb, wcycle, mdatoms,
-                      x, hist, f, enerd, fcd, top, fr->born,
+                      as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd, top, fr->born,
                       bBornRadii, box,
                       inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
                       flags, &cycles_pme);
 
-    cycles_force += wallcycle_stop(wcycle, ewcFORCE);
+    wallcycle_stop(wcycle, ewcFORCE);
 
-    if (ed)
-    {
-        do_flood(cr, inputrec, x, f, ed, box, step, bNS);
-    }
+    computeSpecialForces(fplog, cr, inputrec, step, t, wcycle,
+                         fr->forceProviders, box, as_rvec_array(x.data()), mdatoms, lambda,
+                         flags, &forceWithVirial, enerd,
+                         ed, bNS);
 
-    if (bUseOrEmulGPU && !bDiffKernels)
+    if (bUseOrEmulGPU)
     {
         /* wait for non-local forces (or calculate in emulation mode) */
         if (DOMAINDECOMP(cr))
         {
             if (bUseGPU)
             {
-                float cycles_tmp;
-
                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
-                nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
-                                       flags, eatNonlocal,
-                                       enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
-                                       fr->fshift);
-                cycles_tmp       = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
-                cycles_wait_gpu += cycles_tmp;
-                cycles_force    += cycles_tmp;
+                nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
+                                           flags, eatNonlocal,
+                                           enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+                                           fr->fshift);
+                cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
             }
             else
             {
                 wallcycle_start_nocount(wcycle, ewcFORCE);
                 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
-                             nrnb, wcycle);
-                cycles_force += wallcycle_stop(wcycle, ewcFORCE);
+                             step, nrnb, wcycle);
+                wallcycle_stop(wcycle, ewcFORCE);
             }
             wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
             wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
@@ -1356,96 +1577,120 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
             {
                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
-                                               nbv->grp[eintNonlocal].nbat, f);
+                                               nbv->nbat, f);
             }
             wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
-            cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
         }
     }
 
-    if (bDoForces && DOMAINDECOMP(cr))
+    if (DOMAINDECOMP(cr))
     {
-        if (bUseGPU)
+        /* We are done with the CPU compute.
+         * We will now communicate the non-local forces.
+         * If we use a GPU this will overlap with GPU work, so in that case
+         * we do not close the DD force balancing region here.
+         */
+        if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
         {
-            /* We are done with the CPU compute, but the GPU local non-bonded
-             * kernel can still be running while we communicate the forces.
-             * We start a counter here, so we can, hopefully, time the rest
-             * of the GPU kernel execution and data transfer.
-             */
-            cycleCountBeforeLocalWorkCompletes = gmx_cycles_read();
+            ddCloseBalanceRegionCpu(cr->dd);
         }
+        if (bDoForces)
+        {
+            wallcycle_start(wcycle, ewcMOVEF);
+            dd_move_f(cr->dd, f, fr->fshift);
+            wallcycle_stop(wcycle, ewcMOVEF);
+        }
+    }
 
-        /* Communicate the forces */
-        wallcycle_start(wcycle, ewcMOVEF);
-        dd_move_f(cr->dd, f, fr->fshift);
-        wallcycle_stop(wcycle, ewcMOVEF);
+    // With both nonbonded and PME offloaded to a GPU on the same rank, we use
+    // an alternating wait/reduction scheme.
+    bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
+    if (alternateGpuWait)
+    {
+        alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
     }
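+
+    /* A rough sketch of the alternating scheme (the real logic lives in
+     * alternatePmeNbGpuWaitReduce; the names below are illustrative only,
+     * not actual API): instead of blocking on the tasks in a fixed order,
+     * poll both and reduce whichever output is ready first:
+     *
+     *   while (!pmeDone || !nbDone)
+     *   {
+     *       if (!pmeDone && pmeTaskIsFinished()) { reducePmeOutputs(); pmeDone = true; }
+     *       if (!nbDone  && nbTaskIsFinished())  { reduceNbOutputs();  nbDone  = true; }
+     *   }
+     */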
 
-    if (bUseOrEmulGPU)
+    if (!alternateGpuWait && useGpuPme)
     {
-        /* wait for local forces (or calculate in emulation mode) */
-        if (bUseGPU)
-        {
-            float       cycles_tmp, cycles_wait_est;
-            /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
-             * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
-             * but even with a step of 0.1 ms the difference is less than 1%
-             * of the step time.
-             */
-            const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
+        gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
+        matrix vir_Q;
+        real   Vlr_q;
+        pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
+        pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
+    }
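+
+    // On this path the waits are static: the PME task first (just above),
+    // then the local nonbonded task (below).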
 
-            wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
-            nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
+    /* Wait for local GPU NB outputs on the non-alternating wait path */
+    if (!alternateGpuWait && bUseGPU)
+    {
+        /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
+         * is between 0.5 and 1.5 Mcycles. So 2 Mcycles is an overestimate,
+         * but even with a step of 0.1 ms the difference is less than 1%
+         * of the step time.
+         */
+        const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
+
+        wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
+        nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
                                    flags, eatLocal,
                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                    fr->fshift);
-            cycles_tmp      = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
-
-            if (bDoForces && DOMAINDECOMP(cr))
-            {
-                cycles_wait_est = gmx_cycles_read() - cycleCountBeforeLocalWorkCompletes;
+        float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
 
-                if (cycles_tmp < gpuWaitApiOverheadMargin)
-                {
-                    /* We measured few cycles, it could be that the kernel
-                     * and transfer finished earlier and there was no actual
-                     * wait time, only API call overhead.
-                     * Then the actual time could be anywhere between 0 and
-                     * cycles_wait_est. As a compromise, we use half the time.
-                     */
-                    cycles_wait_est *= 0.5f;
-                }
-            }
-            else
+        if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
+        {
+            DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
+            if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
             {
-                /* No force communication so we actually timed the wait */
-                cycles_wait_est = cycles_tmp;
+                /* We measured only a few cycles, so it could be that the
+                 * kernel and transfer finished earlier and there was no
+                 * actual wait time, only API call overhead.
+                 * The actual wait time could then be anywhere between 0
+                 * and the measured cycles, so we report to the load
+                 * balancer that we did not actually wait for the GPU.
+                 */
+                waitedForGpu = DdBalanceRegionWaitedForGpu::no;
             }
-            /* Even though this is after dd_move_f, the actual task we are
-             * waiting for runs asynchronously with dd_move_f and we usually
-             * have nothing to balance it with, so we can and should add
-             * the time to the force time for load balancing.
-             */
-            cycles_force    += cycles_wait_est;
-            cycles_wait_gpu += cycles_wait_est;
-
-            /* now clear the GPU outputs while we finish the step on the CPU */
-            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
-            nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
-            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+            ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
         }
-        else
+    }
+
+    if (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes)
+    {
+        // NOTE: emulation kernel is not included in the balancing region,
+        // but emulation mode does not target performance anyway
+        wallcycle_start_nocount(wcycle, ewcFORCE);
+        do_nb_verlet(fr, ic, enerd, flags, eintLocal,
+                     DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
+                     step, nrnb, wcycle);
+        wallcycle_stop(wcycle, ewcFORCE);
+    }
+
+    if (bUseGPU)
+    {
+        /* now clear the GPU outputs while we finish the step on the CPU */
+        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+        nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
+
+        /* Is dynamic pair-list pruning activated? */
+        if (nbv->listParams->useDynamicPruning)
         {
-            wallcycle_start_nocount(wcycle, ewcFORCE);
-            do_nb_verlet(fr, ic, enerd, flags, eintLocal,
-                         DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
-                         nrnb, wcycle);
-            wallcycle_stop(wcycle, ewcFORCE);
+            launchGpuRollingPruning(cr, nbv, inputrec, step);
         }
+        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+
+        // TODO: move here the PME buffer clearing call pme_gpu_reinit_computation()
+    }
+
+    /* Do the nonbonded GPU (or emulation) force buffer reduction
+     * on the non-alternating path. */
+    if (bUseOrEmulGPU && !alternateGpuWait)
+    {
         wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
         wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
-                                       nbv->grp[eintLocal].nbat, f);
+                                       nbv->nbat, f);
         wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
         wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
     }
@@ -1453,33 +1698,17 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
     if (DOMAINDECOMP(cr))
     {
         dd_force_flop_stop(cr->dd, nrnb);
-        if (wcycle)
-        {
-            dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
-            if (bUseGPU)
-            {
-                dd_cycles_add(cr->dd, cycles_wait_gpu, ddCyclWaitGPU);
-            }
-        }
     }
 
     if (bDoForces)
     {
-        if (inputrecElecField(inputrec))
-        {
-            /* Compute forces due to electric field */
-            calc_f_el(MASTER(cr) ? field : NULL,
-                      start, homenr, mdatoms->chargeA, fr->f_novirsum,
-                      inputrec->ex, inputrec->et, t);
-        }
-
         /* If we have NoVirSum forces, but we do not calculate the virial,
          * we sum fr->f_novirsum=f later.
          */
-        if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
+        if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
         {
             wallcycle_start(wcycle, ewcVSITESPREAD);
-            spread_vsite_f(vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
+            spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
             wallcycle_stop(wcycle, ewcVSITESPREAD);
         }
@@ -1487,44 +1716,24 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         if (flags & GMX_FORCE_VIRIAL)
         {
             /* Calculation of the virial must be done after vsites! */
-            calc_virial(0, mdatoms->homenr, x, f,
+            calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
         }
     }
 
-    if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
-    {
-        /* Since the COM pulling is always done mass-weighted, no forces are
-         * applied to vsites and this call can be done after vsite spreading.
-         */
-        pull_potential_wrapper(cr, inputrec, box, x,
-                               f, vir_force, mdatoms, enerd, lambda, t,
-                               wcycle);
-    }
-
-    /* Add the forces from enforced rotation potentials (if any) */
-    if (inputrec->bRot)
-    {
-        wallcycle_start(wcycle, ewcROTadd);
-        enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
-        wallcycle_stop(wcycle, ewcROTadd);
-    }
-
-    /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
-    IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
-
-    if (PAR(cr) && !(cr->duty & DUTY_PME))
+    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
          */
-        pme_receive_force_ener(cr, wcycle, enerd, fr);
+        pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
     }
 
     if (bDoForces)
     {
         post_process_forces(cr, step, nrnb, wcycle,
-                            top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
+                            top, box, as_rvec_array(x.data()), f, &forceWithVirial,
+                            vir_force, mdatoms, graph, fr, vsite,
                             flags);
     }
 
@@ -1535,36 +1744,37 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
 
         if (!EI_TPI(inputrec->eI))
         {
-            checkPotentialEnergyValidity(enerd);
+            checkPotentialEnergyValidity(step, *enerd, *inputrec);
         }
     }
 }
 
-void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
-                        t_inputrec *inputrec,
-                        gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
-                        gmx_localtop_t *top,
-                        gmx_groups_t *groups,
-                        matrix box, rvec x[], history_t *hist,
-                        rvec f[],
-                        tensor vir_force,
-                        t_mdatoms *mdatoms,
-                        gmx_enerdata_t *enerd, t_fcdata *fcd,
-                        real *lambda, t_graph *graph,
-                        t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
-                        double t, FILE *field, gmx_edsam_t ed,
-                        gmx_bool bBornRadii,
-                        int flags)
+static void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
+                               t_inputrec *inputrec,
+                               gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                               gmx_localtop_t *top,
+                               gmx_groups_t *groups,
+                               matrix box, gmx::PaddedArrayRef<gmx::RVec> x, history_t *hist,
+                               gmx::PaddedArrayRef<gmx::RVec> force,
+                               tensor vir_force,
+                               t_mdatoms *mdatoms,
+                               gmx_enerdata_t *enerd, t_fcdata *fcd,
+                               real *lambda, t_graph *graph,
+                               t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
+                               double t, gmx_edsam_t ed,
+                               gmx_bool bBornRadii,
+                               int flags,
+                               DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
+                               DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
 {
     int        cg0, cg1, i, j;
-    int        start, homenr;
     double     mu[2*DIM];
     gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
     gmx_bool   bDoForces;
-    float      cycles_pme, cycles_force;
+    float      cycles_pme;
 
-    start  = 0;
-    homenr = mdatoms->homenr;
+    const int  start  = 0;
+    const int  homenr = mdatoms->homenr;
 
     clear_mat(vir_force);
 
@@ -1617,18 +1827,18 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
         if (bCalcCGCM)
         {
             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
-                                     &(top->cgs), x, fr->cg_cm);
+                                     &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
             inc_nrnb(nrnb, eNR_CGCM, homenr);
             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
         }
         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
         {
-            unshift_self(graph, box, x);
+            unshift_self(graph, box, as_rvec_array(x.data()));
         }
     }
     else if (bCalcCGCM)
     {
-        calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
+        calc_cgcm(fplog, cg0, cg1, &(top->cgs), as_rvec_array(x.data()), fr->cg_cm);
         inc_nrnb(nrnb, eNR_CGCM, homenr);
     }
 
@@ -1638,43 +1848,18 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
     }
 
 #if GMX_MPI
-    if (!(cr->duty & DUTY_PME))
+    if (!thisRankHasDuty(cr, DUTY_PME))
     {
-        gmx_bool bBS;
-        matrix   boxs;
-
         /* Send particle coordinates to the pme nodes.
          * Since this is only implemented for domain decomposition
          * and domain decomposition does not use the graph,
          * we do not need to worry about shifting.
          */
-
-        int pme_flags = 0;
-
         wallcycle_start(wcycle, ewcPP_PMESENDX);
-
-        bBS = (inputrec->nwall == 2);
-        if (bBS)
-        {
-            copy_mat(box, boxs);
-            svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
-        }
-
-        if (EEL_PME(fr->eeltype))
-        {
-            pme_flags |= GMX_PME_DO_COULOMB;
-        }
-
-        if (EVDW_PME(fr->vdwtype))
-        {
-            pme_flags |= GMX_PME_DO_LJ;
-        }
-
-        gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
-                                 mdatoms->nChargePerturbed, mdatoms->nTypePerturbed, lambda[efptCOUL], lambda[efptVDW],
+        gmx_pme_send_coordinates(cr, box, as_rvec_array(x.data()),
+                                 lambda[efptCOUL], lambda[efptVDW],
                                  (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
-                                 pme_flags, step);
-
+                                 step);
         wallcycle_stop(wcycle, ewcPP_PMESENDX);
     }
 #endif /* GMX_MPI */
@@ -1683,8 +1868,13 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
     if (DOMAINDECOMP(cr))
     {
         wallcycle_start(wcycle, ewcMOVEX);
-        dd_move_x(cr->dd, box, x);
+        dd_move_x(cr->dd, box, as_rvec_array(x.data()));
         wallcycle_stop(wcycle, ewcMOVEX);
+        /* No GPU support, no move_x overlap, so reopen the balance region here */
+        if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
+        {
+            ddReopenBalanceRegionCpu(cr->dd);
+        }
     }
 
     if (inputrecNeedMutot(inputrec))
@@ -1694,6 +1884,7 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
             if (PAR(cr))
             {
                 gmx_sumd(2*DIM, mu, cr);
+                ddReopenBalanceRegionCpu(cr->dd);
             }
             for (i = 0; i < 2; i++)
             {
@@ -1728,7 +1919,7 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
         if (graph && bStateChanged)
         {
             /* Calculate intramolecular shift vectors to make molecules whole */
-            mk_mshift(fplog, graph, fr->ePBC, box, x);
+            mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.data()));
         }
 
         /* Do the actual neighbour searching */
@@ -1742,10 +1933,10 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
     if (inputrec->implicit_solvent && bNS)
     {
         make_gb_nblist(cr, inputrec->gb_algorithm,
-                       x, box, fr, &top->idef, graph, fr->born);
+                       as_rvec_array(x.data()), box, fr, &top->idef, graph, fr->born);
     }
 
-    if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
+    if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
     {
         wallcycle_start(wcycle, ewcPPDURINGPME);
         dd_force_flop_start(cr->dd, nrnb);
@@ -1753,54 +1944,40 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
 
     if (inputrec->bRot)
     {
-        /* Enforced rotation has its own cycle counter that starts after the collective
-         * coordinates have been communicated. It is added to ddCyclF to allow
-         * for proper load-balancing */
         wallcycle_start(wcycle, ewcROT);
-        do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
+        do_rotation(cr, inputrec, box, as_rvec_array(x.data()), t, step, bNS);
         wallcycle_stop(wcycle, ewcROT);
     }
 
+    /* Temporary solution until all routines take PaddedRVecVector */
+    rvec *f = as_rvec_array(force.data());
+
     /* Start the force cycle counter.
-     * This counter is stopped after do_force_lowlevel.
-     * No parallel communication should occur while this counter is running,
-     * since that will interfere with the dynamic load balancing.
+     * Note that a different counter is used for dynamic load balancing.
      */
     wallcycle_start(wcycle, ewcFORCE);
 
+    gmx::ArrayRef<gmx::RVec> forceRef = force;
     if (bDoForces)
     {
-        /* Reset forces for which the virial is calculated separately:
-         * PME/Ewald forces if necessary */
-        if (fr->bF_NoVirSum)
+        /* If we need to compute the virial, we might need a separate
+         * force buffer for algorithms for which the virial is calculated
+         * directly, such as PME.
+         */
+        if ((flags & GMX_FORCE_VIRIAL) && fr->haveDirectVirialContributions)
         {
-            if (flags & GMX_FORCE_VIRIAL)
-            {
-                fr->f_novirsum = fr->f_novirsum_alloc;
-                if (fr->bDomDec)
-                {
-                    clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
-                }
-                else
-                {
-                    clear_rvecs(homenr, fr->f_novirsum+start);
-                }
-            }
-            else
-            {
-                /* We are not calculating the pressure so we do not need
-                 * a separate array for forces that do not contribute
-                 * to the pressure.
-                 */
-                fr->f_novirsum = f;
-            }
+            forceRef = *fr->forceBufferForDirectVirialContributions;
+
+            clear_rvecs_omp(forceRef.size(), as_rvec_array(forceRef.data()));
         }
 
         /* Clear the short- and long-range forces */
         clear_rvecs(fr->natoms_force_constr, f);
-
-        clear_rvec(fr->vir_diag_posres);
     }
+
+    /* forceWithVirial might need the full force atom range */
+    ForceWithVirial forceWithVirial(forceRef, flags & GMX_FORCE_VIRIAL);
+
     if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
     {
         clear_pull_forces(inputrec->pull_work);
@@ -1809,45 +1986,38 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
     /* update QMMMrec, if necessary */
     if (fr->bQMMM)
     {
-        update_QMMMrec(cr, fr, x, mdatoms, box, top);
+        update_QMMMrec(cr, fr, as_rvec_array(x.data()), mdatoms, box);
     }
 
     /* Compute the bonded and non-bonded energies and optionally forces */
     do_force_lowlevel(fr, inputrec, &(top->idef),
                       cr, nrnb, wcycle, mdatoms,
-                      x, hist, f, enerd, fcd, top, fr->born,
+                      as_rvec_array(x.data()), hist, f, &forceWithVirial, enerd, fcd, top, fr->born,
                       bBornRadii, box,
                       inputrec->fepvals, lambda,
                       graph, &(top->excls), fr->mu_tot,
                       flags,
                       &cycles_pme);
 
-    cycles_force = wallcycle_stop(wcycle, ewcFORCE);
-
-    if (ed)
-    {
-        do_flood(cr, inputrec, x, f, ed, box, step, bNS);
-    }
+    wallcycle_stop(wcycle, ewcFORCE);
 
     if (DOMAINDECOMP(cr))
     {
         dd_force_flop_stop(cr->dd, nrnb);
-        if (wcycle)
+
+        if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
         {
-            dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
+            ddCloseBalanceRegionCpu(cr->dd);
         }
     }
 
+    computeSpecialForces(fplog, cr, inputrec, step, t, wcycle,
+                         fr->forceProviders, box, as_rvec_array(x.data()), mdatoms, lambda,
+                         flags, &forceWithVirial, enerd,
+                         ed, bNS);
+
     if (bDoForces)
     {
-        if (inputrecElecField(inputrec))
-        {
-            /* Compute forces due to electric field */
-            calc_f_el(MASTER(cr) ? field : NULL,
-                      start, homenr, mdatoms->chargeA, fr->f_novirsum,
-                      inputrec->ex, inputrec->et, t);
-        }
-
         /* Communicate the forces */
         if (DOMAINDECOMP(cr))
         {
@@ -1860,10 +2030,10 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
              * When we do not calculate the virial, fr->f_novirsum = f,
              * so we have already communicated these forces.
              */
-            if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
+            if (EEL_FULL(fr->ic->eeltype) && cr->dd->n_intercg_excl &&
                 (flags & GMX_FORCE_VIRIAL))
             {
-                dd_move_f(cr->dd, fr->f_novirsum, NULL);
+                dd_move_f(cr->dd, as_rvec_array(forceWithVirial.force_.data()), nullptr);
             }
             wallcycle_stop(wcycle, ewcMOVEF);
         }
@@ -1871,10 +2041,10 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
         /* If we have NoVirSum forces, but we do not calculate the virial,
          * we sum fr->f_novirsum=f later.
          */
-        if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
+        if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL)))
         {
             wallcycle_start(wcycle, ewcVSITESPREAD);
-            spread_vsite_f(vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
+            spread_vsite_f(vsite, as_rvec_array(x.data()), f, fr->fshift, FALSE, nullptr, nrnb,
                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
             wallcycle_stop(wcycle, ewcVSITESPREAD);
         }
@@ -1882,41 +2052,24 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
         if (flags & GMX_FORCE_VIRIAL)
         {
             /* Calculation of the virial must be done after vsites! */
-            calc_virial(0, mdatoms->homenr, x, f,
+            calc_virial(0, mdatoms->homenr, as_rvec_array(x.data()), f,
                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
         }
     }
 
-    if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
-    {
-        pull_potential_wrapper(cr, inputrec, box, x,
-                               f, vir_force, mdatoms, enerd, lambda, t,
-                               wcycle);
-    }
-
-    /* Add the forces from enforced rotation potentials (if any) */
-    if (inputrec->bRot)
-    {
-        wallcycle_start(wcycle, ewcROTadd);
-        enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
-        wallcycle_stop(wcycle, ewcROTadd);
-    }
-
-    /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
-    IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
-
-    if (PAR(cr) && !(cr->duty & DUTY_PME))
+    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME))
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
          */
-        pme_receive_force_ener(cr, wcycle, enerd, fr);
+        pme_receive_force_ener(cr, &forceWithVirial, enerd, wcycle);
     }
 
     if (bDoForces)
     {
         post_process_forces(cr, step, nrnb, wcycle,
-                            top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
+                            top, box, as_rvec_array(x.data()), f, &forceWithVirial,
+                            vir_force, mdatoms, graph, fr, vsite,
                             flags);
     }
 
@@ -1927,7 +2080,7 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
 
         if (!EI_TPI(inputrec->eI))
         {
-            checkPotentialEnergyValidity(enerd);
+            checkPotentialEnergyValidity(step, *enerd, *inputrec);
         }
     }
 
@@ -1938,17 +2091,19 @@ void do_force(FILE *fplog, t_commrec *cr,
               gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
               gmx_localtop_t *top,
               gmx_groups_t *groups,
-              matrix box, rvec x[], history_t *hist,
-              rvec f[],
+              matrix box, gmx::PaddedArrayRef<gmx::RVec> x, history_t *hist,
+              gmx::PaddedArrayRef<gmx::RVec> force,
               tensor vir_force,
               t_mdatoms *mdatoms,
               gmx_enerdata_t *enerd, t_fcdata *fcd,
-              real *lambda, t_graph *graph,
+              gmx::ArrayRef<real> lambda, t_graph *graph,
               t_forcerec *fr,
               gmx_vsite_t *vsite, rvec mu_tot,
-              double t, FILE *field, gmx_edsam_t ed,
+              double t, gmx_edsam_t ed,
               gmx_bool bBornRadii,
-              int flags)
+              int flags,
+              DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
+              DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
 {
     /* modify force flag if not doing nonbonded */
     if (!fr->bNonbonded)
@@ -1956,6 +2111,9 @@ void do_force(FILE *fplog, t_commrec *cr,
         flags &= ~GMX_FORCE_NONBONDED;
     }
 
+    GMX_ASSERT(x.size()     >= gmx::paddedRVecVectorSize(fr->natoms_force), "coordinates should be padded");
+    GMX_ASSERT(force.size() >= gmx::paddedRVecVectorSize(fr->natoms_force), "force should be padded");
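+    /* The padding is needed because the SIMD kernels and nbnxn buffer
+     * operations may read and write slightly past the last real atom.
+     */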
+
     switch (inputrec->cutoff_scheme)
     {
         case ecutsVERLET:
@@ -1964,15 +2122,17 @@ void do_force(FILE *fplog, t_commrec *cr,
                                 top,
                                 groups,
                                 box, x, hist,
-                                f, vir_force,
+                                force, vir_force,
                                 mdatoms,
                                 enerd, fcd,
-                                lambda, graph,
+                                lambda.data(), graph,
                                 fr, fr->ic,
                                 vsite, mu_tot,
-                                t, field, ed,
+                                t, ed,
                                 bBornRadii,
-                                flags);
+                                flags,
+                                ddOpenBalanceRegion,
+                                ddCloseBalanceRegion);
             break;
         case ecutsGROUP:
             do_force_cutsGROUP(fplog, cr, inputrec,
@@ -1980,18 +2140,31 @@ void do_force(FILE *fplog, t_commrec *cr,
                                top,
                                groups,
                                box, x, hist,
-                               f, vir_force,
+                               force, vir_force,
                                mdatoms,
                                enerd, fcd,
-                               lambda, graph,
+                               lambda.data(), graph,
                                fr, vsite, mu_tot,
-                               t, field, ed,
+                               t, ed,
                                bBornRadii,
-                               flags);
+                               flags,
+                               ddOpenBalanceRegion,
+                               ddCloseBalanceRegion);
             break;
         default:
             gmx_incons("Invalid cut-off scheme passed!");
     }
+
+    /* In case we don't have constraints and are using GPUs, the next balancing
+     * region starts here.
+     * Some "special" work at the end of do_force_cuts?, such as vsite spread,
+     * virial calculation and COM pulling, is thus not included in
+     * the balance timing, which is OK as most of these tasks do communication.
+     */
+    if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
+    {
+        ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
+    }
 }
 
 
@@ -2030,22 +2203,22 @@ void do_constrain_first(FILE *fplog, gmx_constr_t constr,
     dvdl_dum = 0;
 
     /* constrain the current position */
-    constrain(NULL, TRUE, FALSE, constr, &(top->idef),
+    constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
               ir, cr, step, 0, 1.0, md,
-              state->x, state->x, NULL,
+              as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
               fr->bMolPBC, state->box,
               state->lambda[efptBONDED], &dvdl_dum,
-              NULL, NULL, nrnb, econqCoord);
+              nullptr, nullptr, nrnb, econqCoord);
     if (EI_VV(ir->eI))
     {
         /* constrain the inital velocity, and save it */
         /* also may be useful if we need the ekin from the halfstep for velocity verlet */
-        constrain(NULL, TRUE, FALSE, constr, &(top->idef),
+        constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
                   ir, cr, step, 0, 1.0, md,
-                  state->x, state->v, state->v,
+                  as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
                   fr->bMolPBC, state->box,
                   state->lambda[efptBONDED], &dvdl_dum,
-                  NULL, NULL, nrnb, econqVeloc);
+                  nullptr, nullptr, nrnb, econqVeloc);
     }
     /* constrain the inital velocities at t-dt/2 */
     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
@@ -2070,12 +2243,12 @@ void do_constrain_first(FILE *fplog, gmx_constr_t constr,
                     gmx_step_str(step, buf));
         }
         dvdl_dum = 0;
-        constrain(NULL, TRUE, FALSE, constr, &(top->idef),
+        constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
                   ir, cr, step, -1, 1.0, md,
-                  state->x, savex, NULL,
+                  as_rvec_array(state->x.data()), savex, nullptr,
                   fr->bMolPBC, state->box,
                   state->lambda[efptBONDED], &dvdl_dum,
-                  state->v, NULL, nrnb, econqCoord);
+                  as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
 
         for (i = start; i < end; i++)
         {
@@ -2172,6 +2345,8 @@ void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
     fr->virdiffsix      = 0;
     fr->virdifftwelve   = 0;
 
+    const interaction_const_t *ic = fr->ic;
+
     if (eDispCorr != edispcNO)
     {
         for (i = 0; i < 2; i++)
@@ -2179,19 +2354,19 @@ void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
             eners[i] = 0;
             virs[i]  = 0;
         }
-        if ((fr->vdw_modifier == eintmodPOTSHIFT) ||
-            (fr->vdw_modifier == eintmodPOTSWITCH) ||
-            (fr->vdw_modifier == eintmodFORCESWITCH) ||
-            (fr->vdwtype == evdwSHIFT) ||
-            (fr->vdwtype == evdwSWITCH))
+        if ((ic->vdw_modifier == eintmodPOTSHIFT) ||
+            (ic->vdw_modifier == eintmodPOTSWITCH) ||
+            (ic->vdw_modifier == eintmodFORCESWITCH) ||
+            (ic->vdwtype == evdwSHIFT) ||
+            (ic->vdwtype == evdwSWITCH))
         {
-            if (((fr->vdw_modifier == eintmodPOTSWITCH) ||
-                 (fr->vdw_modifier == eintmodFORCESWITCH) ||
-                 (fr->vdwtype == evdwSWITCH)) && fr->rvdw_switch == 0)
+            if (((ic->vdw_modifier == eintmodPOTSWITCH) ||
+                 (ic->vdw_modifier == eintmodFORCESWITCH) ||
+                 (ic->vdwtype == evdwSWITCH)) && ic->rvdw_switch == 0)
             {
                 gmx_fatal(FARGS,
                           "With dispersion correction rvdw-switch can not be zero "
-                          "for vdw-type = %s", evdw_names[fr->vdwtype]);
+                          "for vdw-type = %s", evdw_names[ic->vdwtype]);
             }
 
             /* TODO This code depends on the logic in tables.c that
@@ -2203,8 +2378,8 @@ void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
             vdwtab = fr->dispersionCorrectionTable->data;
 
             /* Round the cut-offs to exact table values for precision */
-            ri0  = static_cast<int>(floor(fr->rvdw_switch*scale));
-            ri1  = static_cast<int>(ceil(fr->rvdw*scale));
+            ri0  = static_cast<int>(floor(ic->rvdw_switch*scale));
+            ri1  = static_cast<int>(ceil(ic->rvdw*scale));
 
             /* The code below has some support for handling force-switching, i.e.
              * when the force (instead of potential) is switched over a limited
@@ -2222,14 +2397,14 @@ void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
              * we need to calculate the constant shift up to the point where we
              * start modifying the potential.
              */
-            ri0  = (fr->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
+            ri0  = (ic->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
 
             r0   = ri0/scale;
             rc3  = r0*r0*r0;
             rc9  = rc3*rc3*rc3;
 
-            if ((fr->vdw_modifier == eintmodFORCESWITCH) ||
-                (fr->vdwtype == evdwSHIFT))
+            if ((ic->vdw_modifier == eintmodFORCESWITCH) ||
+                (ic->vdwtype == evdwSHIFT))
             {
                 /* Determine the constant energy shift below rvdw_switch.
                  * Table has a scale factor since we have scaled it down to compensate
@@ -2238,7 +2413,7 @@ void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
             }
-            else if (fr->vdw_modifier == eintmodPOTSHIFT)
+            else if (ic->vdw_modifier == eintmodPOTSHIFT)
             {
                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3));
                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
@@ -2278,11 +2453,11 @@ void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
             virs[0]  +=  8.0*M_PI/rc3;
             virs[1]  += -16.0*M_PI/(3.0*rc9);
         }
-        else if (fr->vdwtype == evdwCUT ||
-                 EVDW_PME(fr->vdwtype) ||
-                 fr->vdwtype == evdwUSER)
+        else if (ic->vdwtype == evdwCUT ||
+                 EVDW_PME(ic->vdwtype) ||
+                 ic->vdwtype == evdwUSER)
         {
-            if (fr->vdwtype == evdwUSER && fplog)
+            if (ic->vdwtype == evdwUSER && fplog)
             {
                 fprintf(fplog,
                         "WARNING: using dispersion correction with user tables\n");
@@ -2295,12 +2470,12 @@ void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
              * can be used here.
              */
 
-            rc3  = fr->rvdw*fr->rvdw*fr->rvdw;
+            rc3  = ic->rvdw*ic->rvdw*ic->rvdw;
             rc9  = rc3*rc3*rc3;
             /* Contribution beyond the cut-off */
             eners[0] += -4.0*M_PI/(3.0*rc3);
             eners[1] +=  4.0*M_PI/(9.0*rc9);
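+            /* These are the analytic tail integrals for the r^-6 and r^-12
+             * terms, assuming g(r) = 1 beyond the cut-off:
+             *   integral_rc^inf 4*pi*r^2 * (-r^-6)  dr = -4*pi/(3*rc^3)
+             *   integral_rc^inf 4*pi*r^2 *   r^-12  dr =  4*pi/(9*rc^9)
+             */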
-            if (fr->vdw_modifier == eintmodPOTSHIFT)
+            if (ic->vdw_modifier == eintmodPOTSHIFT)
             {
                 /* Contribution within the cut-off */
                 eners[0] += -4.0*M_PI/(3.0*rc3);
@@ -2314,20 +2489,20 @@ void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
         {
             gmx_fatal(FARGS,
                       "Dispersion correction is not implemented for vdw-type = %s",
-                      evdw_names[fr->vdwtype]);
+                      evdw_names[ic->vdwtype]);
         }
 
         /* When we deprecate the group kernels the code below can go too */
-        if (fr->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
+        if (ic->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
         {
             /* Calculate self-interaction coefficient (assuming that
              * the reciprocal-space contribution is constant in the
              * region that contributes to the self-interaction).
              */
-            fr->enershiftsix = gmx::power6(fr->ewaldcoeff_lj) / 6.0;
+            fr->enershiftsix = gmx::power6(ic->ewaldcoeff_lj) / 6.0;
 
-            eners[0] += -gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj)/3.0;
-            virs[0]  +=  gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj);
+            eners[0] += -gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj)/3.0;
+            virs[0]  +=  gmx::power3(std::sqrt(M_PI)*ic->ewaldcoeff_lj);
         }
 
         fr->enerdiffsix    = eners[0];
@@ -2446,40 +2621,7 @@ void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
     }
 }
 
-void do_pbc_first(FILE *fplog, matrix box, t_forcerec *fr,
-                  t_graph *graph, rvec x[])
-{
-    if (fplog)
-    {
-        fprintf(fplog, "Removing pbc first time\n");
-    }
-    calc_shifts(box, fr->shift_vec);
-    if (graph)
-    {
-        mk_mshift(fplog, graph, fr->ePBC, box, x);
-        if (gmx_debug_at)
-        {
-            p_graph(debug, "do_pbc_first 1", graph);
-        }
-        shift_self(graph, box, x);
-        /* By doing an extra mk_mshift the molecules that are broken
-         * because they were e.g. imported from another software
-         * will be made whole again. Such are the healing powers
-         * of GROMACS.
-         */
-        mk_mshift(fplog, graph, fr->ePBC, box, x);
-        if (gmx_debug_at)
-        {
-            p_graph(debug, "do_pbc_first 2", graph);
-        }
-    }
-    if (fplog)
-    {
-        fprintf(fplog, "Done rmpbc\n");
-    }
-}
-
-static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
+static void low_do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
                             const gmx_mtop_t *mtop, rvec x[],
                             gmx_bool bFirst)
 {
@@ -2506,7 +2648,7 @@ static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
         else
         {
             /* Pass NULL iso fplog to avoid graph prints for each molecule type */
-            mk_graph_ilist(NULL, mtop->moltype[molb->type].ilist,
+            mk_graph_ilist(nullptr, mtop->moltype[molb->type].ilist,
                            0, molb->natoms_mol, FALSE, FALSE, graph);
 
             for (mol = 0; mol < molb->nmol; mol++)
@@ -2527,19 +2669,19 @@ static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
     sfree(graph);
 }
 
-void do_pbc_first_mtop(FILE *fplog, int ePBC, matrix box,
+void do_pbc_first_mtop(FILE *fplog, int ePBC, const matrix box,
                        const gmx_mtop_t *mtop, rvec x[])
 {
     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
 }
 
-void do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
-                 gmx_mtop_t *mtop, rvec x[])
+void do_pbc_mtop(FILE *fplog, int ePBC, const matrix box,
+                 const gmx_mtop_t *mtop, rvec x[])
 {
     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
 }
 
-void put_atoms_in_box_omp(int ePBC, const matrix box, int natoms, rvec x[])
+void put_atoms_in_box_omp(int ePBC, const matrix box, gmx::ArrayRef<gmx::RVec> x)
 {
     int t, nth;
     nth = gmx_omp_nthreads_get(emntDefault);
@@ -2549,25 +2691,25 @@ void put_atoms_in_box_omp(int ePBC, const matrix box, int natoms, rvec x[])
     {
         try
         {
-            int offset, len;
-
-            offset = (natoms*t    )/nth;
-            len    = (natoms*(t + 1))/nth - offset;
-            put_atoms_in_box(ePBC, box, len, x + offset);
+            size_t natoms = x.size();
+            size_t offset = (natoms*t    )/nth;
+            size_t len    = (natoms*(t + 1))/nth - offset;
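+            /* Static partitioning: thread t gets the half-open range
+             * [natoms*t/nth, natoms*(t+1)/nth), which covers every atom
+             * exactly once even when nth does not divide natoms.
+             */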
+            put_atoms_in_box(ePBC, box, x.subArray(offset, len));
         }
         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
     }
 }
 
 // TODO This can be cleaned up a lot, and move back to runner.cpp
-void finish_run(FILE *fplog, t_commrec *cr,
-                t_inputrec *inputrec,
+void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr,
+                const t_inputrec *inputrec,
                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
                 gmx_walltime_accounting_t walltime_accounting,
                 nonbonded_verlet_t *nbv,
+                gmx_pme_t *pme,
                 gmx_bool bWriteStat)
 {
-    t_nrnb *nrnb_tot = NULL;
+    t_nrnb *nrnb_tot = nullptr;
     double  delta_t  = 0;
     double  nbfs     = 0, mflop = 0;
     double  elapsed_time,
@@ -2591,8 +2733,7 @@ void finish_run(FILE *fplog, t_commrec *cr,
 
     if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
     {
-        md_print_warn(cr, fplog,
-                      "Simulation ended prematurely, no performance report will be written.");
+        GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
         printReport = false;
     }
 
@@ -2640,7 +2781,7 @@ void finish_run(FILE *fplog, t_commrec *cr,
         sfree(nrnb_tot);
     }
 
-    if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
+    if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
     {
         print_dd_statistics(cr, inputrec, fplog);
     }
@@ -2651,16 +2792,22 @@ void finish_run(FILE *fplog, t_commrec *cr,
      * to task parallelism. */
     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
-    wallcycle_scale_by_num_threads(wcycle, cr->duty == DUTY_PME, nthreads_pp, nthreads_pme);
+    wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
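+    // The second argument is true only for PME-only ranks, matching the
+    // old cr->duty == DUTY_PME check.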
     auto cycle_sum(wallcycle_sum(cr, wcycle));
 
     if (printReport)
     {
-        struct gmx_wallclock_gpu_t* gputimes = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : NULL;
-
-        wallcycle_print(fplog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
+        auto                    nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
+        gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
+        if (pme_gpu_task_enabled(pme))
+        {
+            pme_gpu_get_timings(pme, &pme_gpu_timings);
+        }
+        wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
                         elapsed_time_over_all_ranks,
-                        wcycle, cycle_sum, gputimes);
+                        wcycle, cycle_sum,
+                        nbnxn_gpu_timings,
+                        &pme_gpu_timings);
 
         if (EI_DYNAMICS(inputrec->eI))
         {
@@ -2684,71 +2831,60 @@ void finish_run(FILE *fplog, t_commrec *cr,
     }
 }
 
-extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, real *lambda, double *lam0)
+extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
 {
     /* this function works, but could probably use a logic rewrite to keep all the different
        types of efep straight. */
 
-    int       i;
+    if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
+    {
+        return;
+    }
+
     t_lambda *fep = ir->fepvals;
+    *fep_state    = fep->init_fep_state; /* this might overwrite the value read
+                                            from a checkpoint; a kludge is in
+                                            place for now to prevent this. */
 
-    if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
+    for (int i = 0; i < efptNR; i++)
     {
-        for (i = 0; i < efptNR; i++)
+        /* overwrite lambda state with init_lambda for now for backwards compatibility */
+        if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
         {
-            lambda[i] = 0.0;
+            lambda[i] = fep->init_lambda;
             if (lam0)
             {
-                lam0[i] = 0.0;
+                lam0[i] = lambda[i];
             }
         }
-        return;
-    }
-    else
-    {
-        *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
-                                             if checkpoint is set -- a kludge is in for now
-                                             to prevent this.*/
-        for (i = 0; i < efptNR; i++)
+        else
         {
-            /* overwrite lambda state with init_lambda for now for backwards compatibility */
-            if (fep->init_lambda >= 0) /* if it's -1, it was never initializd */
+            lambda[i] = fep->all_lambda[i][*fep_state];
+            if (lam0)
             {
-                lambda[i] = fep->init_lambda;
-                if (lam0)
-                {
-                    lam0[i] = lambda[i];
-                }
-            }
-            else
-            {
-                lambda[i] = fep->all_lambda[i][*fep_state];
-                if (lam0)
-                {
-                    lam0[i] = lambda[i];
-                }
+                lam0[i] = lambda[i];
             }
         }
-        if (ir->bSimTemp)
+    }
+    if (ir->bSimTemp)
+    {
+        /* need to rescale control temperatures to match current state */
+        for (int i = 0; i < ir->opts.ngtc; i++)
         {
-            /* need to rescale control temperatures to match current state */
-            for (i = 0; i < ir->opts.ngtc; i++)
+            if (ir->opts.ref_t[i] > 0)
             {
-                if (ir->opts.ref_t[i] > 0)
-                {
-                    ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
-                }
+                ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
             }
         }
     }
 
     /* Send to the log the information on the current lambdas */
-    if (fplog != NULL)
+    if (fplog != nullptr)
     {
         fprintf(fplog, "Initial vector of lambda components:[ ");
-        for (i = 0; i < efptNR; i++)
+        for (const auto &l : lambda)
         {
-            fprintf(fplog, "%10.4f ", lambda[i]);
+            fprintf(fplog, "%10.4f ", l);
         }
         fprintf(fplog, "]\n");
     }
@@ -2757,15 +2893,17 @@ extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, real
 
 
 void init_md(FILE *fplog,
-             t_commrec *cr, t_inputrec *ir, const gmx_output_env_t *oenv,
+             t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
+             t_inputrec *ir, const gmx_output_env_t *oenv,
+             const MdrunOptions &mdrunOptions,
              double *t, double *t0,
-             real *lambda, int *fep_state, double *lam0,
+             t_state *globalState, double *lam0,
              t_nrnb *nrnb, gmx_mtop_t *mtop,
              gmx_update_t **upd,
              int nfile, const t_filenm fnm[],
              gmx_mdoutf_t *outf, t_mdebin **mdebin,
              tensor force_vir, tensor shake_vir, rvec mu_tot,
-             gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags,
+             gmx_bool *bSimAnn, t_vcm **vcm,
              gmx_wallcycle_t wcycle)
 {
     int  i;
@@ -2784,7 +2922,20 @@ void init_md(FILE *fplog,
     }
 
     /* Initialize lambda variables */
-    initialize_lambdas(fplog, ir, fep_state, lambda, lam0);
+    /* TODO: Clean up initialization of fep_state and lambda in t_state.
+     * We currently need to call initialize_lambdas on non-master ranks
+     * to initialize lam0.
+     */
+    if (MASTER(cr))
+    {
+        initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
+    }
+    else
+    {
+        int                      tmpFepState;
+        std::array<real, efptNR> tmpLambda;
+        initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
+    }
 
     // TODO upd is never NULL in practice, but the analysers don't know that
     if (upd)
@@ -2793,15 +2944,15 @@ void init_md(FILE *fplog,
     }
     if (*bSimAnn)
     {
-        update_annealing_target_temp(ir, ir->init_t, upd ? *upd : NULL);
+        update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
     }
 
-    if (vcm != NULL)
+    if (vcm != nullptr)
     {
         *vcm = init_vcm(fplog, &mtop->groups, ir);
     }
 
-    if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
+    if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
     {
         if (ir->etc == etcBERENDSEN)
         {
@@ -2816,17 +2967,13 @@ void init_md(FILE *fplog,
             please_cite(fplog, "Goga2012");
         }
     }
-    if ((ir->et[XX].n > 0) || (ir->et[YY].n > 0) || (ir->et[ZZ].n > 0))
-    {
-        please_cite(fplog, "Caleman2008a");
-    }
     init_nrnb(nrnb);
 
     if (nfile != -1)
     {
-        *outf = init_mdoutf(fplog, nfile, fnm, Flags, cr, ir, mtop, oenv, wcycle);
+        *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
 
-        *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? NULL : mdoutf_get_fp_ene(*outf),
+        *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
                               mtop, ir, mdoutf_get_fp_dhdl(*outf));
     }