From 1f2be3d076837ec992907662dd714f6c011c388c Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Fri, 2 Aug 2019 14:56:23 +0200 Subject: [PATCH] Use shiftForces in ForceOutputs The shift force buffer in t_forcerec is changed to std::vector and all access now happens through ForceWithShiftForces. Also removed the deprecated rvec getters from ForceOutputs. Change-Id: I1353e6497d651a8f24e819a8826af78656889953 --- src/gromacs/domdec/domdec.cpp | 61 +++++------- src/gromacs/domdec/domdec.h | 8 +- .../gmxlib/nonbonded/nb_free_energy.cpp | 30 +++--- src/gromacs/gmxlib/nonbonded/nb_free_energy.h | 10 +- src/gromacs/listed_forces/listed_forces.cpp | 38 ++++--- src/gromacs/listed_forces/listed_forces.h | 8 +- src/gromacs/mdlib/force.cpp | 16 +-- src/gromacs/mdlib/forcerec.cpp | 6 +- src/gromacs/mdlib/qmmm.cpp | 25 ++--- src/gromacs/mdlib/qmmm.h | 11 ++- src/gromacs/mdlib/sim_util.cpp | 98 +++++++++++-------- src/gromacs/mdtypes/forceoutput.h | 24 +++-- src/gromacs/mdtypes/forcerec.h | 4 +- src/gromacs/nbnxm/atomdata.cpp | 8 +- src/gromacs/nbnxm/atomdata.h | 4 +- src/gromacs/nbnxm/gpu_common.h | 33 ++++--- src/gromacs/nbnxm/kerneldispatch.cpp | 11 ++- src/gromacs/nbnxm/nbnxm.cpp | 8 +- src/gromacs/nbnxm/nbnxm.h | 13 +-- src/gromacs/nbnxm/nbnxm_gpu.h | 30 +++--- 20 files changed, 241 insertions(+), 205 deletions(-) diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp index f9d642e16c..3dc77780ef 100644 --- a/src/gromacs/domdec/domdec.cpp +++ b/src/gromacs/domdec/domdec.cpp @@ -70,6 +70,7 @@ #include "gromacs/mdlib/updategroups.h" #include "gromacs/mdlib/vsite.h" #include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/forceoutput.h" #include "gromacs/mdtypes/inputrec.h" #include "gromacs/mdtypes/mdrunoptions.h" #include "gromacs/mdtypes/state.h" @@ -406,52 +407,43 @@ void dd_move_x(gmx_domdec_t *dd, wallcycle_stop(wcycle, ewcMOVEX); } -void dd_move_f(gmx_domdec_t *dd, - gmx::ArrayRef f, - rvec *fshift, - gmx_wallcycle *wcycle) +void dd_move_f(gmx_domdec_t *dd, + gmx::ForceWithShiftForces *forceWithShiftForces, + gmx_wallcycle *wcycle) { wallcycle_start(wcycle, ewcMOVEF); - int nzone, nat_tot; - gmx_domdec_comm_t *comm; - gmx_domdec_comm_dim_t *cd; - ivec vis; - int is; - gmx_bool bShiftForcesNeedPbc, bScrew; + gmx::ArrayRef f = forceWithShiftForces->force(); + gmx::ArrayRef fshift = forceWithShiftForces->shiftForces(); - comm = dd->comm; - - nzone = comm->zones.n/2; - nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones); + gmx_domdec_comm_t &comm = *dd->comm; + int nzone = comm.zones.n/2; + int nat_tot = comm.atomRanges.end(DDAtomRanges::Type::Zones); for (int d = dd->ndim-1; d >= 0; d--) { /* Only forces in domains near the PBC boundaries need to consider PBC in the treatment of fshift */ - bShiftForcesNeedPbc = (dd->ci[dd->dim[d]] == 0); - bScrew = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX); - if (fshift == nullptr && !bScrew) - { - bShiftForcesNeedPbc = FALSE; - } + const bool shiftForcesNeedPbc = (forceWithShiftForces->computeVirial() && dd->ci[dd->dim[d]] == 0); + const bool applyScrewPbc = (shiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX); /* Determine which shift vector we need */ - clear_ivec(vis); - vis[dd->dim[d]] = 1; - is = IVEC2IS(vis); + ivec vis = { 0, 0, 0 }; + vis[dd->dim[d]] = 1; + const int is = IVEC2IS(vis); - cd = &comm->cd[d]; - for (int p = cd->numPulses() - 1; p >= 0; p--) + /* Loop over the pulses */ + const gmx_domdec_comm_dim_t &cd = comm.cd[d]; + for (int p = cd.numPulses() - 1; p >= 0; p--) { - const gmx_domdec_ind_t &ind
= cd->ind[p]; - DDBufferAccess receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]); + const gmx_domdec_ind_t &ind = cd.ind[p]; + DDBufferAccess receiveBufferAccess(comm.rvecBuffer, ind.nsend[nzone + 1]); gmx::ArrayRef &receiveBuffer = receiveBufferAccess.buffer; - nat_tot -= ind.nrecv[nzone+1]; + nat_tot -= ind.nrecv[nzone+1]; - DDBufferAccess sendBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]); + DDBufferAccess sendBufferAccess(comm.rvecBuffer2, cd.receiveInPlace ? 0 : ind.nrecv[nzone + 1]); gmx::ArrayRef sendBuffer; - if (cd->receiveInPlace) + if (cd.receiveInPlace) { sendBuffer = gmx::arrayRefFromArray(f.data() + nat_tot, ind.nrecv[nzone + 1]); } @@ -472,7 +464,7 @@ void dd_move_f(gmx_domdec_t *dd, sendBuffer, receiveBuffer); /* Add the received forces */ int n = 0; - if (!bShiftForcesNeedPbc) + if (!shiftForcesNeedPbc) { for (int j : ind.index) { @@ -483,11 +475,8 @@ void dd_move_f(gmx_domdec_t *dd, n++; } } - else if (!bScrew) + else if (!applyScrewPbc) { - /* fshift should always be defined if this function is - * called when bShiftForcesNeedPbc is true */ - assert(nullptr != fshift); for (int j : ind.index) { for (int d = 0; d < DIM; d++) @@ -510,7 +499,7 @@ void dd_move_f(gmx_domdec_t *dd, f[j][XX] += receiveBuffer[n][XX]; f[j][YY] -= receiveBuffer[n][YY]; f[j][ZZ] -= receiveBuffer[n][ZZ]; - if (fshift) + if (shiftForcesNeedPbc) { /* Add this force to the shift force */ for (int d = 0; d < DIM; d++) diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h index d015ab31c2..08a27291fc 100644 --- a/src/gromacs/domdec/domdec.h +++ b/src/gromacs/domdec/domdec.h @@ -85,6 +85,7 @@ class t_state; namespace gmx { +class ForceWithShiftForces; class MDLogger; class LocalAtomSetManager; class RangePartitioning; @@ -217,10 +218,9 @@ void dd_move_x(struct gmx_domdec_t *dd, * When fshift!=NULL the shift forces are updated to obtain * the correct virial from the single sum including f. */ -void dd_move_f(struct gmx_domdec_t *dd, - gmx::ArrayRef f, - rvec *fshift, - gmx_wallcycle *wcycle); +void dd_move_f(struct gmx_domdec_t *dd, + gmx::ForceWithShiftForces *forceWithShiftForces, + gmx_wallcycle *wcycle); /*! \brief Communicate a real for each atom to the neighboring cells. 
*/ void dd_atom_spread_real(struct gmx_domdec_t *dd, real v[]); diff --git a/src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp b/src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp index dc6464a4fa..a9c5b2433d 100644 --- a/src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp +++ b/src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp @@ -47,6 +47,7 @@ #include "gromacs/gmxlib/nonbonded/nonbonded.h" #include "gromacs/math/functions.h" #include "gromacs/math/vec.h" +#include "gromacs/mdtypes/forceoutput.h" #include "gromacs/mdtypes/forcerec.h" #include "gromacs/mdtypes/md_enums.h" #include "gromacs/utility/fatalerror.h" @@ -114,8 +115,8 @@ template static void nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, rvec * gmx_restrict xx, - rvec * gmx_restrict ff, - t_forcerec * gmx_restrict fr, + gmx::ForceWithShiftForces * forceWithShiftForces, + const t_forcerec * gmx_restrict fr, const t_mdatoms * gmx_restrict mdatoms, nb_kernel_data_t * gmx_restrict kernel_data, t_nrnb * gmx_restrict nrnb) @@ -158,9 +159,6 @@ nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, const int * typeB; int ntype; const real * shiftvec; - real * fshift; - const real * x; - real * f; const real * chargeA; const real * chargeB; real sigma6_min, sigma6_def, lam_power; @@ -192,11 +190,13 @@ nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, /* Extract pointer to non-bonded interaction constants */ const interaction_const_t *ic = fr->ic; - x = xx[0]; - f = ff[0]; + // TODO: We should get rid of using pointers to real + const real *x = xx[0]; + real * gmx_restrict f = &(forceWithShiftForces->force()[0][0]); - fshift = fr->fshift[0]; + real * gmx_restrict fshift = &(forceWithShiftForces->shiftForces()[0][0]); + // Extract pair list data nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; @@ -814,13 +814,13 @@ nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, inc_nrnb(nrnb, eNR_NBKERNEL_FREE_ENERGY, nlist->nri*12 + nlist->jindex[n]*150); } -void gmx_nb_free_energy_kernel(const t_nblist *nlist, - rvec *xx, - rvec *ff, - t_forcerec *fr, - const t_mdatoms *mdatoms, - nb_kernel_data_t *kernel_data, - t_nrnb *nrnb) +void gmx_nb_free_energy_kernel(const t_nblist *nlist, + rvec *xx, + gmx::ForceWithShiftForces *ff, + const t_forcerec *fr, + const t_mdatoms *mdatoms, + nb_kernel_data_t *kernel_data, + t_nrnb *nrnb) { if (fr->sc_alphacoul == 0 && fr->sc_alphavdw == 0) { diff --git a/src/gromacs/gmxlib/nonbonded/nb_free_energy.h b/src/gromacs/gmxlib/nonbonded/nb_free_energy.h index 7e12ebdaea..cf4394fee4 100644 --- a/src/gromacs/gmxlib/nonbonded/nb_free_energy.h +++ b/src/gromacs/gmxlib/nonbonded/nb_free_energy.h @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2008, The GROMACS development team. - * Copyright (c) 2013,2014,2015,2018, by the GROMACS development team, led by + * Copyright (c) 2013,2014,2015,2018,2019, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -45,12 +45,16 @@ #include "gromacs/mdtypes/nblist.h" struct t_forcerec; +namespace gmx +{ +class ForceWithShiftForces; +} void gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, rvec * gmx_restrict xx, - rvec * gmx_restrict ff, - t_forcerec * gmx_restrict fr, + gmx::ForceWithShiftForces * forceWithShiftForces, + const t_forcerec * gmx_restrict fr, const t_mdatoms * gmx_restrict mdatoms, nb_kernel_data_t * gmx_restrict kernel_data, t_nrnb * gmx_restrict nrnb); diff --git a/src/gromacs/listed_forces/listed_forces.cpp b/src/gromacs/listed_forces/listed_forces.cpp index 2766c89edb..c6246a5a93 100644 --- a/src/gromacs/listed_forces/listed_forces.cpp +++ b/src/gromacs/listed_forces/listed_forces.cpp @@ -136,9 +136,10 @@ zero_thread_output(f_thread_t *f_t) /*! \brief Reduce thread-local force buffers */ void -reduce_thread_forces(int n, rvec *f, - const bonded_threading_t *bt, - int nthreads) +reduce_thread_forces(int n, + gmx::ArrayRef force, + const bonded_threading_t *bt, + int nthreads) { if (nthreads > MAX_BONDED_THREADS) { @@ -146,6 +147,8 @@ reduce_thread_forces(int n, rvec *f, MAX_BONDED_THREADS); } + rvec * gmx_restrict f = as_rvec_array(force.data()); + /* This reduction can run on any number of threads, * independently of bt->nthreads. * But if nthreads matches bt->nthreads (which it currently does) @@ -193,7 +196,7 @@ reduce_thread_forces(int n, rvec *f, /*! \brief Reduce thread-local forces, shift forces and energies */ void -reduce_thread_output(int n, rvec *f, rvec *fshift, +reduce_thread_output(int n, gmx::ForceWithShiftForces *forceWithShiftForces, real *ener, gmx_grppairener_t *grpp, real *dvdl, const bonded_threading_t *bt, gmx_bool bCalcEnerVir, @@ -204,9 +207,11 @@ reduce_thread_output(int n, rvec *f, rvec *fshift, if (bt->nblock_used > 0) { /* Reduce the bonded force buffer */ - reduce_thread_forces(n, f, bt, bt->nthreads); + reduce_thread_forces(n, forceWithShiftForces->force(), bt, bt->nthreads); } + rvec * gmx_restrict fshift = as_rvec_array(forceWithShiftForces->shiftForces().data()); + /* When necessary, reduce energy and virial using one thread only */ if (bCalcEnerVir && bt->nthreads > 1) { @@ -361,6 +366,7 @@ calcBondedForces(const t_idef *idef, const t_forcerec *fr, const t_pbc *pbc_null, const t_graph *g, + rvec *fshiftMasterBuffer, gmx_enerdata_t *enerd, t_nrnb *nrnb, const real *lambda, @@ -394,7 +400,7 @@ calcBondedForces(const t_idef *idef, */ if (thread == 0) { - fshift = fr->fshift; + fshift = fshiftMasterBuffer; epot = enerd->term; grpp = &enerd->grpp; dvdlt = dvdl; @@ -452,8 +458,7 @@ void calc_listed(const t_commrec *cr, struct gmx_wallcycle *wcycle, const t_idef *idef, const rvec x[], history_t *hist, - rvec f[], - gmx::ForceWithVirial *forceWithVirial, + gmx::ForceOutputs *forceOutputs, const t_forcerec *fr, const struct t_pbc *pbc, const struct t_pbc *pbc_full, @@ -491,13 +496,13 @@ void calc_listed(const t_commrec *cr, if (idef->il[F_POSRES].nr > 0) { posres_wrapper(nrnb, idef, pbc_full, x, enerd, lambda, fr, - forceWithVirial); + &forceOutputs->forceWithVirial()); } if (idef->il[F_FBPOSRES].nr > 0) { fbposres_wrapper(nrnb, idef, pbc_full, x, enerd, fr, - forceWithVirial); + &forceOutputs->forceWithVirial()); } /* Do pre force calculation stuff which might require communication */ @@ -530,16 +535,20 @@ void calc_listed(const t_commrec *cr, if (haveCpuBondeds(*fr)) { + gmx::ForceWithShiftForces &forceWithShiftForces = forceOutputs->forceWithShiftForces(); + wallcycle_sub_start(wcycle, ewcsLISTED); /* The dummy array is to have a place to 
store the dhdl at other values of lambda, which will be thrown away in the end */ real dvdl[efptNR] = {0}; - calcBondedForces(idef, x, fr, pbc_null, g, enerd, nrnb, lambda, dvdl, md, + calcBondedForces(idef, x, fr, pbc_null, g, + as_rvec_array(forceWithShiftForces.shiftForces().data()), + enerd, nrnb, lambda, dvdl, md, fcd, bCalcEnerVir, global_atom_index); wallcycle_sub_stop(wcycle, ewcsLISTED); wallcycle_sub_start(wcycle, ewcsLISTED_BUF_OPS); - reduce_thread_output(fr->natoms_force, f, fr->fshift, + reduce_thread_output(fr->natoms_force, &forceWithShiftForces, enerd->term, &enerd->grpp, dvdl, bt, bCalcEnerVir, @@ -637,8 +646,7 @@ do_force_listed(struct gmx_wallcycle *wcycle, const t_idef *idef, const rvec x[], history_t *hist, - rvec *forceForUseWithShiftForces, - gmx::ForceWithVirial *forceWithVirial, + gmx::ForceOutputs *forceOutputs, const t_forcerec *fr, const struct t_pbc *pbc, const struct t_graph *graph, @@ -664,7 +672,7 @@ do_force_listed(struct gmx_wallcycle *wcycle, set_pbc(&pbc_full, fr->ePBC, box); } calc_listed(cr, ms, wcycle, idef, x, hist, - forceForUseWithShiftForces, forceWithVirial, + forceOutputs, fr, pbc, &pbc_full, graph, enerd, nrnb, lambda, md, fcd, global_atom_index, flags); diff --git a/src/gromacs/listed_forces/listed_forces.h b/src/gromacs/listed_forces/listed_forces.h index f5bb76fd95..52a2699f3f 100644 --- a/src/gromacs/listed_forces/listed_forces.h +++ b/src/gromacs/listed_forces/listed_forces.h @@ -86,7 +86,7 @@ class t_state; namespace gmx { -class ForceWithVirial; +class ForceOutputs; } //! Type of CPU function to compute a bonded interaction. @@ -110,8 +110,7 @@ void calc_listed(const t_commrec *cr, struct gmx_wallcycle *wcycle, const t_idef *idef, const rvec x[], history_t *hist, - rvec f[], - gmx::ForceWithVirial *forceWithVirial, + gmx::ForceOutputs *forceOutputs, const t_forcerec *fr, const struct t_pbc *pbc, const struct t_pbc *pbc_full, const struct t_graph *g, @@ -144,8 +143,7 @@ do_force_listed(struct gmx_wallcycle *wcycle, const t_idef *idef, const rvec x[], history_t *hist, - rvec *forceForUseWithShiftForces, - gmx::ForceWithVirial *forceWithVirial, + gmx::ForceOutputs *forceOutputs, const t_forcerec *fr, const struct t_pbc *pbc, const struct t_graph *graph, diff --git a/src/gromacs/mdlib/force.cpp b/src/gromacs/mdlib/force.cpp index 0796b08076..05f5aed659 100644 --- a/src/gromacs/mdlib/force.cpp +++ b/src/gromacs/mdlib/force.cpp @@ -120,16 +120,14 @@ do_force_lowlevel(t_forcerec *fr, const DDBalanceRegionHandler &ddBalanceRegionHandler) { // TODO: Replace all uses of x by const coordinates - rvec *x = as_rvec_array(coordinates.paddedArrayRef().data()); + rvec *x = as_rvec_array(coordinates.paddedArrayRef().data()); - // TODO: Add the shift forces to forceOutputs - rvec *forceForUseWithShiftForces = forceOutputs->f(); - auto &forceWithVirial = forceOutputs->forceWithVirial(); + auto &forceWithVirial = forceOutputs->forceWithVirial(); /* do QMMM first if requested */ if (fr->bQMMM) { - enerd->term[F_EQM] = calculate_QMMM(cr, forceForUseWithShiftForces, fr); + enerd->term[F_EQM] = calculate_QMMM(cr, &forceOutputs->forceWithShiftForces(), fr); } /* Call the short range functions all in one go. */ @@ -186,7 +184,7 @@ do_force_lowlevel(t_forcerec *fr, do_force_listed(wcycle, box, ir->fepvals, cr, ms, idef, x, hist, - forceForUseWithShiftForces, &forceWithVirial, + forceOutputs, fr, &pbc, graph, enerd, nrnb, lambda, md, fcd, DOMAINDECOMP(cr) ? 
cr->dd->globalAtomIndices.data() : nullptr, flags); @@ -378,7 +376,8 @@ do_force_lowlevel(t_forcerec *fr, fprintf(debug, "Vlr_q = %g, Vcorr_q = %g, Vlr_corr_q = %g\n", Vlr_q, ewaldOutput.Vcorr_q, enerd->term[F_COUL_RECIP]); pr_rvecs(debug, 0, "vir_el_recip after corr", ewaldOutput.vir_q, DIM); - pr_rvecs(debug, 0, "fshift after LR Corrections", fr->fshift, SHIFTS); + rvec *fshift = as_rvec_array(forceOutputs->forceWithShiftForces().shiftForces().data()); + pr_rvecs(debug, 0, "fshift after LR Corrections", fshift, SHIFTS); fprintf(debug, "Vlr_lj: %g, Vcorr_lj = %g, Vlr_corr_lj = %g\n", Vlr_lj, ewaldOutput.Vcorr_lj, enerd->term[F_LJ_RECIP]); pr_rvecs(debug, 0, "vir_lj_recip after corr", ewaldOutput.vir_lj, DIM); @@ -392,7 +391,8 @@ do_force_lowlevel(t_forcerec *fr, if (debug) { - pr_rvecs(debug, 0, "fshift after bondeds", fr->fshift, SHIFTS); + rvec *fshift = as_rvec_array(forceOutputs->forceWithShiftForces().shiftForces().data()); + pr_rvecs(debug, 0, "fshift after bondeds", fshift, SHIFTS); } } diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp index 19cd25e8cf..efc756dc87 100644 --- a/src/gromacs/mdlib/forcerec.cpp +++ b/src/gromacs/mdlib/forcerec.cpp @@ -1804,10 +1804,7 @@ void init_forcerec(FILE *fp, snew(fr->shift_vec, SHIFTS); } - if (fr->fshift == nullptr) - { - snew(fr->fshift, SHIFTS); - } + fr->shiftForces.resize(SHIFTS); if (fr->nbfp == nullptr) { @@ -2066,7 +2063,6 @@ void done_forcerec(t_forcerec *fr, int numMolBlocks) sfree(fr->nbfp); delete fr->ic; sfree(fr->shift_vec); - sfree(fr->fshift); sfree(fr->ewc_t); tear_down_bonded_threading(fr->bondedThreading); GMX_RELEASE_ASSERT(fr->gpuBonded == nullptr, "Should have been deleted earlier, when used"); diff --git a/src/gromacs/mdlib/qmmm.cpp b/src/gromacs/mdlib/qmmm.cpp index 200dacac7d..f2552fc671 100644 --- a/src/gromacs/mdlib/qmmm.cpp +++ b/src/gromacs/mdlib/qmmm.cpp @@ -59,6 +59,7 @@ #include "gromacs/mdlib/qm_mopac.h" #include "gromacs/mdlib/qm_orca.h" #include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/forceoutput.h" #include "gromacs/mdtypes/forcerec.h" #include "gromacs/mdtypes/inputrec.h" #include "gromacs/mdtypes/md_enums.h" @@ -848,9 +849,9 @@ void update_QMMMrec(const t_commrec *cr, } } /* update_QMMM_rec */ -real calculate_QMMM(const t_commrec *cr, - rvec f[], - const t_forcerec *fr) +real calculate_QMMM(const t_commrec *cr, + gmx::ForceWithShiftForces *forceWithShiftForces, + const t_forcerec *fr) { real QMener = 0.0; @@ -883,6 +884,8 @@ real calculate_QMMM(const t_commrec *cr, /* now different procedures are carried out for one layer ONION and * normal QMMM on one hand and multilayer oniom on the other */ + gmx::ArrayRef fMM = forceWithShiftForces->force(); + gmx::ArrayRef fshiftMM = forceWithShiftForces->shiftForces(); if (qr->QMMMscheme == eQMMMschemenormal || qr->nrQMlayers == 1) { qm = qr->qm[0]; @@ -893,16 +896,16 @@ real calculate_QMMM(const t_commrec *cr, { for (j = 0; j < DIM; j++) { - f[qm->indexQM[i]][j] -= forces[i][j]; - fr->fshift[qm->shiftQM[i]][j] += fshift[i][j]; + fMM[qm->indexQM[i]][j] -= forces[i][j]; + fshiftMM[qm->shiftQM[i]][j] += fshift[i][j]; } } for (i = 0; i < mm->nrMMatoms; i++) { for (j = 0; j < DIM; j++) { - f[mm->indexMM[i]][j] -= forces[qm->nrQMatoms+i][j]; - fr->fshift[mm->shiftMM[i]][j] += fshift[qm->nrQMatoms+i][j]; + fMM[mm->indexMM[i]][j] -= forces[qm->nrQMatoms+i][j]; + fshiftMM[mm->shiftMM[i]][j] += fshift[qm->nrQMatoms+i][j]; } } @@ -950,8 +953,8 @@ real calculate_QMMM(const t_commrec *cr, { for (j = 0; j < DIM; j++) { - 
f[qm->indexQM[i]][j] -= (forces[i][j]-forces2[i][j]); - fr->fshift[qm->shiftQM[i]][j] += (fshift[i][j]-fshift2[i][j]); + fMM[qm->indexQM[i]][j] -= (forces[i][j]-forces2[i][j]); + fshiftMM[qm->shiftQM[i]][j] += (fshift[i][j]-fshift2[i][j]); } } free(qm2); @@ -966,8 +969,8 @@ real calculate_QMMM(const t_commrec *cr, { for (j = 0; j < DIM; j++) { - f[qm->indexQM[i]][j] -= forces[i][j]; - fr->fshift[qm->shiftQM[i]][j] += fshift[i][j]; + fMM[qm->indexQM[i]][j] -= forces[i][j]; + fshiftMM[qm->shiftQM[i]][j] += fshift[i][j]; } } free(forces); diff --git a/src/gromacs/mdlib/qmmm.h b/src/gromacs/mdlib/qmmm.h index 78fb3b66b1..ffa98d352b 100644 --- a/src/gromacs/mdlib/qmmm.h +++ b/src/gromacs/mdlib/qmmm.h @@ -55,6 +55,11 @@ struct t_inputrec; struct t_mdatoms; struct t_QMMMrec; +namespace gmx +{ +class ForceWithShiftForces; +} + typedef struct { int nrQMatoms; /* total nr of QM atoms */ rvec *xQM; /* shifted to center of box */ @@ -138,9 +143,9 @@ void update_QMMMrec(const t_commrec *cr, * routine should be called at every step, since it updates the MM * elements of the t_QMMMrec struct. */ -real calculate_QMMM(const t_commrec *cr, - rvec f[], - const t_forcerec *fr); +real calculate_QMMM(const t_commrec *cr, + gmx::ForceWithShiftForces *forceWithShiftForces, + const t_forcerec *fr); /* QMMM computes the QM forces. This routine makes either function * calls to gmx QM routines (derived from MOPAC7 (semi-emp.) and MPQC diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index fece45af89..63eb82767f 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -139,17 +139,20 @@ static void sum_forces(rvec f[], gmx::ArrayRef forceToAdd) } } -static void calc_virial(int start, int homenr, const rvec x[], const rvec f[], +static void calc_virial(int start, int homenr, const rvec x[], + const gmx::ForceWithShiftForces &forceWithShiftForces, tensor vir_part, const t_graph *graph, const matrix box, t_nrnb *nrnb, const t_forcerec *fr, int ePBC) { /* The short-range virial from surrounding boxes */ - calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box); + const rvec *fshift = as_rvec_array(forceWithShiftForces.shiftForces().data()); + calc_vir(SHIFTS, fr->shift_vec, fshift, vir_part, ePBC == epbcSCREW, box); inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS); /* Calculate partial virial, for local atoms only, based on short range. * Total virial is computed in global_stat, called from do_md */ + const rvec *f = as_rvec_array(forceWithShiftForces.force().data()); f_calc_vir(start, start+homenr, x, f, vir_part, graph, box); inc_nrnb(nrnb, eNR_VIRIAL, homenr); @@ -267,7 +270,7 @@ static void post_process_forces(const t_commrec *cr, const gmx_vsite_t *vsite, int flags) { - rvec *f = forceOutputs->f(); + rvec *f = as_rvec_array(forceOutputs->forceWithShiftForces().force().data()); if (fr->haveDirectVirialContributions) { @@ -644,9 +647,7 @@ static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata, * * \param[in] nbv Nonbonded verlet structure * \param[in,out] pmedata PME module data - * \param[in,out] force Force array to reduce task outputs into. 
- * \param[in,out] forceWithVirial Force and virial buffers - * \param[in,out] fshift Shift force output vector results are reduced into + * \param[in,out] forceOutputs Output buffer for the forces and virial * \param[in,out] enerd Energy data structure results are reduced into * \param[in] flags Force flags * \param[in] pmeFlags PME flags @@ -654,9 +655,7 @@ static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata, */ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv, gmx_pme_t *pmedata, - gmx::ArrayRefWithPadding *force, - gmx::ForceWithVirial *forceWithVirial, - rvec fshift[], + gmx::ForceOutputs *forceOutputs, gmx_enerdata_t *enerd, int flags, int pmeFlags, @@ -666,14 +665,18 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv bool isNbGpuDone = false; - gmx::ArrayRef pmeGpuForces; + + gmx::ForceWithShiftForces &forceWithShiftForces = forceOutputs->forceWithShiftForces(); + gmx::ForceWithVirial &forceWithVirial = forceOutputs->forceWithVirial(); + + gmx::ArrayRef pmeGpuForces; while (!isPmeGpuDone || !isNbGpuDone) { if (!isPmeGpuDone) { GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check; - isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, forceWithVirial, enerd, completionType); + isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, &forceWithVirial, enerd, completionType); } if (!isNbGpuDone) @@ -685,7 +688,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv Nbnxm::AtomLocality::Local, enerd->grpp.ener[egLJSR].data(), enerd->grpp.ener[egCOULSR].data(), - fshift, completionType); + forceWithShiftForces.shiftForces(), completionType); wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); // To get the call count right, when the task finished we // issue a start/stop. @@ -697,7 +700,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, - as_rvec_array(force->unpaddedArrayRef().data())); + forceWithShiftForces.force()); } } } @@ -726,13 +729,14 @@ setupForceOutputs(t_forcerec *fr, { wallcycle_sub_start(wcycle, ewcsCLEAR_FORCE_BUFFER); - // TODO: Add the shift force buffer and use it - gmx::ForceWithShiftForces forceWithShiftForces(force, doVirial, gmx::ArrayRef()); + /* NOTE: We assume fr->shiftForces is all zeros here */ + gmx::ForceWithShiftForces forceWithShiftForces(force, doVirial, fr->shiftForces); if (bDoForces) { /* Clear the short- and long-range forces */ - clear_rvecs_omp(fr->natoms_force_constr, forceWithShiftForces.f()); + clear_rvecs_omp(fr->natoms_force_constr, + as_rvec_array(forceWithShiftForces.force().data())); } /* If we need to compute the virial, we might need a separate @@ -1263,7 +1267,12 @@ void do_force(FILE *fplog, /* Reset energies */ reset_enerdata(enerd); - clear_rvecs(SHIFTS, fr->fshift); + /* Clear the shift forces */ + // TODO: This should be linked to the shift force buffer in use, or cleared before use instead + for (gmx::RVec &elem : fr->shiftForces) + { + elem = { 0.0_real, 0.0_real, 0.0_real }; + } if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME)) { @@ -1283,8 +1292,9 @@ void do_force(FILE *fplog, */ wallcycle_start(wcycle, ewcFORCE); - // set up and clear force outputs - ForceOutputs forceOut = setupForceOutputs(fr, pull_work, *inputrec, force, bDoForces, + // Set up and clear force outputs. + // We use std::move to keep the compiler happy, it has no effect. 
+ ForceOutputs forceOut = setupForceOutputs(fr, pull_work, *inputrec, std::move(force), bDoForces, ((flags & GMX_FORCE_VIRIAL) != 0), wcycle); /* We calculate the non-bonded forces, when done on the CPU, here. @@ -1307,14 +1317,14 @@ void do_force(FILE *fplog, * Happens here on the CPU both with and without GPU. */ nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::Local, - fr, as_rvec_array(x.unpaddedArrayRef().data()), forceOut.f(), *mdatoms, + fr, as_rvec_array(x.unpaddedArrayRef().data()), &forceOut.forceWithShiftForces(), *mdatoms, inputrec->fepvals, lambda.data(), enerd, flags, nrnb); if (havePPDomainDecomposition(cr)) { nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::NonLocal, - fr, as_rvec_array(x.unpaddedArrayRef().data()), forceOut.f(), *mdatoms, + fr, as_rvec_array(x.unpaddedArrayRef().data()), &forceOut.forceWithShiftForces(), *mdatoms, inputrec->fepvals, lambda.data(), enerd, flags, nrnb); } @@ -1333,8 +1343,7 @@ void do_force(FILE *fplog, * communication with calculation with domain decomposition. */ wallcycle_stop(wcycle, ewcFORCE); - nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f()); - + nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.forceWithShiftForces().force()); wallcycle_start_nocount(wcycle, ewcFORCE); /* If there are multiple fshift output buffers we need to reduce them */ @@ -1342,8 +1351,8 @@ void do_force(FILE *fplog, { /* This is not in a subcounter because it takes a negligible and constant-sized amount of time */ - nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat.get(), - fr->fshift); + nbnxn_atomdata_add_nbat_fshift_to_fshift(*nbv->nbat, + forceOut.forceWithShiftForces().shiftForces()); } } @@ -1377,6 +1386,9 @@ void do_force(FILE *fplog, float cycles_wait_gpu = 0; if (bUseOrEmulGPU) { + auto &forceWithShiftForces = forceOut.forceWithShiftForces(); + rvec *f = as_rvec_array(forceWithShiftForces.force().data()); + /* wait for non-local forces (or calculate in emulation mode) */ if (havePPDomainDecomposition(cr)) { @@ -1387,7 +1399,7 @@ void do_force(FILE *fplog, flags, Nbnxm::AtomLocality::NonLocal, enerd->grpp.ener[egLJSR].data(), enerd->grpp.ener[egCOULSR].data(), - fr->fshift); + forceWithShiftForces.shiftForces()); cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL); } else @@ -1400,26 +1412,25 @@ void do_force(FILE *fplog, if (useGpuFBufOps == BufferOpsUseGpu::True && haveCpuForces) { - nbv->launch_copy_f_to_gpu(forceOut.f(), Nbnxm::AtomLocality::NonLocal); + nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::NonLocal); } // flag to specify if forces should be accumulated in force buffer // ops. For non-local part, this just depends on whether CPU forces are present. 
bool accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) && haveCpuForces; nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal, - forceOut.f(), pme_gpu_get_device_f(fr->pmedata), + forceWithShiftForces.force(), pme_gpu_get_device_f(fr->pmedata), pme_gpu_get_f_ready_synchronizer(fr->pmedata), useGpuFBufOps, useGpuFPmeReduction, accumulateForce); - if (useGpuFBufOps == BufferOpsUseGpu::True) { - nbv->launch_copy_f_from_gpu(forceOut.f(), Nbnxm::AtomLocality::NonLocal); + nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::NonLocal); } if (fr->nbv->emulateGpu() && (flags & GMX_FORCE_VIRIAL)) { - nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat.get(), - fr->fshift); + nbnxn_atomdata_add_nbat_fshift_to_fshift(*nbv->nbat, + forceWithShiftForces.shiftForces()); } } } @@ -1439,7 +1450,7 @@ void do_force(FILE *fplog, { nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::NonLocal); } - dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle); + dd_move_f(cr->dd, &forceOut.forceWithShiftForces(), wcycle); } } @@ -1449,7 +1460,7 @@ void do_force(FILE *fplog, (useGpuFBufOps == BufferOpsUseGpu::False)); if (alternateGpuWait) { - alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &force, &forceOut.forceWithVirial(), fr->fshift, enerd, + alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd, flags, pmeFlags, wcycle); } @@ -1473,7 +1484,7 @@ void do_force(FILE *fplog, flags, Nbnxm::AtomLocality::Local, enerd->grpp.ener[egLJSR].data(), enerd->grpp.ener[egCOULSR].data(), - fr->fshift); + forceOut.forceWithShiftForces().shiftForces()); float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); if (ddBalanceRegionHandler.useBalancingRegion()) @@ -1508,6 +1519,8 @@ void do_force(FILE *fplog, * on the non-alternating path. */ if (bUseOrEmulGPU && !alternateGpuWait) { + gmx::ArrayRef force = forceOut.forceWithShiftForces().force(); + rvec *f = as_rvec_array(force.data()); // TODO: move these steps as early as possible: // - CPU f H2D should be as soon as all CPU-side forces are done @@ -1516,7 +1529,7 @@ void do_force(FILE *fplog, // if (useGpuFBufOps == BufferOpsUseGpu::True && (haveCpuForces || DOMAINDECOMP(cr))) { - nbv->launch_copy_f_to_gpu(forceOut.f(), Nbnxm::AtomLocality::Local); + nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::Local); } // flag to specify if forces should be accumulated in force // buffer ops. For local part, this depends on whether CPU @@ -1526,13 +1539,12 @@ void do_force(FILE *fplog, bool accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) && (haveCpuForces || DOMAINDECOMP(cr)); nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, - forceOut.f(), pme_gpu_get_device_f(fr->pmedata), + force, pme_gpu_get_device_f(fr->pmedata), pme_gpu_get_f_ready_synchronizer(fr->pmedata), useGpuFBufOps, useGpuFPmeReduction, accumulateForce); - if (useGpuFBufOps == BufferOpsUseGpu::True) { - nbv->launch_copy_f_from_gpu(forceOut.f(), Nbnxm::AtomLocality::Local); + nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::Local); nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local); } } @@ -1551,19 +1563,23 @@ void do_force(FILE *fplog, if (bDoForces) { + rvec *f = as_rvec_array(forceOut.forceWithShiftForces().force().data()); + /* If we have NoVirSum forces, but we do not calculate the virial, * we sum fr->f_novirsum=forceOut.f later. 
*/ if (vsite && !(fr->haveDirectVirialContributions && !(flags & GMX_FORCE_VIRIAL))) { - spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), forceOut.f(), fr->fshift, FALSE, nullptr, nrnb, + rvec *fshift = as_rvec_array(forceOut.forceWithShiftForces().shiftForces().data()); + spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fshift, FALSE, nullptr, nrnb, &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle); } if (flags & GMX_FORCE_VIRIAL) { /* Calculation of the virial must be done after vsites! */ - calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), forceOut.f(), + calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()), + forceOut.forceWithShiftForces(), vir_force, graph, box, nrnb, fr, inputrec->ePBC); } } diff --git a/src/gromacs/mdtypes/forceoutput.h b/src/gromacs/mdtypes/forceoutput.h index d43cb69fa6..c81319bfb8 100644 --- a/src/gromacs/mdtypes/forceoutput.h +++ b/src/gromacs/mdtypes/forceoutput.h @@ -83,10 +83,16 @@ class ForceWithShiftForces computeVirial_(computeVirial), shiftForces_(shiftForces) {} - //! Returns a, deprecated, rvec pointer to the force buffer - rvec *f() + //! Returns an arrayref to the force buffer without padding + gmx::ArrayRef force() { - return as_rvec_array(force_.paddedArrayRef().data()); + return force_.unpaddedArrayRef(); + } + + //! Returns a const arrayref to the force buffer without padding + gmx::ArrayRef force() const + { + return force_.unpaddedConstArrayRef(); } //! Returns whether the virial needs to be computed @@ -101,6 +107,12 @@ class ForceWithShiftForces return shiftForces_; } + //! Returns a const shift forces buffer + gmx::ArrayRef shiftForces() const + { + return shiftForces_; + } + private: //! The force buffer gmx::ArrayRefWithPadding force_; @@ -203,10 +215,10 @@ class ForceOutputs forceWithShiftForces_(forceWithShiftForces), forceWithVirial_(forceWithVirial) {} - //! Returns a, deprecated, rvec pointer to the force buffer for use with shift forces - rvec *f() + //! Returns a reference to the force with shift forces object + ForceWithShiftForces &forceWithShiftForces() { - return forceWithShiftForces_.f(); + return forceWithShiftForces_; } //! 
Returns a reference to the force with virial object diff --git a/src/gromacs/mdtypes/forcerec.h b/src/gromacs/mdtypes/forcerec.h index ef706b94b1..9849889571 100644 --- a/src/gromacs/mdtypes/forcerec.h +++ b/src/gromacs/mdtypes/forcerec.h @@ -231,8 +231,8 @@ struct t_forcerec { // NOLINT (clang-analyzer-optin.performance.Padding) /* PME/Ewald stuff */ struct gmx_ewald_tab_t *ewald_table = nullptr; - /* Shift force array for computing the virial */ - rvec *fshift = nullptr; + /* Shift force array for computing the virial, size SHIFTS */ + std::vector shiftForces; /* Non bonded Parameter lists */ int ntype = 0; /* Number of atom types */ diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp index dd0ccb2ce7..41caa8d4b2 100644 --- a/src/gromacs/nbnxm/atomdata.cpp +++ b/src/gromacs/nbnxm/atomdata.cpp @@ -1532,10 +1532,10 @@ void reduceForces(nbnxn_atomdata_t *nbat, bool useGpuFPmeReduction, bool accumulateForce); -void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat, - rvec *fshift) +void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t &nbat, + gmx::ArrayRef fshift) { - gmx::ArrayRef outputBuffers = nbat->out; + gmx::ArrayRef outputBuffers = nbat.out; for (int s = 0; s < SHIFTS; s++) { @@ -1547,7 +1547,7 @@ void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat, sum[YY] += out.fshift[s*DIM+YY]; sum[ZZ] += out.fshift[s*DIM+ZZ]; } - rvec_inc(fshift[s], sum); + fshift[s] += sum; } } diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h index e34c8bd558..6bbe082c0b 100644 --- a/src/gromacs/nbnxm/atomdata.h +++ b/src/gromacs/nbnxm/atomdata.h @@ -374,8 +374,8 @@ void reduceForces(nbnxn_atomdata_t *nbat, bool accumulateForce); /* Add the fshift force stored in nbat to fshift */ -void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat, - rvec *fshift); +void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t &nbat, + gmx::ArrayRef fshift); /* Get the atom start index and number of atoms for a given locality */ void nbnxn_get_atom_range(Nbnxm::AtomLocality atomLocality, diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h index 4f4edddbeb..fc5977c7c3 100644 --- a/src/gromacs/nbnxm/gpu_common.h +++ b/src/gromacs/nbnxm/gpu_common.h @@ -365,13 +365,13 @@ gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings, //TODO: move into shared source file with gmx_compile_cpp_as_cuda //NOLINTNEXTLINE(misc-definitions-in-headers) -bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb, - const int flags, - const AtomLocality aloc, - real *e_lj, - real *e_el, - rvec *fshift, - GpuTaskCompletion completionKind) +bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb, + const int flags, + const AtomLocality aloc, + real *e_lj, + real *e_el, + gmx::ArrayRef shiftForces, + GpuTaskCompletion completionKind) { GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); @@ -404,7 +404,8 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb, gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner, nb->bDoTime != 0); - gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift); + gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, + e_lj, e_el, as_rvec_array(shiftForces.data())); } /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). 
*/ @@ -429,17 +430,17 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb, * \param[in] aloc Atom locality identifier * \param[out] e_lj Pointer to the LJ energy output to accumulate into * \param[out] e_el Pointer to the electrostatics energy output to accumulate into - * \param[out] fshift Pointer to the shift force buffer to accumulate into + * \param[out] shiftForces Shift forces buffer to accumulate into */ //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file -void gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb, - int flags, - AtomLocality aloc, - real *e_lj, - real *e_el, - rvec *fshift) +void gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb, + int flags, + AtomLocality aloc, + real *e_lj, + real *e_el, + gmx::ArrayRef shiftForces) { - gpu_try_finish_task(nb, flags, aloc, e_lj, e_el, fshift, + gpu_try_finish_task(nb, flags, aloc, e_lj, e_el, shiftForces, GpuTaskCompletion::Wait); } diff --git a/src/gromacs/nbnxm/kerneldispatch.cpp b/src/gromacs/nbnxm/kerneldispatch.cpp index a600029a2b..e02b649739 100644 --- a/src/gromacs/nbnxm/kerneldispatch.cpp +++ b/src/gromacs/nbnxm/kerneldispatch.cpp @@ -45,6 +45,7 @@ #include "gromacs/mdlib/force_flags.h" #include "gromacs/mdlib/gmx_omp_nthreads.h" #include "gromacs/mdtypes/enerdata.h" +#include "gromacs/mdtypes/forceoutput.h" #include "gromacs/mdtypes/inputrec.h" #include "gromacs/mdtypes/interaction_const.h" #include "gromacs/mdtypes/md_enums.h" @@ -518,9 +519,9 @@ nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality void nonbonded_verlet_t::dispatchFreeEnergyKernel(Nbnxm::InteractionLocality iLocality, - t_forcerec *fr, + const t_forcerec *fr, rvec x[], - rvec f[], + gmx::ForceWithShiftForces *forceWithShiftForces, const t_mdatoms &mdatoms, t_lambda *fepvals, real *lambda, @@ -572,7 +573,8 @@ nonbonded_verlet_t::dispatchFreeEnergyKernel(Nbnxm::InteractionLocality iLocali try { gmx_nb_free_energy_kernel(nbl_fep[th].get(), - x, f, fr, &mdatoms, &kernel_data, nrnb); + x, forceWithShiftForces, + fr, &mdatoms, &kernel_data, nrnb); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; } @@ -613,7 +615,8 @@ nonbonded_verlet_t::dispatchFreeEnergyKernel(Nbnxm::InteractionLocality iLocali try { gmx_nb_free_energy_kernel(nbl_fep[th].get(), - x, f, fr, &mdatoms, &kernel_data, nrnb); + x, forceWithShiftForces, + fr, &mdatoms, &kernel_data, nrnb); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; } diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index dfe8c4e936..3e4990a6ac 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -160,7 +160,7 @@ gmx::ArrayRef nonbonded_verlet_t::getGridIndices() const void nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality locality, - rvec *f) + gmx::ArrayRef force) { /* Skip the reduction if there was no short-range GPU work to do @@ -173,7 +173,7 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS); - reduceForces(nbat.get(), locality, pairSearch_->gridSet(), f, nullptr, nullptr, gpu_nbv, false, false); + reduceForces(nbat.get(), locality, pairSearch_->gridSet(), as_rvec_array(force.data()), nullptr, nullptr, gpu_nbv, false, false); wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS); wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); @@ -181,7 +181,7 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality void nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality locality, - rvec *f, + gmx::ArrayRef force, void 
*fPmeDeviceBuffer, GpuEventSynchronizer *pmeForcesReady, BufferOpsUseGpu useGpu, @@ -206,7 +206,7 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS); auto fn = useGpu == BufferOpsUseGpu::True ? reduceForces : reduceForces; - fn(nbat.get(), locality, pairSearch_->gridSet(), f, fPmeDeviceBuffer, pmeForcesReady, gpu_nbv, useGpuFPmeReduction, accumulateForce); + fn(nbat.get(), locality, pairSearch_->gridSet(), as_rvec_array(force.data()), fPmeDeviceBuffer, pmeForcesReady, gpu_nbv, useGpuFPmeReduction, accumulateForce); wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS); wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index cf0b2f62ab..86394f02ae 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -139,6 +139,7 @@ class GpuEventSynchronizer; namespace gmx { +class ForceWithShiftForces; class MDLogger; class UpdateGroupsCog; } @@ -293,9 +294,9 @@ struct nonbonded_verlet_t //! Executes the non-bonded free-energy kernel, always runs on the CPU void dispatchFreeEnergyKernel(Nbnxm::InteractionLocality iLocality, - t_forcerec *fr, + const t_forcerec *fr, rvec x[], - rvec f[], + gmx::ForceWithShiftForces *forceWithShiftForces, const t_mdatoms &mdatoms, t_lambda *fepvals, real *lambda, @@ -305,14 +306,14 @@ struct nonbonded_verlet_t /*! \brief Add the forces stored in nbat to f, zeros the forces in nbat * \param [in] locality Local or non-local - * \param [inout] f Force to be added to + * \param [inout] force Force to be added to */ void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality locality, - rvec *f); + gmx::ArrayRef force); /*! \brief Add the forces stored in nbat to f, allowing for possibility that GPU buffer ops are active * \param [in] locality Local or non-local - * \param [inout] f Force to be added to + * \param [inout] force Force to be added to * \param [in] fPme Force from PME calculation * \param [in] pmeForcesReady Event triggered when PME force calculation has completed * \param [in] useGpu Whether GPU buffer ops are active @@ -320,7 +321,7 @@ struct nonbonded_verlet_t * \param [in] accumulateForce Whether force should be accumulated or stored */ void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality locality, - rvec *f, + gmx::ArrayRef force, void *fPme, GpuEventSynchronizer *pmeForcesReady, BufferOpsUseGpu useGpu, diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index 0f0e8de218..46b1dc4c50 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -174,18 +174,18 @@ void gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb, * \param[in] aloc Atom locality identifier * \param[out] e_lj Pointer to the LJ energy output to accumulate into * \param[out] e_el Pointer to the electrostatics energy output to accumulate into - * \param[out] fshift Pointer to the shift force buffer to accumulate into + * \param[out] shiftForces Shift forces buffer to accumulate into * \param[in] completionKind Indicates whether nnbonded task completion should only be checked rather than waited for * \returns True if the nonbonded tasks associated with \p aloc locality have completed */ GPU_FUNC_QUALIFIER -bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, - int gmx_unused flags, - AtomLocality gmx_unused aloc, - real gmx_unused *e_lj, - real gmx_unused *e_el, - rvec gmx_unused *fshift, - GpuTaskCompletion gmx_unused completionKind) GPU_FUNC_TERM_WITH_RETURN(false); +bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused 
*nb, + int gmx_unused flags, + AtomLocality gmx_unused aloc, + real gmx_unused *e_lj, + real gmx_unused *e_el, + gmx::ArrayRef gmx_unused shiftForces, + GpuTaskCompletion gmx_unused completionKind) GPU_FUNC_TERM_WITH_RETURN(false); /*! \brief Completes the nonbonded GPU task blocking until GPU tasks and data * transfers to finish. @@ -199,15 +199,15 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, * \param[in] aloc Atom locality identifier * \param[out] e_lj Pointer to the LJ energy output to accumulate into * \param[out] e_el Pointer to the electrostatics energy output to accumulate into - * \param[out] fshift Pointer to the shift force buffer to accumulate into + * \param[out] shiftForces Shift forces buffer to accumulate into */ GPU_FUNC_QUALIFIER -void gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, - int gmx_unused flags, - AtomLocality gmx_unused aloc, - real gmx_unused *e_lj, - real gmx_unused *e_el, - rvec gmx_unused *fshift) GPU_FUNC_TERM; +void gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, + int gmx_unused flags, + AtomLocality gmx_unused aloc, + real gmx_unused *e_lj, + real gmx_unused *e_el, + gmx::ArrayRef gmx_unused shiftForces) GPU_FUNC_TERM; /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */ GPU_FUNC_QUALIFIER -- 2.22.0
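
For readers who want the new access pattern in isolation, the following is a minimal, self-contained C++ sketch of the idea behind this change; it is not the real GROMACS code. The real classes work on gmx::ArrayRefWithPadding/gmx::ArrayRef views of gmx::RVec buffers and a shift-force buffer of SHIFTS entries; here only the class name ForceWithShiftForces and its force(), shiftForces() and computeVirial() accessors mirror the patch, while Rvec, c_numShiftVectors, addPairForce and main are hypothetical stand-ins. The point illustrated: a kernel receives one output object and obtains both the force buffer and the shift-force buffer through it, instead of taking a raw rvec* force pointer plus the separate fr->fshift array that this patch removes.

#include <array>
#include <cstdio>
#include <vector>

// Hypothetical stand-ins: gmx::RVec is a real-valued 3-vector and SHIFTS is
// the fixed number of periodic shift vectors; the names and value here are
// illustrative only.
using Rvec = std::array<double, 3>;
constexpr int c_numShiftVectors = 45;

// Simplified analogue of gmx::ForceWithShiftForces: it owns no storage, it
// only bundles views of the force buffer and the shift-force buffer so that
// a kernel no longer needs a raw force pointer plus a separate fr->fshift.
class ForceWithShiftForces
{
    public:
        ForceWithShiftForces(std::vector<Rvec> *force,
                             bool               computeVirial,
                             std::vector<Rvec> *shiftForces) :
            force_(force), computeVirial_(computeVirial), shiftForces_(shiftForces)
        {
        }

        //! The force buffer
        std::vector<Rvec> &force() { return *force_; }
        //! Whether shift forces (and thus the virial) are requested
        bool computeVirial() const { return computeVirial_; }
        //! The shift-force buffer, one entry per periodic shift vector
        std::vector<Rvec> &shiftForces() { return *shiftForces_; }

    private:
        std::vector<Rvec> *force_;
        bool               computeVirial_;
        std::vector<Rvec> *shiftForces_;
};

// A toy "kernel": adds a pair force to both atoms and, when the virial is
// requested, accumulates the same force into the shift-force slot of the
// periodic image in which atom j was found.
static void addPairForce(ForceWithShiftForces *out,
                         int i, int j, int shiftIndex, const Rvec &fij)
{
    std::vector<Rvec> &f = out->force();
    for (int d = 0; d < 3; d++)
    {
        f[i][d] += fij[d];
        f[j][d] -= fij[d];
    }
    if (out->computeVirial())
    {
        std::vector<Rvec> &fshift = out->shiftForces();
        for (int d = 0; d < 3; d++)
        {
            fshift[shiftIndex][d] += fij[d];
        }
    }
}

int main()
{
    std::vector<Rvec> force(4, Rvec{ 0, 0, 0 });
    // Cleared before use, as do_force() does for fr->shiftForces in the patch
    std::vector<Rvec> shiftForces(c_numShiftVectors, Rvec{ 0, 0, 0 });

    ForceWithShiftForces forceOut(&force, /*computeVirial=*/true, &shiftForces);
    addPairForce(&forceOut, 0, 1, /*shiftIndex=*/3, Rvec{ 0.5, -0.2, 0.1 });

    std::printf("f[0]      = %g %g %g\n", force[0][0], force[0][1], force[0][2]);
    std::printf("fshift[3] = %g %g %g\n",
                shiftForces[3][0], shiftForces[3][1], shiftForces[3][2]);
    return 0;
}

Bundling the two buffers in one object is also what lets dd_move_f and the GPU wait/reduce functions in the patch drop their separate fshift parameters and decide from computeVirial() whether shift forces need to be accumulated at all.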