From: Artem Zhmurov
Date: Wed, 11 Sep 2019 12:19:31 +0000 (+0200)
Subject: Decouple GPU force buffer management from buffer ops in NBNXM
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=873bf0800f52e3d6c5998dae27e64f1f6e18235a;p=alexxy%2Fgromacs.git

Decouple GPU force buffer management from buffer ops in NBNXM

When GPU-side buffer operations are used, the total forces on the
device are accumulated by the NBNXM module in its local GPU buffer.
By decoupling the CPU and GPU buffer operations and making the force
buffer an argument of the reduction function, this commit allows the
responsibility for GPU force management to be moved from the NBNXM
module to a third-party object. This commit is a refactoring of the
code in preparation for the introduction of the GPU-side
PropagatorStateData object.

TODO: Use DeviceBuffer when passing the PME GPU forces buffer.

Refs. #2816

Change-Id: I2a1f9d12fad3fb5b2ce37ca3ed3d0cb91777c468
---

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 853f97b021..768ca45777 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1455,14 +1455,22 @@ void do_force(FILE *fplog,
         // flag to specify if forces should be accumulated in force buffer
         // ops. For non-local part, this just depends on whether CPU forces are present.
         bool accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) && haveCpuForces;
-        nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal,
-                                      forceWithShiftForces.force(), pme_gpu_get_device_f(fr->pmedata),
-                                      pme_gpu_get_f_ready_synchronizer(fr->pmedata),
-                                      useGpuFBufOps, useGpuFPmeReduction, accumulateForce);
+        if (useGpuFBufOps == BufferOpsUseGpu::True)
+        {
+            nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::NonLocal,
+                                              nbv->getDeviceForces(),
+                                              pme_gpu_get_device_f(fr->pmedata),
+                                              pme_gpu_get_f_ready_synchronizer(fr->pmedata),
+                                              useGpuFPmeReduction, accumulateForce);
             nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::NonLocal);
         }
+        else
+        {
+            nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal,
+                                          forceWithShiftForces.force());
+        }
+
         if (fr->nbv->emulateGpu() && forceFlags.computeVirial)
         {
@@ -1574,15 +1582,22 @@ void do_force(FILE *fplog,
         // non-local part).
         bool accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) && (haveCpuForces || DOMAINDECOMP(cr));
-        nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
-                                      force, pme_gpu_get_device_f(fr->pmedata),
-                                      pme_gpu_get_f_ready_synchronizer(fr->pmedata),
-                                      useGpuFBufOps, useGpuFPmeReduction, accumulateForce);
+        if (useGpuFBufOps == BufferOpsUseGpu::True)
+        {
+            nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::Local,
+                                              nbv->getDeviceForces(),
+                                              pme_gpu_get_device_f(fr->pmedata),
+                                              pme_gpu_get_f_ready_synchronizer(fr->pmedata),
+                                              useGpuFPmeReduction, accumulateForce);
             nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::Local);
             nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
         }
+        else
+        {
+            nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, force);
+        }
+
     }

     launchGpuEndOfStepTasks(nbv, fr->gpuBonded, fr->pmedata, enerd,

diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp
index 6424b1558e..0561462633 100644
--- a/src/gromacs/nbnxm/atomdata.cpp
+++ b/src/gromacs/nbnxm/atomdata.cpp
@@ -1472,16 +1472,10 @@ static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbnxn_atomdata_t *nbat,

 /* Add the force array(s) from nbnxn_atomdata_t to f */
-template <bool useGpu>
 void reduceForces(nbnxn_atomdata_t          *nbat,
                   const Nbnxm::AtomLocality  locality,
                   const Nbnxm::GridSet      &gridSet,
-                  rvec                      *f,
-                  void                      *pmeFDeviceBuffer,
-                  GpuEventSynchronizer      *pmeForcesReady,
-                  gmx_nbnxn_gpu_t           *gpu_nbv,
-                  bool                       useGpuFPmeReduction,
-                  bool                       accumulateForce)
+                  rvec                      *f)
 {
     int a0 = 0;
     int na = 0;
@@ -1494,75 +1488,77 @@ void reduceForces(nbnxn_atomdata_t *nbat,
         return;
     }

-    if (useGpu)
-    {
-        Nbnxm::nbnxn_gpu_add_nbat_f_to_f(locality,
-                                         gpu_nbv,
-                                         pmeFDeviceBuffer,
-                                         pmeForcesReady,
-                                         a0, na,
-                                         useGpuFPmeReduction,
-                                         accumulateForce);
-    }
-    else
-    {
-        int nth = gmx_omp_nthreads_get(emntNonbonded);
+    int nth = gmx_omp_nthreads_get(emntNonbonded);

-        if (nbat->out.size() > 1)
+    if (nbat->out.size() > 1)
+    {
+        if (locality != Nbnxm::AtomLocality::All)
         {
-            if (locality != Nbnxm::AtomLocality::All)
-            {
-                gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
-            }
+            gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
+        }

+        /* Reduce the force thread output buffers into buffer 0, before adding
+         * them to the, differently ordered, "real" force buffer.
+         */
+        if (nbat->bUseTreeReduce)
+        {
+            nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat, nth);
+        }
+        else
+        {
+            nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat, nth);
+        }
+    }

 #pragma omp parallel for num_threads(nth) schedule(static)
-        for (int th = 0; th < nth; th++)
+    for (int th = 0; th < nth; th++)
+    {
+        try
         {
-            try
-            {
-                nbnxn_atomdata_add_nbat_f_to_f_part(gridSet, *nbat,
-                                                    nbat->out[0],
-                                                    a0 + ((th + 0)*na)/nth,
-                                                    a0 + ((th + 1)*na)/nth,
-                                                    f);
-            }
-            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+            nbnxn_atomdata_add_nbat_f_to_f_part(gridSet, *nbat,
+                                                nbat->out[0],
+                                                a0 + ((th + 0)*na)/nth,
+                                                a0 + ((th + 1)*na)/nth,
+                                                f);
         }
+        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+    }
+}
+
+/* Add the force array(s) from nbnxn_atomdata_t to the total GPU force buffer */
+void reduceForcesGpu(const Nbnxm::AtomLocality  locality,
+                     DeviceBuffer<float>        totalForcesDevice,
+                     const Nbnxm::GridSet      &gridSet,
+                     void                      *pmeForcesDevice,
+                     GpuEventSynchronizer      *pmeForcesReady,
+                     gmx_nbnxn_gpu_t           *gpu_nbv,
+                     bool                       useGpuFPmeReduction,
+                     bool                       accumulateForce)
+{
+    int atomsStart = 0;
+    int numAtoms   = 0;
+
+    nbnxn_get_atom_range(locality, gridSet, &atomsStart, &numAtoms);
+
+    if (numAtoms == 0)
+    {
+        /* There are no atoms for this reduction, avoid some overhead */
+        return;
+    }
+
+    Nbnxm::nbnxn_gpu_add_nbat_f_to_f(locality,
+                                     totalForcesDevice,
+                                     gpu_nbv,
+                                     pmeForcesDevice,
+                                     pmeForcesReady,
+                                     atomsStart, numAtoms,
+                                     useGpuFPmeReduction,
+                                     accumulateForce);
+}
+
+DeviceBuffer<float> nbnxn_atomdata_get_f_gpu(gmx_nbnxn_gpu_t *gpu_nbv)
+{
+    return Nbnxm::nbnxn_gpu_get_f_gpu(gpu_nbv);
+}

-template
-void reduceForces<true>(nbnxn_atomdata_t          *nbat,
-                        const Nbnxm::AtomLocality  locality,
-                        const Nbnxm::GridSet      &gridSet,
-                        rvec                      *f,
-                        void                      *fpme,
-                        GpuEventSynchronizer      *pmeForcesReady,
-                        gmx_nbnxn_gpu_t           *gpu_nbv,
-                        bool                       useGpuFPmeReduction,
-                        bool                       accumulateForce);
-
-template
-void reduceForces<false>(nbnxn_atomdata_t          *nbat,
-                         const Nbnxm::AtomLocality  locality,
-                         const Nbnxm::GridSet      &gridSet,
-                         rvec                      *f,
-                         void                      *fpme,
-                         GpuEventSynchronizer      *pmeForcesReady,
-                         gmx_nbnxn_gpu_t           *gpu_nbv,
-                         bool                       useGpuFPmeReduction,
-                         bool                       accumulateForce);

 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t   &nbat,
                                               gmx::ArrayRef<gmx::RVec>  fshift)

diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h
index 488290bb5b..ba644bd436 100644
--- a/src/gromacs/nbnxm/atomdata.h
+++ b/src/gromacs/nbnxm/atomdata.h
@@ -367,40 +367,47 @@ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
                                     gmx_nbnxn_gpu_t      *gpu_nbv,
                                     DeviceBuffer<float>   coordinatesDevice);

-//! Add the computed forces to \p f, an internal reduction might be performed as well
-template <bool useGpu>
+/*! \brief Add the computed forces to \p f; an internal reduction might be performed as well
+ *
+ * \param[in]  nbat      Atom data in NBNXM format.
+ * \param[in]  locality  Whether the reduction should be performed on local or non-local atoms.
+ * \param[in]  gridSet   The grids data.
+ * \param[out] totalForce  Buffer to accumulate the resulting force.
+ */
 void reduceForces(nbnxn_atomdata_t          *nbat,
                   Nbnxm::AtomLocality        locality,
                   const Nbnxm::GridSet      &gridSet,
-                  rvec                      *f,
-                  void                      *pmeFDeviceBuffer,
-                  GpuEventSynchronizer      *pmeForcesReady,
-                  gmx_nbnxn_gpu_t           *gpu_nbv,
-                  bool                       useGpuFPmeReduction,
-                  bool                       accumulateForce);
-
-
-extern template
-void reduceForces<true>(nbnxn_atomdata_t          *nbat,
-                        const Nbnxm::AtomLocality  locality,
-                        const Nbnxm::GridSet      &gridSet,
-                        rvec                      *f,
-                        void                      *pmeFDeviceBuffer,
-                        GpuEventSynchronizer      *pmeForcesReady,
-                        gmx_nbnxn_gpu_t           *gpu_nbv,
-                        bool                       useGpuFPmeReduction,
-                        bool                       accumulateForce);
-
-extern template
-void reduceForces<false>(nbnxn_atomdata_t          *nbat,
-                         const Nbnxm::AtomLocality  locality,
-                         const Nbnxm::GridSet      &gridSet,
-                         rvec                      *f,
-                         void                      *pmeFDeviceBuffer,
-                         GpuEventSynchronizer      *pmeForcesReady,
-                         gmx_nbnxn_gpu_t           *gpu_nbv,
-                         bool                       useGpuFPmeReduction,
-                         bool                       accumulateForce);
+                  rvec                      *totalForce);
+
+/*! \brief Reduce forces on the GPU
+ *
+ * \param[in]  locality             Whether the reduction should be performed on local or non-local atoms.
+ * \param[out] totalForcesDevice    Device buffer to accumulate the resulting force.
+ * \param[in]  gridSet              The grids data.
+ * \param[in]  pmeForcesDevice      Device buffer with PME forces.
+ * \param[in]  pmeForcesReady       Event that signals when the PME forces are ready for the reduction.
+ * \param[in]  gpu_nbv              The NBNXM GPU data structure.
+ * \param[in]  useGpuFPmeReduction  Whether PME forces should be added.
+ * \param[in]  accumulateForce      Whether the total force buffer already contains useful data.
+ */
+void reduceForcesGpu(Nbnxm::AtomLocality        locality,
+                     DeviceBuffer<float>        totalForcesDevice,
+                     const Nbnxm::GridSet      &gridSet,
+                     void                      *pmeForcesDevice,
+                     GpuEventSynchronizer      *pmeForcesReady,
+                     gmx_nbnxn_gpu_t           *gpu_nbv,
+                     bool                       useGpuFPmeReduction,
+                     bool                       accumulateForce);
+
+/*! \brief Getter for the GPU forces buffer
+ *
+ * \todo Will be removed when the buffer management is lifted out of the NBNXM module.
+ *
+ * \param[in] gpu_nbv  The NBNXM GPU data structure.
+ *
+ * \returns Device forces buffer.
+ */
+DeviceBuffer<float> nbnxn_atomdata_get_f_gpu(gmx_nbnxn_gpu_t *gpu_nbv);

 /* Add the fshift force stored in nbat to fshift */
 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t &nbat,

diff --git a/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh b/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh
index bc0d2c2c5e..3c8f7b1cb3 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh
@@ -137,56 +137,59 @@ __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int numColu
 }

-/*! \brief CUDA kernel to add part of the force array(s) from nbnxn_atomdata_t to f
+/*! \brief CUDA kernel to sum up the force components
  *
- * \param[in]     fnb              Force in nbat format
- * \param[in]     fPmeDeviceBuffer PME force
- * \param[in,out] f                Force buffer to be reduced into
- * \param[in]     cell             Cell index mapping
- * \param[in]     atomStart        Start atom index
- * \param[in]     nAtoms           Number of Atoms
+ * \tparam        accumulateForce  Whether the initial forces in \p d_fTotal should be kept (accumulated into) rather than overwritten.
+ * \tparam        addPmeForce      Whether the PME force should be added to the total.
+ *
+ * \param[in]     d_fNB            Non-bonded forces in nbat format.
+ * \param[in]     d_fPme           PME forces.
+ * \param[in,out] d_fTotal         Force buffer to be reduced into.
+ * \param[in]     d_cell           Cell index mapping.
+ * \param[in]     atomStart        Start atom index.
+ * \param[in]     numAtoms         Number of atoms.
  */
-template <bool accumulateForce, bool addPmeF>
+template <bool accumulateForce, bool addPmeForce>
 __global__ void
-nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__  fnb,
-                                 const float3 *__restrict__  fPmeDeviceBuffer,
-                                 float3                     *f,
-                                 const int *__restrict__     cell,
-                                 const int                   atomStart,
-                                 const int                   nAtoms);
-template <bool accumulateForce, bool addPmeF>
+template <bool accumulateForce, bool addPmeForce>
 __global__ void
-nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__  fnb,
-                                 const float3 *__restrict__  fPmeDeviceBuffer,
-                                 float3                     *f,
-                                 const int *__restrict__     cell,
-                                 const int                   atomStart,
-                                 const int                   nAtoms)
+nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__  d_fNB,
+                                 const float3 *__restrict__  d_fPme,
+                                 float3                     *d_fTotal,
+                                 const int *__restrict__     d_cell,
+                                 const int                   atomStart,
+                                 const int                   numAtoms)
 {
     /* map particle-level parallelism to 1D CUDA thread and block index */
     int threadIndex = blockIdx.x*blockDim.x+threadIdx.x;

     /* perform addition for each particle */
-    if (threadIndex < nAtoms)
+    if (threadIndex < numAtoms)
     {
-        int     i     = cell[atomStart+threadIndex];
-        float3 *fDest = (float3 *)&f[atomStart+threadIndex];
+        int     i     = d_cell[atomStart+threadIndex];
+        float3 *fDest = (float3 *)&d_fTotal[atomStart+threadIndex];
         float3  temp;

         if (accumulateForce)
         {
             temp  = *fDest;
-            temp += fnb[i];
+            temp += d_fNB[i];
         }
         else
         {
-            temp = fnb[i];
+            temp = d_fNB[i];
         }
-        if (addPmeF)
+        if (addPmeForce)
         {
-            temp += fPmeDeviceBuffer[atomStart+threadIndex];
+            temp += d_fPme[atomStart+threadIndex];
         }
         *fDest = temp;

diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 63051b7c43..6a17ee6a9b 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -859,22 +859,22 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid,
 /* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */
 void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality     atomLocality,
+                               DeviceBuffer<float>    totalForcesDevice,
                                gmx_nbnxn_gpu_t       *nb,
-                               void                  *fPmeDevicePtr,
+                               void                  *pmeForcesDevice,
                                GpuEventSynchronizer  *pmeForcesReady,
                                int                    atomStart,
-                               int                    nAtoms,
+                               int                    numAtoms,
                                bool                   useGpuFPmeReduction,
                                bool                   accumulateForce)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

-    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
-    cudaStream_t              stream    = nb->stream[iLocality];
-    cu_atomdata_t            *adat      = nb->atdat;
-    bool                      addPmeF   = useGpuFPmeReduction;
+    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
+    cudaStream_t              stream    = nb->stream[iLocality];
+    cu_atomdata_t            *adat      = nb->atdat;

-    if (addPmeF)
+    if (useGpuFPmeReduction)
     {
         //Stream must wait for PME force completion
         pmeForcesReady->enqueueWaitEvent(stream);
@@ -886,7 +886,7 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality,
     config.blockSize[0]     = c_bufOpsThreadsPerBlock;
     config.blockSize[1]     = 1;
     config.blockSize[2]     = 1;
-    config.gridSize[0]      = ((nAtoms+1)+c_bufOpsThreadsPerBlock-1)/c_bufOpsThreadsPerBlock;
+    config.gridSize[0]      = ((numAtoms+1)+c_bufOpsThreadsPerBlock-1)/c_bufOpsThreadsPerBlock;
     config.gridSize[1]      = 1;
     config.gridSize[2]      = 1;
     config.sharedMemorySize = 0;
@@ -896,30 +896,35 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality,
                      nbnxn_gpu_add_nbat_f_to_f_kernel<true, false> :
                      nbnxn_gpu_add_nbat_f_to_f_kernel<false, false>;

-    if (addPmeF)
+    if (useGpuFPmeReduction)
     {
         kernelFn = accumulateForce ?
                    nbnxn_gpu_add_nbat_f_to_f_kernel<true, true> :
                    nbnxn_gpu_add_nbat_f_to_f_kernel<false, true>;
     }

-    const float3 *d_f      = adat->f;
-    float3       *d_fNB    = (float3*) nb->frvec;
-    const float3 *d_fPme   = (float3*) fPmeDevicePtr;
-    const int    *d_cell   = nb->cell;
+    const float3 *d_fNB    = adat->f;
+    const float3 *d_fPme   = (float3*) pmeForcesDevice;
+    float3       *d_fTotal = (float3*) totalForcesDevice;
+    const int    *d_cell   = nb->cell;

     const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config,
-                                                      &d_f,
-                                                      &d_fPme,
                                                       &d_fNB,
+                                                      &d_fPme,
+                                                      &d_fTotal,
                                                       &d_cell,
                                                       &atomStart,
-                                                      &nAtoms);
+                                                      &numAtoms);

     launchGpuKernel(kernelFn, config, nullptr, "FbufferOps", kernelArgs);
 }

+DeviceBuffer<float> nbnxn_gpu_get_f_gpu(gmx_nbnxn_gpu_t *nb)
+{
+    return reinterpret_cast<DeviceBuffer<float>>(nb->frvec);
+}
+
 void nbnxn_launch_copy_f_to_gpu(const AtomLocality     atomLocality,
                                 const Nbnxm::GridSet  &gridSet,
                                 gmx_nbnxn_gpu_t       *nb,

diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 794b184f36..68eac205a4 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -203,26 +203,22 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality
     wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
     wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);

-    reduceForces<false>(nbat.get(), locality, pairSearch_->gridSet(), as_rvec_array(force.data()), nullptr, nullptr, gpu_nbv, false, false);
+    reduceForces(nbat.get(), locality, pairSearch_->gridSet(), as_rvec_array(force.data()));

     wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
 }

 void
-nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality  locality,
-                                             gmx::ArrayRef<gmx::RVec>   force,
-                                             void                      *fPmeDeviceBuffer,
-                                             GpuEventSynchronizer      *pmeForcesReady,
-                                             BufferOpsUseGpu            useGpu,
-                                             bool                       useGpuFPmeReduction,
-                                             bool                       accumulateForce)
+nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const Nbnxm::AtomLocality  locality,
+                                                 DeviceBuffer<float>        totalForcesDevice,
+                                                 void                      *forcesPmeDevice,
+                                                 GpuEventSynchronizer      *pmeForcesReady,
+                                                 bool                       useGpuFPmeReduction,
+                                                 bool                       accumulateForce)
 {
-    GMX_ASSERT(!((useGpu == BufferOpsUseGpu::False) && accumulateForce),
-               "Accumulatation of force is only valid when GPU buffer ops are active");
-
-    GMX_ASSERT((useGpuFPmeReduction == (fPmeDeviceBuffer != nullptr)),
+    GMX_ASSERT((useGpuFPmeReduction == (forcesPmeDevice != nullptr)),
                "GPU PME force reduction is only valid when a non-null GPU PME force pointer is available");

     /* Skip the reduction if there was no short-range GPU work to do
@@ -235,8 +231,7 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality
     wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
     wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);

-    auto fn = useGpu == BufferOpsUseGpu::True ? reduceForces<true> : reduceForces<false>;
-    fn(nbat.get(), locality, pairSearch_->gridSet(), as_rvec_array(force.data()), fPmeDeviceBuffer, pmeForcesReady, gpu_nbv, useGpuFPmeReduction, accumulateForce);
+    reduceForcesGpu(locality, totalForcesDevice, pairSearch_->gridSet(), forcesPmeDevice, pmeForcesReady, gpu_nbv, useGpuFPmeReduction, accumulateForce);

     wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
@@ -259,6 +254,11 @@ nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu()
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
 }

+DeviceBuffer<float> nonbonded_verlet_t::getDeviceForces()
+{
+    return nbnxn_atomdata_get_f_gpu(gpu_nbv);
+}
+
 real nonbonded_verlet_t::pairlistInnerRadius() const
 {
     return pairlistSets_->params().rlistInner;

diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 50ead8c655..db550859df 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -354,22 +354,30 @@ struct nonbonded_verlet_t
     void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality       locality,
                                   gmx::ArrayRef<gmx::RVec>  force);

-    /*! \brief Add the forces stored in nbat to f, allowing for possibility that GPU buffer ops are active
-     * \param [in]    locality             Local or non-local
-     * \param [inout] force                Force to be added to
-     * \param [in]    fPme                 Force from PME calculation
-     * \param [in]    pmeForcesReady       Event triggered when PME force calculation has completed
-     * \param [in]    useGpu               Whether GPU buffer ops are active
-     * \param [in]    useGpuFPmeReduction  Whether PME force reduction is on GPU
-     * \param [in]    accumulateForce      Whether force should be accumulated or stored
+    /*! \brief Add the forces stored in nbat to the total force using GPU buffer ops
+     *
+     * \param [in]     locality             Local or non-local
+     * \param [in,out] totalForcesDevice    Device buffer the forces are added to
+     * \param [in]     forcesPmeDevice      Device buffer with PME forces
+     * \param [in]     pmeForcesReady       Event triggered when PME force calculation has completed
+     * \param [in]     useGpuFPmeReduction  Whether PME forces should be added
+     * \param [in]     accumulateForce      Whether the total force buffer already contains data
      */
-    void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality        locality,
-                                  gmx::ArrayRef<gmx::RVec>   force,
-                                  void                      *fPme,
-                                  GpuEventSynchronizer      *pmeForcesReady,
-                                  BufferOpsUseGpu            useGpu,
-                                  bool                       useGpuFPmeReduction,
-                                  bool                       accumulateForce);
+    void atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality        locality,
+                                      DeviceBuffer<float>        totalForcesDevice,
+                                      void                      *forcesPmeDevice,
+                                      GpuEventSynchronizer      *pmeForcesReady,
+                                      bool                       useGpuFPmeReduction,
+                                      bool                       accumulateForce);
+
+    /*! \brief Getter for the GPU force buffer.
+     *
+     * \todo This function will be removed in future patches as the management of the device buffers
+     *       is moved to a separate object.
+     *
+     * \returns The force buffer in plain rvec format.
+     */
+    DeviceBuffer<float> getDeviceForces();

     /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
     void atomdata_init_add_nbat_f_to_f_gpu();

diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index 4dfcfa15d4..8721e40da6 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -324,17 +324,44 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused *cell,
                                     gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
                                     int gmx_unused              natoms_total) CUDA_FUNC_TERM;

-/*! \brief F buffer operations on GPU: adds nb format force to rvec format. */
+/*! \brief Force buffer operations on GPU.
+ *
+ * Transforms the non-bonded forces into plain rvec format and adds all the force components to the
+ * total force buffer.
+ *
+ * \param[in]  atomLocality         Whether the reduction should be performed on local or non-local atoms.
+ * \param[out] totalForcesDevice    Device buffer to accumulate the resulting force.
+ * \param[in]  gpu_nbv              The NBNXM GPU data structure.
+ * \param[in]  pmeForcesDevice      Device buffer with PME forces.
+ * \param[in]  pmeForcesReady       Event that signals when the PME forces are ready for the reduction.
+ * \param[in]  atomStart            Index of the first atom to reduce forces for.
+ * \param[in]  numAtoms             Number of atoms to reduce forces for.
+ * \param[in]  useGpuFPmeReduction  Whether PME forces should be added.
+ * \param[in]  accumulateForce      Whether the total force buffer already contains useful data.
+ *
+ */
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_add_nbat_f_to_f(AtomLocality gmx_unused           atomLocality,
+                               DeviceBuffer<float> gmx_unused    totalForcesDevice,
                                gmx_nbnxn_gpu_t gmx_unused       *gpu_nbv,
-                               void gmx_unused                  *fPmeDevicePtr,
+                               void gmx_unused                  *pmeForcesDevice,
                                GpuEventSynchronizer gmx_unused  *pmeForcesReady,
                                int gmx_unused                    atomStart,
-                               int gmx_unused                    nAtoms,
+                               int gmx_unused                    numAtoms,
                                bool gmx_unused                   useGpuFPmeReduction,
                                bool gmx_unused                   accumulateForce) CUDA_FUNC_TERM;

+/*! \brief Getter for the device forces buffer.
+ *
+ * \todo This will be removed as the management of the buffers is taken out of the NBNXM module.
+ *
+ * \param[in] gpu_nbv  The nonbonded data GPU structure.
+ *
+ * \returns Device forces buffer in plain rvec format.
+ */
+CUDA_FUNC_QUALIFIER
+DeviceBuffer<float> nbnxn_gpu_get_f_gpu(gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer<float> {});
+
 /*! \brief Copy force buffer from CPU to GPU */
 CUDA_FUNC_QUALIFIER
 void nbnxn_launch_copy_f_to_gpu(AtomLocality gmx_unused atomLocality,
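
For reference, the caller-side pattern this change establishes in do_force(): the CPU and GPU
reductions are now dispatched explicitly, and the GPU variant takes its destination device
buffer as an argument, so any owner of a device force buffer can be substituted for
nbv->getDeviceForces(). A minimal sketch for the local atoms, assuming the surrounding
sim_util.cpp context (nbv, fr, f, force, and the flags are set up as in the hunks above):

    if (useGpuFBufOps == BufferOpsUseGpu::True)
    {
        /* GPU path: reduce on the device into an explicitly passed buffer. */
        nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::Local,
                                          nbv->getDeviceForces(),            // destination device buffer
                                          pme_gpu_get_device_f(fr->pmedata), // PME forces on the device
                                          pme_gpu_get_f_ready_synchronizer(fr->pmedata),
                                          useGpuFPmeReduction,               // fold PME forces in on the GPU
                                          accumulateForce);                  // keep existing buffer contents
        nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::Local);
        nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
    }
    else
    {
        /* CPU path: OpenMP tree/standard reduction into the rvec force buffer. */
        nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, force);
    }

Once a PropagatorStateData-style object owns the device force buffer, its getter would replace
nbv->getDeviceForces() in the call above, with no further changes needed on the NBNXM side.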