From 8e83edea6ef6a77c059f913c77eb3834f87f6abc Mon Sep 17 00:00:00 2001
From: Alan Gray
Date: Fri, 8 Mar 2019 01:05:47 -0800
Subject: [PATCH] F buffer operations in CUDA

This patch performs GPU buffer ops for force buffers, enabled with the
GMX_USE_GPU_BUFFER_OPS env variable.

Currently, the H2D transfer of the force buffer is switched on when
haveSpecialForces || haveCpuBondedWork || haveCpuPmeWork is true, where
haveCpuPmeWork remains true even when useGpuPme == true, until on-GPU
PME-nonbonded reduction is added in a follow-up.

TODO: enable PME reduction in GPU buffer ops and remove the associated
H2D transfer.

Implements part of #2817

Change-Id: Ice984425301d24bac1340e883698244489cd686e
---
 src/gromacs/mdlib/sim_util.cpp                | 100 +++++++++---
 src/gromacs/nbnxm/atomdata.cpp                | 135 +++++++++++------
 src/gromacs/nbnxm/atomdata.h                  |  37 ++++-
 .../nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh   |  51 ++++++-
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu          | 142 +++++++++++++++++-
 .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu        |  24 +++
 src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h     |  14 +-
 src/gromacs/nbnxm/nbnxm.cpp                   |  59 +++++++-
 src/gromacs/nbnxm/nbnxm.h                     |  38 ++++-
 src/gromacs/nbnxm/nbnxm_gpu.h                 |  39 ++++-
 src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp        |   9 +-
 11 files changed, 551 insertions(+), 97 deletions(-)

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 8ab56d27ce..766816c4e7 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -120,12 +120,11 @@
 // PME-first ordering would suffice).
 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);

-// environment variable to enable GPU buffer ops, to alow incremental and optional
+// environment variable to enable GPU buffer ops, to allow incremental and optional
 // introduction of this functionality.
 // TODO eventially tie this in with other existing GPU flags.
 static const bool c_enableGpuBufOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);

-
 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
 {
     const int end = forceToAdd.size();
@@ -692,7 +691,10 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv
             wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

             nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
-                                          as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
+                                          as_rvec_array(force->unpaddedArrayRef().data()),
+                                          BufferOpsUseGpu::False,
+                                          GpuBufferOpsAccumulateForce::Null,
+                                          wcycle);
         }
     }
 }
@@ -862,8 +864,12 @@ void do_force(FILE *fplog,
         ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
         ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
         ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
+    const BufferOpsUseGpu useGpuFBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA))
+        && !(flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) ?
+        BufferOpsUseGpu::True : BufferOpsUseGpu::False;

-    const bool useGpuXBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA));
+    const BufferOpsUseGpu useGpuXBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA)) ?
+        BufferOpsUseGpu::True : BufferOpsUseGpu::False;

     /* At a search step we need to start the first balancing region
      * somewhere early inside the step after communication during domain
@@ -1037,11 +1043,17 @@ void do_force(FILE *fplog,
             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
             wallcycle_stop(wcycle, ewcNS);

-            if (useGpuXBufOps)
+            if (useGpuXBufOps == BufferOpsUseGpu::True)
             {
                 nbv->atomdata_init_copy_x_to_nbat_x_gpu();
             }
-
+            // For force buffer ops, we use the condition below rather than
+            // useGpuFBufOps to ensure that init is performed even if this
+            // NS step is also a virial step (on which f buf ops are deactivated).
+            if (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA))
+            {
+                nbv->atomdata_init_add_nbat_f_to_f_gpu(wcycle);
+            }
         }
         else
@@ -1057,7 +1069,7 @@ void do_force(FILE *fplog,
             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
             Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());

-            if (bNS || !useGpuXBufOps)
+            if (bNS || (useGpuXBufOps == BufferOpsUseGpu::False))
             {
                 Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
                                           Nbnxm::AtomLocality::Local);
@@ -1121,7 +1133,7 @@ void do_force(FILE *fplog,
         {
             wallcycle_start(wcycle, ewcLAUNCH_GPU);

-            if (bNS || !useGpuXBufOps)
+            if (bNS || (useGpuXBufOps == BufferOpsUseGpu::False))
             {
                 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
                 Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
@@ -1151,13 +1163,17 @@ void do_force(FILE *fplog,
             /* launch D2H copy-back F */
             wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
             wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+
+            bool copyBackNbForce = (useGpuFBufOps == BufferOpsUseGpu::False);
+
             if (havePPDomainDecomposition(cr))
             {
                 Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                          flags, Nbnxm::AtomLocality::NonLocal);
+                                          flags, Nbnxm::AtomLocality::NonLocal, copyBackNbForce);
             }
             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                      flags, Nbnxm::AtomLocality::Local);
+                                      flags, Nbnxm::AtomLocality::Local, copyBackNbForce);
+
             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

             if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
@@ -1272,8 +1288,10 @@ void do_force(FILE *fplog,
          * communication with calculation with domain decomposition.
          */
        wallcycle_stop(wcycle, ewcFORCE);
-
-        nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f, wcycle);
+        nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f,
+                                      BufferOpsUseGpu::False,
+                                      GpuBufferOpsAccumulateForce::Null,
+                                      wcycle);

         wallcycle_start_nocount(wcycle, ewcFORCE);

@@ -1309,6 +1327,21 @@ void do_force(FILE *fplog,
                      flags, &forceOut.forceWithVirial, enerd,
                      ed, bNS);

+    // flag to specify if CPU force output is present in force
+    // buffer. For now, this is true even when useGpuPme == true
+    // (because on-GPU PME-nonbonded reduction will be added in
+    // follow-up)
+    // TODO adapt the below when on-GPU PME-nonbonded reduction is available.
+    bool useCpuPmeReduction = true;
+    bool haveCpuForces      = (ppForceWorkload->haveSpecialForces || ppForceWorkload->haveCpuListedForceWork || useCpuPmeReduction);
+    // flag to specify if forces should be accumulated in force buffer
+    // ops. For now, this is solely determined by above haveCpuForces
+    // flag, but in future developments it will also depend on
+    // e.g. whether the GPU force halo exchange is active.
+    GpuBufferOpsAccumulateForce accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) &&
+        haveCpuForces ?
+        GpuBufferOpsAccumulateForce::True :
+        GpuBufferOpsAccumulateForce::False;
+
     // Will store the amount of cycles spent waiting for the GPU that
     // will be later used in the DLB accounting.
     float cycles_wait_gpu = 0;
@@ -1335,8 +1368,16 @@ void do_force(FILE *fplog,
             wallcycle_stop(wcycle, ewcFORCE);
         }

+        if (useGpuFBufOps == BufferOpsUseGpu::True && haveCpuForces)
+        {
+            nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::NonLocal);
+        }
         nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal,
-                                      forceOut.f, wcycle);
+                                      forceOut.f, useGpuFBufOps, accumulateForce, wcycle);
+        if (useGpuFBufOps == BufferOpsUseGpu::True)
+        {
+            nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::NonLocal);
+        }

         if (fr->nbv->emulateGpu() && (flags & GMX_FORCE_VIRIAL))
         {
@@ -1357,13 +1398,18 @@ void do_force(FILE *fplog,

         if (bDoForces)
         {
+            if (useGpuFBufOps == BufferOpsUseGpu::True)
+            {
+                nbv->wait_stream_gpu(Nbnxm::AtomLocality::NonLocal);
+            }
             dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
         }
     }

     // With both nonbonded and PME offloaded a GPU on the same rank, we use
     // an alternating wait/reduction scheme.
-    bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
+    bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr) &&
+                             (useGpuFBufOps == BufferOpsUseGpu::False));
     if (alternateGpuWait)
     {
         alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &force, &forceOut.forceWithVirial, fr->fshift, enerd,
@@ -1426,6 +1472,25 @@ void do_force(FILE *fplog,
         pme_gpu_reinit_computation(fr->pmedata, wcycle);
     }

+    /* Do the nonbonded GPU (or emulation) force buffer reduction
+     * on the non-alternating path. */
+    if (bUseOrEmulGPU && !alternateGpuWait)
+    {
+
+        if (useGpuFBufOps == BufferOpsUseGpu::True && haveCpuForces)
+        {
+            nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
+        }
+        nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
+                                      forceOut.f, useGpuFBufOps, accumulateForce, wcycle);
+        if (useGpuFBufOps == BufferOpsUseGpu::True)
+        {
+            nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
+            nbv->wait_stream_gpu(Nbnxm::AtomLocality::Local);
+        }
+
+    }
+
     if (bUseGPU)
     {
         /* now clear the GPU outputs while we finish the step on the CPU */
@@ -1457,13 +1522,6 @@ void do_force(FILE *fplog,
         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
     }

-    /* Do the nonbonded GPU (or emulation) force buffer reduction
-     * on the non-alternating path. */
-    if (bUseOrEmulGPU && !alternateGpuWait)
-    {
-        nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
-                                      forceOut.f, wcycle);
-    }
     if (DOMAINDECOMP(cr))
     {
         dd_force_flop_stop(cr->dd, nrnb);
diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp
index c1271d8bba..8288a1d227 100644
--- a/src/gromacs/nbnxm/atomdata.cpp
+++ b/src/gromacs/nbnxm/atomdata.cpp
@@ -1010,6 +1010,7 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
 {
     int gridBegin = 0;
     int gridEnd   = 0;
+
     switch (locality)
     {
         case Nbnxm::AtomLocality::All:
@@ -1437,32 +1438,20 @@ static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbnxn_atomdata_t *nbat,
     }
 }

+
 /* Add the force array(s) from nbnxn_atomdata_t to f */
-void reduceForces(nbnxn_atomdata_t          *nbat,
-                  const Nbnxm::AtomLocality  locality,
-                  const Nbnxm::GridSet      &gridSet,
-                  rvec                      *f)
+template <bool useGpu>
+void reduceForces(nbnxn_atomdata_t             *nbat,
+                  const Nbnxm::AtomLocality     locality,
+                  const Nbnxm::GridSet         &gridSet,
+                  rvec                         *f,
+                  gmx_nbnxn_gpu_t              *gpu_nbv,
+                  GpuBufferOpsAccumulateForce   accumulateForce)
 {
     int a0 = 0;
     int na = 0;
-    switch (locality)
-    {
-        case Nbnxm::AtomLocality::All:
-            a0 = 0;
-            na = gridSet.numRealAtomsTotal();
-            break;
-        case Nbnxm::AtomLocality::Local:
-            a0 = 0;
-            na = gridSet.numRealAtomsLocal();
-            break;
-        case Nbnxm::AtomLocality::NonLocal:
-            a0 = gridSet.numRealAtomsLocal();
-            na = gridSet.numRealAtomsTotal() - gridSet.numRealAtomsLocal();
-            break;
-        case Nbnxm::AtomLocality::Count:
-            GMX_ASSERT(false, "Count is invalid locality specifier");
-            break;
-    }
+
+    nbnxn_get_atom_range(locality, gridSet, &a0, &na);

     if (na == 0)
     {
@@ -1470,41 +1459,67 @@ void reduceForces(nbnxn_atomdata_t *nbat,
         return;
     }

-    int nth = gmx_omp_nthreads_get(emntNonbonded);
+    if (useGpu)
+    {
+        Nbnxm::nbnxn_gpu_add_nbat_f_to_f(locality,
+                                         gpu_nbv,
+                                         a0, na,
+                                         accumulateForce);

-    if (nbat->out.size() > 1)
+    }
+    else
     {
-        if (locality != Nbnxm::AtomLocality::All)
-        {
-            gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
-        }
+        int nth = gmx_omp_nthreads_get(emntNonbonded);

-        /* Reduce the force thread output buffers into buffer 0, before adding
-         * them to the, differently ordered, "real" force buffer.
-         */
-        if (nbat->bUseTreeReduce)
+        if (nbat->out.size() > 1)
         {
-            nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat, nth);
-        }
-        else
-        {
-            nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat, nth);
+            if (locality != Nbnxm::AtomLocality::All)
+            {
+                gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
+            }
+
+            /* Reduce the force thread output buffers into buffer 0, before adding
+             * them to the, differently ordered, "real" force buffer.
+             */
+            if (nbat->bUseTreeReduce)
+            {
+                nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat, nth);
+            }
+            else
+            {
+                nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat, nth);
+            }
         }
-    }

 #pragma omp parallel for num_threads(nth) schedule(static)
-    for (int th = 0; th < nth; th++)
-    {
-        try
+        for (int th = 0; th < nth; th++)
         {
-            nbnxn_atomdata_add_nbat_f_to_f_part(gridSet, *nbat,
-                                                nbat->out[0],
-                                                a0 + ((th + 0)*na)/nth,
-                                                a0 + ((th + 1)*na)/nth,
-                                                f);
+            try
+            {
+                nbnxn_atomdata_add_nbat_f_to_f_part(gridSet, *nbat,
+                                                    nbat->out[0],
+                                                    a0 + ((th + 0)*na)/nth,
+                                                    a0 + ((th + 1)*na)/nth,
+                                                    f);
+            }
+            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
         }
-        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
     }
 }

+template
+void reduceForces<true>(nbnxn_atomdata_t             *nbat,
+                        const Nbnxm::AtomLocality     locality,
+                        const Nbnxm::GridSet         &gridSet,
+                        rvec                         *f,
+                        gmx_nbnxn_gpu_t              *gpu_nbv,
+                        GpuBufferOpsAccumulateForce   accumulateForce);
+
+template
+void reduceForces<false>(nbnxn_atomdata_t             *nbat,
+                         const Nbnxm::AtomLocality     locality,
+                         const Nbnxm::GridSet         &gridSet,
+                         rvec                         *f,
+                         gmx_nbnxn_gpu_t              *gpu_nbv,
+                         GpuBufferOpsAccumulateForce   accumulateForce);

 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
                                               rvec                   *fshift)
@@ -1524,3 +1539,29 @@ void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
         rvec_inc(fshift[s], sum);
     }
 }
+
+void nbnxn_get_atom_range(const Nbnxm::AtomLocality  atomLocality,
+                          const Nbnxm::GridSet      &gridSet,
+                          int                       *atomStart,
+                          int                       *nAtoms)
+{
+
+    switch (atomLocality)
+    {
+        case Nbnxm::AtomLocality::All:
+            *atomStart = 0;
+            *nAtoms    = gridSet.numRealAtomsTotal();
+            break;
+        case Nbnxm::AtomLocality::Local:
+            *atomStart = 0;
+            *nAtoms    = gridSet.numRealAtomsLocal();
+            break;
+        case Nbnxm::AtomLocality::NonLocal:
+            *atomStart = gridSet.numRealAtomsLocal();
+            *nAtoms    = gridSet.numRealAtomsTotal() - gridSet.numRealAtomsLocal();
+            break;
+        case Nbnxm::AtomLocality::Count:
+            GMX_ASSERT(false, "Count is invalid locality specifier");
+            break;
+    }
+}
diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h
index 4e4e0c0a55..d627732014 100644
--- a/src/gromacs/nbnxm/atomdata.h
+++ b/src/gromacs/nbnxm/atomdata.h
@@ -57,12 +57,16 @@ struct nonbonded_verlet_t;
 struct t_mdatoms;
 struct tMPI_Atomic;

+enum class BufferOpsUseGpu;
+
 namespace Nbnxm
 {
 class GridSet;
 enum class KernelType;
 }

+enum class GpuBufferOpsAccumulateForce;
+
 /* Convenience type for vector with aligned memory */
 template <class T>
 using AlignedVector = std::vector < T, gmx::AlignedAllocator < T>>;
@@ -335,13 +339,38 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet &,
                                      void *);

 //! Add the computed forces to \p f, an internal reduction might be performed as well
-void reduceForces(nbnxn_atomdata_t     *nbat,
-                  Nbnxm::AtomLocality   locality,
-                  const Nbnxm::GridSet &gridSet,
-                  rvec                 *f);
+template <bool useGpu>
+void reduceForces(nbnxn_atomdata_t             *nbat,
+                  Nbnxm::AtomLocality           locality,
+                  const Nbnxm::GridSet         &gridSet,
+                  rvec                         *f,
+                  gmx_nbnxn_gpu_t              *gpu_nbv,
+                  GpuBufferOpsAccumulateForce   accumulateForce);
+
+
+extern template
+void reduceForces<true>(nbnxn_atomdata_t             *nbat,
+                        const Nbnxm::AtomLocality     locality,
+                        const Nbnxm::GridSet         &gridSet,
+                        rvec                         *f,
+                        gmx_nbnxn_gpu_t              *gpu_nbv,
+                        GpuBufferOpsAccumulateForce   accumulateForce);
+
+extern template
+void reduceForces<false>(nbnxn_atomdata_t             *nbat,
+                         const Nbnxm::AtomLocality     locality,
+                         const Nbnxm::GridSet         &gridSet,
+                         rvec                         *f,
+                         gmx_nbnxn_gpu_t              *gpu_nbv,
+                         GpuBufferOpsAccumulateForce   accumulateForce);

 /* Add the fshift force stored in nbat to fshift */
 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
                                               rvec                   *fshift);
+
+/* Get the atom start index and number of atoms for a given locality */
+void nbnxn_get_atom_range(Nbnxm::AtomLocality    atomLocality,
+                          const Nbnxm::GridSet  &gridSet,
+                          int                   *atomStart,
+                          int                   *nAtoms);

 #endif
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh b/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh
index 172d222148..09d6e37243 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh
@@ -36,8 +36,7 @@
 /*! \internal \file
  *
  * \brief
- * CUDA kernel for GPU version of copy_rvec_to_nbat_real.
- * Converts coordinate data from rvec to nb format.
+ * CUDA kernels for GPU versions of copy_rvec_to_nbat_real and add_nbat_f_to_f.
  *
  * \author Alan Gray
  * \author Jon Vincent
@@ -137,3 +136,51 @@ __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int numColu
     }
 }
+
+/*! \brief CUDA kernel to add part of the force array(s) from nbnxn_atomdata_t to f
+ *
+ * \param[in]     fnb       Force in nbat format
+ * \param[in,out] f         Force buffer to be reduced into
+ * \param[in]     cell      Cell index mapping
+ * \param[in]     atomStart Start atom index
+ * \param[in]     nAtoms    Number of atoms
+ */
+template <bool accumulateForce>
+__global__ void
+nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__  fnb,
+                                 rvec                       *f,
+                                 const int *__restrict__     cell,
+                                 const int                   atomStart,
+                                 const int                   nAtoms);
+template <bool accumulateForce>
+__global__ void
+nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__  fnb,
+                                 rvec                       *f,
+                                 const int *__restrict__     cell,
+                                 const int                   atomStart,
+                                 const int                   nAtoms)
+{
+
+    /* map particle-level parallelism to 1D CUDA thread and block index */
+    int threadIndex = blockIdx.x*blockDim.x + threadIdx.x;
+
+    /* perform addition for each particle */
+    if (threadIndex < nAtoms)
+    {
+
+        int     i      = cell[atomStart + threadIndex];
+        float3 *f_dest = (float3 *)&f[atomStart + threadIndex][XX];
+
+        if (accumulateForce)
+        {
+            *f_dest += fnb[i];
+        }
+        else
+        {
+            *f_dest = fnb[i];
+        }
+
+    }
+    return;
+}
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index e8e6c5b5cc..e48713c7ce 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -54,6 +54,7 @@
 #include "nbnxm_cuda.h"

 #include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/vectype_ops.cuh"
 #include "gromacs/mdlib/force_flags.h"
 #include "gromacs/nbnxm/atomdata.h"
 #include "gromacs/nbnxm/gpu_common.h"
@@ -113,6 +114,10 @@
 namespace Nbnxm
 {

+//! Number of CUDA threads in a block
+//TODO Optimize this through experimentation
+constexpr static int c_bufOpsThreadsPerBlock = 128;
+
 /*! Nonbonded kernel function pointer type */
 typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
                                      const cu_nbparam_t,
@@ -640,7 +645,8 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb,
 void gpu_launch_cpyback(gmx_nbnxn_cuda_t   *nb,
                         nbnxn_atomdata_t   *nbatom,
                         const int           flags,
-                        const AtomLocality  atomLocality)
+                        const AtomLocality  atomLocality,
+                        const bool          copyBackNbForce)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
@@ -682,8 +688,11 @@ void gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb,
     }

     /* DtoH f */
-    cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
-                      (adat_len)*sizeof(*adat->f), stream);
+    if (copyBackNbForce)
+    {
+        cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
+                          (adat_len)*sizeof(*adat->f), stream);
+    }

     /* After the non-local D2H is launched the nonlocal_done event can be
        recorded which signals that the local D2H can proceed. This event is not
@@ -805,13 +814,12 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid,
     }

     /* launch kernel on GPU */
-    const int threadsPerBlock = 128;

     KernelLaunchConfig config;
-    config.blockSize[0] = threadsPerBlock;
+    config.blockSize[0] = c_bufOpsThreadsPerBlock;
     config.blockSize[1] = 1;
     config.blockSize[2] = 1;
-    config.gridSize[0]  = (grid.numCellsColumnMax()*numAtomsPerCell + threadsPerBlock - 1)/threadsPerBlock;
+    config.gridSize[0]  = (grid.numCellsColumnMax()*numAtomsPerCell + c_bufOpsThreadsPerBlock - 1)/c_bufOpsThreadsPerBlock;
     config.gridSize[1]  = numColumns;
     config.gridSize[2]  = 1;
     GMX_ASSERT(config.gridSize[0] > 0, "Can not have empty grid, early return above avoids this");
@@ -838,4 +846,126 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid,
     nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
 }

+/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */
+void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality           atomLocality,
+                               gmx_nbnxn_gpu_t             *nb,
+                               int                          atomStart,
+                               int                          nAtoms,
+                               GpuBufferOpsAccumulateForce  accumulateForce)
+{
+
+    cu_atomdata_t *adat   = nb->atdat;
+    cudaStream_t   stream = atomLocality == AtomLocality::Local ?
+        nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
+
+    /* launch kernel */
+
+    KernelLaunchConfig config;
+    config.blockSize[0]     = c_bufOpsThreadsPerBlock;
+    config.blockSize[1]     = 1;
+    config.blockSize[2]     = 1;
+    config.gridSize[0]      = ((nAtoms+1)+c_bufOpsThreadsPerBlock-1)/c_bufOpsThreadsPerBlock;
+    config.gridSize[1]      = 1;
+    config.gridSize[2]      = 1;
+    config.sharedMemorySize = 0;
+    config.stream           = stream;
+
+    auto          kernelFn = (accumulateForce == GpuBufferOpsAccumulateForce::True) ?
+        nbnxn_gpu_add_nbat_f_to_f_kernel<true> : nbnxn_gpu_add_nbat_f_to_f_kernel<false>;
+    const float3 *fPtr     = adat->f;
+    rvec         *frvec    = nb->frvec;
+    const int    *cell     = nb->cell;
+
+    const auto    kernelArgs = prepareGpuKernelArguments(kernelFn, config,
+                                                         &fPtr,
+                                                         &frvec,
+                                                         &cell,
+                                                         &atomStart,
+                                                         &nAtoms);
+
+    launchGpuKernel(kernelFn, config, nullptr, "FbufferOps", kernelArgs);
+
+}
+
+void nbnxn_launch_copy_f_to_gpu(const AtomLocality     atomLocality,
+                                const Nbnxm::GridSet  &gridSet,
+                                gmx_nbnxn_gpu_t       *nb,
+                                rvec                  *f)
+{
+    cudaStream_t stream = atomLocality == AtomLocality::Local ?
+        nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
+    bool         bDoTime = nb->bDoTime;
+    cu_timers_t *t       = nb->timers;
+
+    int          atomStart = 0, nAtoms = 0;
+
+    nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms);
+
+    if (bDoTime)
+    {
+        t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
+    }
+
+    rvec *ptrDest = reinterpret_cast<rvec *> (nb->frvec[atomStart]);
+    rvec *ptrSrc  = reinterpret_cast<rvec *> (f[atomStart]);
+    //copyToDeviceBuffer(&ptrDest, ptrSrc, 0, nAtoms,
+    //                   stream, GpuApiCallBehavior::Async, nullptr);
+    //TODO use above API call rather than direct memcpy when force has been implemented in a hostvector
+    cudaMemcpyAsync(ptrDest, ptrSrc, nAtoms*sizeof(rvec), cudaMemcpyHostToDevice,
+                    stream);
+
+    if (bDoTime)
+    {
+        t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
+    }
+
+    return;
+}
+
+void nbnxn_launch_copy_f_from_gpu(const AtomLocality     atomLocality,
+                                  const Nbnxm::GridSet  &gridSet,
+                                  gmx_nbnxn_gpu_t       *nb,
+                                  rvec                  *f)
+{
+    cudaStream_t stream = atomLocality == AtomLocality::Local ?
+        nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
+    bool         bDoTime = nb->bDoTime;
+    cu_timers_t *t       = nb->timers;
+
+    int          atomStart = 0, nAtoms = 0;
+
+    nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms);
+
+    if (bDoTime)
+    {
+        t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
+    }
+
+    rvec *ptrDest = reinterpret_cast<rvec *> (f[atomStart]);
+    rvec *ptrSrc  = reinterpret_cast<rvec *> (nb->frvec[atomStart]);
+    //copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, nAtoms,
+    //                     stream, GpuApiCallBehavior::Async, nullptr);
+    //TODO use above API call rather than direct memcpy when force has been implemented in a hostvector
+    cudaMemcpyAsync(ptrDest, ptrSrc, nAtoms*sizeof(rvec), cudaMemcpyDeviceToHost,
+                    stream);
+
+    if (bDoTime)
+    {
+        t->xf[atomLocality].nb_d2h.closeTimingRegion(stream);
+    }
+
+    return;
+}
+
+void nbnxn_wait_stream_gpu(const AtomLocality gmx_unused  atomLocality,
+                           gmx_nbnxn_gpu_t               *nb)
+{
+
+    cudaStream_t stream = atomLocality == AtomLocality::Local ?
+        nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
+
+    cudaStreamSynchronize(stream);
+
+}
+
 } // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index 6b0845219c..a69f758a27 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -61,6 +61,7 @@
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 #include "gromacs/nbnxm/gridset.h"
 #include "gromacs/nbnxm/nbnxm.h"
+#include "gromacs/nbnxm/nbnxm_gpu.h"
 #include "gromacs/nbnxm/pairlistsets.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/timing/gpu_timing.h"
@@ -503,6 +504,10 @@ gpu_init(const gmx_device_info_t *deviceInfo,
     nb->ncxy_na_alloc  = 0;
     nb->ncxy_ind       = 0;
     nb->ncxy_ind_alloc = 0;
+    nb->nfrvec         = 0;
+    nb->nfrvec_alloc   = 0;
+    nb->ncell          = 0;
+    nb->ncell_alloc    = 0;

     if (debug)
     {
@@ -956,4 +961,23 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
     return;
 }

+/* Initialization for F buffer operations on GPU. */
+void nbnxn_gpu_init_add_nbat_f_to_f(const int        *cell,
+                                    gmx_nbnxn_gpu_t  *gpu_nbv,
+                                    int               natoms_total)
+{
+
+    cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local];
+
+    reallocateDeviceBuffer(&gpu_nbv->frvec, natoms_total, &gpu_nbv->nfrvec, &gpu_nbv->nfrvec_alloc, nullptr);
+
+    if (natoms_total > 0)
+    {
+        reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, nullptr);
+        copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, stream, GpuApiCallBehavior::Async, nullptr);
+    }
+
+    return;
+}
+
 } // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
index ff8705df59..c18cd3f9a2 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -223,7 +223,19 @@ struct gmx_nbnxn_cuda_t
     int      natoms;
     //! number of atoms allocated in device buffer
     int      natoms_alloc;
-    //! x buf ops input buffer index mapping
+    //! force in rvec format
+    rvec    *frvec;
+    //! number of atoms in force buffer
+    int      nfrvec;
+    //! number of atoms allocated in force buffer
+    int      nfrvec_alloc;
+    //! f buf ops cell index mapping
+    int     *cell;
+    //! number of indices in cell buffer
+    int      ncell;
+    //! number of indices allocated in cell buffer
+    int      ncell_alloc;
+    //! array of atom indices
     int     *atomIndices;
     //! size of atom indices
     int      atomIndicesSize;
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index a81a6b400e..f7208272b5 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -135,14 +135,14 @@ void nonbonded_verlet_t::setAtomProperties(const t_mdatoms &mdatoms,
 void nonbonded_verlet_t::setCoordinates(const Nbnxm::AtomLocality        locality,
                                         const bool                       fillLocal,
                                         gmx::ArrayRef<const gmx::RVec>   x,
-                                        bool                             useGpu,
+                                        BufferOpsUseGpu                  useGpu,
                                         void                            *xPmeDevicePtr,
                                         gmx_wallcycle                   *wcycle)
 {
     wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
     wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);

-    auto fnPtr = useGpu ?
+    auto fnPtr = (useGpu == BufferOpsUseGpu::True) ?
         nbnxn_atomdata_copy_x_to_nbat_x<true> :
         nbnxn_atomdata_copy_x_to_nbat_x<false>;
@@ -160,10 +160,17 @@ gmx::ArrayRef<const int> nonbonded_verlet_t::getGridIndices() const
 }

 void
-nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality  locality,
-                                             rvec                      *f,
-                                             gmx_wallcycle             *wcycle)
+nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality    locality,
+                                             rvec                        *f,
+                                             BufferOpsUseGpu              useGpu,
+                                             GpuBufferOpsAccumulateForce  accumulateForce,
+                                             gmx_wallcycle               *wcycle)
 {
+
+    GMX_ASSERT(!((useGpu == BufferOpsUseGpu::False) &&
+                 (accumulateForce == GpuBufferOpsAccumulateForce::True)),
+               "Accumulation of force is only valid when GPU buffer ops are active");
+
     /* Skip the reduction if there was no short-range GPU work to do
      * (either NB or both NB and bonded work). */
     if (!pairlistIsSimple() && !haveGpuShortRangeWork(locality))
@@ -174,7 +181,25 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality locality
     wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
     wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);

-    reduceForces(nbat.get(), locality, pairSearch_->gridSet(), f);
+    auto fn = useGpu == BufferOpsUseGpu::True ?
+        reduceForces<true> : reduceForces<false>;
+    fn(nbat.get(), locality, pairSearch_->gridSet(), f, gpu_nbv, accumulateForce);

     wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
     wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
 }

+void
+nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(gmx_wallcycle *wcycle)
+{
+
+    wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+    wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+
+    const Nbnxm::GridSet &gridSet = pairSearch_->gridSet();
+
+    Nbnxm::nbnxn_gpu_init_add_nbat_f_to_f(gridSet.cells().data(),
+                                          gpu_nbv,
+                                          gridSet.numRealAtomsTotal());

     wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
     wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
 }
@@ -206,4 +231,26 @@ void nonbonded_verlet_t::insertNonlocalGpuDependency(const Nbnxm::InteractionLoc
 {
     Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
 }
+
+void nonbonded_verlet_t::launch_copy_f_to_gpu(rvec *f, const Nbnxm::AtomLocality locality)
+{
+    nbnxn_launch_copy_f_to_gpu(locality,
+                               pairSearch_->gridSet(),
+                               gpu_nbv,
+                               f);
+}
+
+void nonbonded_verlet_t::launch_copy_f_from_gpu(rvec *f, const Nbnxm::AtomLocality locality)
+{
+    nbnxn_launch_copy_f_from_gpu(locality,
+                                 pairSearch_->gridSet(),
+                                 gpu_nbv,
+                                 f);
+}
+
+void nonbonded_verlet_t::wait_stream_gpu(const Nbnxm::AtomLocality locality)
+{
+    nbnxn_wait_stream_gpu(locality, gpu_nbv);
+}
+
 /*! \endcond */
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 7e3869b451..1e177a57a0 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -128,6 +128,22 @@ struct t_nrnb;
 struct t_forcerec;
 struct t_inputrec;

+/*! \brief Switch for whether to use GPU for buffer ops */
+enum class BufferOpsUseGpu
+{
+    True,
+    False
+};
+
+/*! \brief Switch for whether forces should accumulate in GPU buffer ops */
+enum class GpuBufferOpsAccumulateForce
+{
+    True,  // Forces should be accumulated and format-converted
+    False, // Forces should not be accumulated, just format-converted
+    Null   // GPU buffer ops are not in use, so this value is not applicable
+};
+
+
 namespace gmx
 {
 class MDLogger;
@@ -244,7 +260,7 @@ struct nonbonded_verlet_t
         void setCoordinates(Nbnxm::AtomLocality              locality,
                             bool                             fillLocal,
                             gmx::ArrayRef<const gmx::RVec>   x,
-                            bool                             useGpu,
+                            BufferOpsUseGpu                  useGpu,
                             void                            *xPmeDevicePtr,
                             gmx_wallcycle                   *wcycle);
@@ -297,9 +313,23 @@ struct nonbonded_verlet_t
                                  gmx_wallcycle              *wcycle);

         //! Add the forces stored in nbat to f, zeros the forces in nbat
-        void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality  locality,
-                                      rvec                *f,
-                                      gmx_wallcycle       *wcycle);
+        void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality          locality,
+                                      rvec                        *f,
+                                      BufferOpsUseGpu              useGpu,
+                                      GpuBufferOpsAccumulateForce  accumulateForce,
+                                      gmx_wallcycle               *wcycle);
+
+        /*! \brief Perform initialization for F buffer operations on GPU. */
+        void atomdata_init_add_nbat_f_to_f_gpu(gmx_wallcycle *wcycle);
+
+        /*! \brief H2D transfer of force buffer */
+        void launch_copy_f_to_gpu(rvec *f, Nbnxm::AtomLocality locality);
+
+        /*! \brief D2H transfer of force buffer */
+        void launch_copy_f_from_gpu(rvec *f, Nbnxm::AtomLocality locality);
+
+        /*! \brief Host sync on device stream given by locality */
+        void wait_stream_gpu(Nbnxm::AtomLocality locality);

         //! Return the kernel setup
         const Nbnxm::KernelSetup &kernelSetup() const
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index a1e4800456..3295fdb1f9 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -54,6 +54,7 @@ struct nbnxn_atomdata_t;

 enum class GpuTaskCompletion;
+enum class GpuBufferOpsAccumulateForce;

 namespace gmx
 {
@@ -143,7 +144,8 @@ GPU_FUNC_QUALIFIER
 void gpu_launch_cpyback(gmx_nbnxn_gpu_t  gmx_unused *nb,
                         nbnxn_atomdata_t gmx_unused *nbatom,
                         int              gmx_unused  flags,
-                        AtomLocality     gmx_unused  aloc) GPU_FUNC_TERM
+                        AtomLocality     gmx_unused  aloc,
+                        const bool       gmx_unused  copyBackNbForce) GPU_FUNC_TERM

 /*! \brief Attempts to complete nonbonded GPU task.
  *
@@ -268,7 +270,40 @@ GPU_FUNC_QUALIFIER
 bool haveGpuShortRangeWork(const gmx_nbnxn_gpu_t gmx_unused *nb,
                            const Nbnxm::AtomLocality gmx_unused aLocality) GPU_FUNC_TERM_WITH_RETURN(false)

+/*! \brief Initialization for F buffer operations on GPU */
+CUDA_FUNC_QUALIFIER
+void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused *cell,
+                                    gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
+                                    int gmx_unused natoms_total) CUDA_FUNC_TERM
+
+/*! \brief F buffer operations on GPU: reduces forces in nbat format into the rvec-format buffer. */
+CUDA_FUNC_QUALIFIER
+void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality gmx_unused atomLocality,
+                               gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
+                               int gmx_unused atomStart,
+                               int gmx_unused nAtoms,
+                               GpuBufferOpsAccumulateForce gmx_unused accumulateForce) CUDA_FUNC_TERM
+
+/*! \brief Copy force buffer from CPU to GPU */
+CUDA_FUNC_QUALIFIER
+void nbnxn_launch_copy_f_to_gpu(const AtomLocality gmx_unused atomLocality,
+                                const Nbnxm::GridSet gmx_unused &gridSet,
+                                gmx_nbnxn_gpu_t gmx_unused *nb,
+                                rvec gmx_unused *f) CUDA_FUNC_TERM
+
+/*! \brief Copy force buffer from GPU to CPU */
+CUDA_FUNC_QUALIFIER
+void nbnxn_launch_copy_f_from_gpu(const AtomLocality gmx_unused atomLocality,
+                                  const Nbnxm::GridSet gmx_unused &gridSet,
+                                  gmx_nbnxn_gpu_t gmx_unused *nb,
+                                  rvec gmx_unused *f) CUDA_FUNC_TERM
+
+/*! \brief Wait for GPU stream to complete */
+CUDA_FUNC_QUALIFIER
+void nbnxn_wait_stream_gpu(const AtomLocality gmx_unused atomLocality,
+                           gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM
+

-} // namespace Nbnxm
+}      // namespace Nbnxm

 #endif
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
index 659c484287..3ac8491795 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -731,10 +731,11 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb,
  * Launch asynchronously the download of nonbonded forces from the GPU
  * (and energies/shift forces if required).
  */
-void gpu_launch_cpyback(gmx_nbnxn_ocl_t         *nb,
-                        struct nbnxn_atomdata_t *nbatom,
-                        const int                flags,
-                        const AtomLocality       aloc)
+void gpu_launch_cpyback(gmx_nbnxn_ocl_t                    *nb,
+                        struct nbnxn_atomdata_t            *nbatom,
+                        const int                           flags,
+                        const AtomLocality                  aloc,
+                        const bool               gmx_unused  copyBackNbForce)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
--
2.22.0
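
Note for reviewers (below the version trailer, so not applied by git am): the
snippet that follows is a condensed sketch of the per-locality force
buffer-ops sequence this patch adds to do_force(), shown for the local atoms
only. All names (nbv, forceOut, wcycle, useGpuFBufOps, haveCpuForces,
accumulateForce) come from the patch above; the surrounding control flow
(bUseOrEmulGPU, alternateGpuWait, domain-decomposition branches) is
simplified away.

    // H2D copy of CPU force contributions (special/listed forces or the
    // CPU PME reduction output) is only needed when such forces exist to
    // accumulate into; otherwise the kernel just format-converts.
    if (useGpuFBufOps == BufferOpsUseGpu::True && haveCpuForces)
    {
        nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
    }
    // Reduce nbat-format nonbonded forces into the rvec-format buffer; on
    // the GPU path this launches nbnxn_gpu_add_nbat_f_to_f_kernel on the
    // local stream, with accumulateForce selecting += versus plain assignment.
    nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, forceOut.f,
                                  useGpuFBufOps, accumulateForce, wcycle);
    if (useGpuFBufOps == BufferOpsUseGpu::True)
    {
        // D2H copy of the reduced forces, then block until the stream drains.
        nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
        nbv->wait_stream_gpu(Nbnxm::AtomLocality::Local);
    }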