From 257d8094c09479fe3f7bd9386013056453f26455 Mon Sep 17 00:00:00 2001
From: Alan Gray
Date: Tue, 29 Sep 2020 11:27:18 +0000
Subject: [PATCH] Redevelopment of GPU Force Reduction/Buffer Ops

Introduces a new purpose-built class for GPU force reduction, which
replaces the previous force buffer ops mechanism.

Refs #3370
---
 src/gromacs/ewald/pme_pp_comm_gpu.h          |   2 +-
 src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp   |   2 +-
 src/gromacs/ewald/pme_pp_comm_gpu_impl.cu    |   6 +-
 src/gromacs/ewald/pme_pp_comm_gpu_impl.h     |   2 +-
 src/gromacs/mdlib/CMakeLists.txt             |   1 +
 src/gromacs/mdlib/forcerec.cpp               |   2 +
 src/gromacs/mdlib/gpuforcereduction.h        | 122 +++++++++
 src/gromacs/mdlib/gpuforcereduction_impl.cpp | 105 ++++++++
 src/gromacs/mdlib/gpuforcereduction_impl.cu  | 234 ++++++++++++++++++
 src/gromacs/mdlib/gpuforcereduction_impl.cuh | 144 +++++++++++
 src/gromacs/mdlib/sim_util.cpp               | 137 ++++++----
 src/gromacs/mdrun/runner.cpp                 |  11 +
 src/gromacs/mdtypes/forcerec.h               |   6 +
 src/gromacs/nbnxm/atomdata.cpp               |  25 --
 src/gromacs/nbnxm/atomdata.h                 |  20 --
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu         |  73 +-----
 .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu       |  25 --
 src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h    |  14 --
 src/gromacs/nbnxm/nbnxm.cpp                  |  51 ++--
 src/gromacs/nbnxm/nbnxm_gpu.h                |  42 +---
 src/gromacs/nbnxm/nbnxm.h                    |  14 +-
 21 files changed, 755 insertions(+), 283 deletions(-)
 create mode 100644 src/gromacs/mdlib/gpuforcereduction.h
 create mode 100644 src/gromacs/mdlib/gpuforcereduction_impl.cpp
 create mode 100644 src/gromacs/mdlib/gpuforcereduction_impl.cu
 create mode 100644 src/gromacs/mdlib/gpuforcereduction_impl.cuh

diff --git a/src/gromacs/ewald/pme_pp_comm_gpu.h b/src/gromacs/ewald/pme_pp_comm_gpu.h
index 97accca871..a3e0239e58 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu.h
+++ b/src/gromacs/ewald/pme_pp_comm_gpu.h
@@ -103,7 +103,7 @@ public: /*!
\brief * Return pointer to event recorded when forces are ready */ - void* getForcesReadySynchronizer(); + GpuEventSynchronizer* getForcesReadySynchronizer(); private: class Impl; diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp index 0259cd0229..d19004e7ed 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp @@ -110,7 +110,7 @@ void* PmePpCommGpu::getGpuForceStagingPtr() return nullptr; } -void* PmePpCommGpu::getForcesReadySynchronizer() +GpuEventSynchronizer* PmePpCommGpu::getForcesReadySynchronizer() { GMX_ASSERT(!impl_, "A CPU stub for PME-PP GPU communication was called instead of the correct " diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu index 2c6f696ddd..acb5998b02 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu @@ -154,9 +154,9 @@ void* PmePpCommGpu::Impl::getGpuForceStagingPtr() return static_cast(d_pmeForces_); } -void* PmePpCommGpu::Impl::getForcesReadySynchronizer() +GpuEventSynchronizer* PmePpCommGpu::Impl::getForcesReadySynchronizer() { - return static_cast(&forcesReadySynchronizer_); + return &forcesReadySynchronizer_; } PmePpCommGpu::PmePpCommGpu(MPI_Comm comm, @@ -193,7 +193,7 @@ void* PmePpCommGpu::getGpuForceStagingPtr() return impl_->getGpuForceStagingPtr(); } -void* PmePpCommGpu::getForcesReadySynchronizer() +GpuEventSynchronizer* PmePpCommGpu::getForcesReadySynchronizer() { return impl_->getForcesReadySynchronizer(); } diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h index 4c95d9bccd..0630084e59 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h @@ -115,7 +115,7 @@ public: /*! \brief * Return pointer to event recorded when forces are ready */ - void* getForcesReadySynchronizer(); + GpuEventSynchronizer* getForcesReadySynchronizer(); private: //! GPU context handle (not used in CUDA) diff --git a/src/gromacs/mdlib/CMakeLists.txt b/src/gromacs/mdlib/CMakeLists.txt index c279548147..6cb02ee517 100644 --- a/src/gromacs/mdlib/CMakeLists.txt +++ b/src/gromacs/mdlib/CMakeLists.txt @@ -45,6 +45,7 @@ if(GMX_GPU_CUDA) lincs_gpu.cu settle_gpu.cu update_constrain_gpu_impl.cu + gpuforcereduction_impl.cu ) endif() diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp index 34c10d3001..6144b3a2fa 100644 --- a/src/gromacs/mdlib/forcerec.cpp +++ b/src/gromacs/mdlib/forcerec.cpp @@ -100,6 +100,8 @@ #include "gromacs/utility/smalloc.h" #include "gromacs/utility/strconvert.h" +#include "gpuforcereduction.h" + ForceHelperBuffers::ForceHelperBuffers(bool haveDirectVirialContributions) : haveDirectVirialContributions_(haveDirectVirialContributions) { diff --git a/src/gromacs/mdlib/gpuforcereduction.h b/src/gromacs/mdlib/gpuforcereduction.h new file mode 100644 index 0000000000..de8aebbf76 --- /dev/null +++ b/src/gromacs/mdlib/gpuforcereduction.h @@ -0,0 +1,122 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. 
+ * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Declares the GPU Force Reduction + * + * \author Alan Gray + * + * \ingroup module_mdlib + */ +#ifndef GMX_MDLIB_GPUFORCEREDUCTION_H +#define GMX_MDLIB_GPUFORCEREDUCTION_H + +#include "gromacs/gpu_utils/device_stream.h" +#include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gromacs/math/vectypes.h" +#include "gromacs/utility/arrayref.h" +#include "gromacs/utility/fixedcapacityvector.h" + +class GpuEventSynchronizer; + +namespace gmx +{ + +/*! \libinternal + * \brief Manages the force reduction directly in GPU memory + * + * Manages the reduction of multiple GPU force buffers into a single + * GPU force buffer. The reduction involves at least one (input/output) + * Rvec-format buffer and one (input) Nbat-format buffer, where the + * Nbat->Rvec conversion is handled internally. One additional (input) + * Rvec-format buffer is supported as optional. + */ +class GpuForceReduction +{ + +public: + /*! \brief Creates GPU force reduction object + * + * \param [in] deviceContext GPU device context + * \param [in] deviceStream Stream to use for reduction + */ + GpuForceReduction(const DeviceContext& deviceContext, const DeviceStream& deviceStream); + ~GpuForceReduction(); + + /*! \brief Register a nbnxm-format force to be reduced + * + * \param [in] forcePtr Pointer to force to be reduced + */ + void registerNbnxmForce(void* forcePtr); + + /*! \brief Register a rvec-format force to be reduced + * + * \param [in] forcePtr Pointer to force to be reduced + */ + void registerRvecForce(void* forcePtr); + + /*! \brief Add a dependency for this force reduction + * + * \param [in] dependency Dependency for this reduction + */ + void addDependency(GpuEventSynchronizer* dependency); + + /*! 
\brief Reinitialize the GPU force reduction + * + * \param [in] baseForcePtr Pointer to force to be used as a base + * \param [in] numAtoms The number of atoms + * \param [in] cell Pointer to the cell array + * \param [in] atomStart The start atom for the reduction + * \param [in] accumulate Whether reduction should be accumulated + * \param [in] completionMarker Event to be marked when launch of reduction is complete + */ + void reinit(DeviceBuffer baseForcePtr, + int numAtoms, + ArrayRef cell, + int atomStart, + bool accumulate, + GpuEventSynchronizer* completionMarker = nullptr); + + /*! \brief Execute the force reduction */ + void execute(); + +private: + class Impl; + gmx::PrivateImplPointer impl_; +}; + +} // namespace gmx + +#endif diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl.cpp new file mode 100644 index 0000000000..ff32deef96 --- /dev/null +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cpp @@ -0,0 +1,105 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief May be used to implement force reduction interfaces for non-GPU builds. 
+ * + * \author Alan Gray + * + * \ingroup module_mdlib + */ + +#include "gmxpre.h" + +#include "config.h" + +#include "gpuforcereduction.h" + +#if !GMX_GPU_CUDA + +namespace gmx +{ + +class GpuForceReduction::Impl +{ +}; + +GpuForceReduction::GpuForceReduction(const DeviceContext& /* deviceContext */, + const DeviceStream& /* deviceStream */) : + impl_(nullptr) +{ + GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation."); +} + +// NOLINTNEXTLINE readability-convert-member-functions-to-static +void GpuForceReduction::reinit(DeviceBuffer /*baseForcePtr*/, + const int /*numAtoms*/, + ArrayRef /*cell*/, + const int /*atomStart*/, + const bool /*accumulate*/, + GpuEventSynchronizer* /*completionMarker*/) +{ + GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation."); +} + +// NOLINTNEXTLINE readability-convert-member-functions-to-static +void GpuForceReduction::registerNbnxmForce(void* /* forcePtr */) +{ + GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation."); +} + +// NOLINTNEXTLINE readability-convert-member-functions-to-static +void GpuForceReduction::registerRvecForce(void* /* forcePtr */) +{ + GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation."); +} + +// NOLINTNEXTLINE readability-convert-member-functions-to-static +void GpuForceReduction::addDependency(GpuEventSynchronizer* const /* dependency */) +{ + GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation."); +} + +// NOLINTNEXTLINE readability-convert-member-functions-to-static +void GpuForceReduction::execute() +{ + GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation."); +} + +GpuForceReduction::~GpuForceReduction() = default; + +} // namespace gmx + +#endif diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cu b/src/gromacs/mdlib/gpuforcereduction_impl.cu new file mode 100644 index 0000000000..29c18daf33 --- /dev/null +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cu @@ -0,0 +1,234 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. 
Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Implements GPU Force Reduction using CUDA + * + * \author Alan Gray + * + * \ingroup module_mdlib + */ + +#include "gmxpre.h" + +#include "gpuforcereduction_impl.cuh" + +#include + +#include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/device_context.h" +#include "gromacs/gpu_utils/devicebuffer.h" +#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/gpu_utils/gpueventsynchronizer.cuh" +#include "gromacs/gpu_utils/typecasts.cuh" +#include "gromacs/gpu_utils/vectype_ops.cuh" +#include "gromacs/utility/gmxassert.h" + +#include "gpuforcereduction.h" + +namespace gmx +{ + +constexpr static int c_threadsPerBlock = 128; + +typedef struct rvecDeviceForceData rvecDeviceForceData_t; + + +template +static __global__ void reduceKernel(const float3* __restrict__ gm_nbnxmForce, + const float3* __restrict__ rvecForceToAdd, + float3* gm_fTotal, + const int* gm_cell, + const int numAtoms) +{ + + // map particle-level parallelism to 1D CUDA thread and block index + const int threadIndex = blockIdx.x * blockDim.x + threadIdx.x; + + // perform addition for each particle + if (threadIndex < numAtoms) + { + + float3* gm_fDest = &gm_fTotal[threadIndex]; + float3 temp; + + // Accumulate or set nbnxm force + if (accumulateForce) + { + temp = *gm_fDest; + temp += gm_nbnxmForce[gm_cell[threadIndex]]; + } + else + { + temp = gm_nbnxmForce[gm_cell[threadIndex]]; + } + + if (addRvecForce) + { + temp += rvecForceToAdd[threadIndex]; + } + + *gm_fDest = temp; + } + return; +} + +GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream) : + deviceContext_(deviceContext), + deviceStream_(deviceStream){}; + +void GpuForceReduction::Impl::reinit(float3* baseForcePtr, + const int numAtoms, + ArrayRef cell, + const int atomStart, + const bool accumulate, + GpuEventSynchronizer* completionMarker) +{ + GMX_ASSERT((baseForcePtr != nullptr), "Input base force for reduction has no data"); + baseForce_ = &(baseForcePtr[atomStart]); + numAtoms_ = numAtoms; + atomStart_ = atomStart; + accumulate_ = accumulate; + completionMarker_ = completionMarker; + cellInfo_.cell = cell.data(); + reallocateDeviceBuffer(&cellInfo_.d_cell, numAtoms_, &cellInfo_.cellSize, + &cellInfo_.cellSizeAlloc, deviceContext_); + copyToDeviceBuffer(&cellInfo_.d_cell, &(cellInfo_.cell[atomStart]), 0, numAtoms_, deviceStream_, + GpuApiCallBehavior::Async, nullptr); + + dependencyList_.clear(); +}; + +void GpuForceReduction::Impl::registerNbnxmForce(DeviceBuffer forcePtr) +{ + GMX_ASSERT((forcePtr != nullptr), "Input force for reduction has no data"); + nbnxmForceToAdd_ = forcePtr; +}; + +void GpuForceReduction::Impl::registerRvecForce(DeviceBuffer forcePtr) +{ + GMX_ASSERT((forcePtr != nullptr), "Input force for reduction has no data"); + rvecForceToAdd_ = forcePtr; +}; + +void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* const dependency) +{ + dependencyList_.push_back(dependency); +} + +void GpuForceReduction::Impl::execute() +{ + + if (numAtoms_ == 0) + { + return; + } + + GMX_ASSERT((nbnxmForceToAdd_ != nullptr), "Nbnxm force for reduction has no data"); + + // Enqueue wait on all dependencies passed + for (auto const synchronizer : dependencyList_) + { + 
synchronizer->enqueueWaitEvent(deviceStream_); + } + + float3* d_nbnxmForce = asFloat3(nbnxmForceToAdd_); + float3* d_rvecForceToAdd = &(asFloat3(rvecForceToAdd_)[atomStart_]); + + // Configure and launch kernel + KernelLaunchConfig config; + config.blockSize[0] = c_threadsPerBlock; + config.blockSize[1] = 1; + config.blockSize[2] = 1; + config.gridSize[0] = ((numAtoms_ + 1) + c_threadsPerBlock - 1) / c_threadsPerBlock; + config.gridSize[1] = 1; + config.gridSize[2] = 1; + config.sharedMemorySize = 0; + + auto kernelFn = (rvecForceToAdd_ != nullptr) + ? (accumulate_ ? reduceKernel : reduceKernel) + : (accumulate_ ? reduceKernel : reduceKernel); + + const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &d_nbnxmForce, &d_rvecForceToAdd, + &baseForce_, &cellInfo_.d_cell, &numAtoms_); + + launchGpuKernel(kernelFn, config, deviceStream_, nullptr, "Force Reduction", kernelArgs); + + // Mark that kernel has been launched + if (completionMarker_ != nullptr) + { + completionMarker_->markEvent(deviceStream_); + } +} + +GpuForceReduction::Impl::~Impl(){}; + +GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext, const DeviceStream& deviceStream) : + impl_(new Impl(deviceContext, deviceStream)) +{ +} + +void GpuForceReduction::registerNbnxmForce(void* forcePtr) +{ + impl_->registerNbnxmForce(reinterpret_cast>(forcePtr)); +} + +void GpuForceReduction::registerRvecForce(void* forcePtr) +{ + impl_->registerRvecForce(reinterpret_cast>(forcePtr)); +} + +void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency) +{ + impl_->addDependency(dependency); +} + +void GpuForceReduction::reinit(DeviceBuffer baseForcePtr, + const int numAtoms, + ArrayRef cell, + const int atomStart, + const bool accumulate, + GpuEventSynchronizer* completionMarker) +{ + impl_->reinit(asFloat3(baseForcePtr), numAtoms, cell, atomStart, accumulate, completionMarker); +} +void GpuForceReduction::execute() +{ + impl_->execute(); +} + +GpuForceReduction::~GpuForceReduction() = default; + +} // namespace gmx diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cuh b/src/gromacs/mdlib/gpuforcereduction_impl.cuh new file mode 100644 index 0000000000..536e3fd33b --- /dev/null +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cuh @@ -0,0 +1,144 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. 
Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Declares the GPU Force Reduction + * + * \author Alan Gray + * + * \ingroup module_mdlib + */ +#ifndef GMX_MDLIB_GPUFORCEREDUCTION_IMPL_H +#define GMX_MDLIB_GPUFORCEREDUCTION_IMPL_H + +#include "gromacs/gpu_utils/device_stream.h" +#include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gromacs/math/vectypes.h" + +#include "gpuforcereduction.h" + +namespace gmx +{ + +//! structure to hold cell information for any nbat-format forces +struct cellInfo +{ + //! cell index mapping for any nbat-format forces + const int* cell = nullptr; + //! device copy of cell index mapping for any nbat-format forces + int* d_cell = nullptr; + //! number of atoms in cell array + int cellSize = -1; + //! number of atoms allocated in cell array + int cellSizeAlloc = -1; +}; + +class GpuForceReduction::Impl +{ + +public: + /*! \brief Creates GPU force reduction object + * + * \param [in] deviceStream Stream to use for reduction + * \param [in] deviceContext GPU device context + */ + Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream); + ~Impl(); + + /*! \brief Register a nbnxm-format force to be reduced + * + * \param [in] forcePtr Pointer to force to be reduced + */ + void registerNbnxmForce(DeviceBuffer forcePtr); + + /*! \brief Register a rvec-format force to be reduced + * + * \param [in] forcePtr Pointer to force to be reduced + */ + void registerRvecForce(DeviceBuffer forcePtr); + + /*! \brief Add a dependency for this force reduction + * + * \param [in] dependency Dependency for this reduction + */ + void addDependency(GpuEventSynchronizer* const dependency); + + /*! \brief Reinitialize the GPU force reduction + * + * \param [in] baseForcePtr Pointer to force to be used as a base + * \param [in] numAtoms The number of atoms + * \param [in] cell Pointer to the cell array + * \param [in] atomStart The start atom for the reduction + * \param [in] accumulate Whether reduction should be accumulated + * \param [in] completionMarker Event to be marked when launch of reduction is complete + */ + void reinit(float3* baseForcePtr, + const int numAtoms, + ArrayRef cell, + const int atomStart, + const bool accumulate, + GpuEventSynchronizer* completionMarker = nullptr); + + /*! \brief Execute the force reduction */ + void execute(); + +private: + //! force to be used as a base for this reduction + float3* baseForce_ = nullptr; + //! starting atom + int atomStart_ = 0; + //! number of atoms + int numAtoms_ = 0; + //! whether reduction is accumulated into base force buffer + int accumulate_ = true; + //! cell information for any nbat-format forces + struct cellInfo cellInfo_; + //! GPU context object + const DeviceContext& deviceContext_; + //! list of dependencies + gmx::FixedCapacityVector dependencyList_; + //! stream to be used for this reduction + const DeviceStream& deviceStream_; + //! Nbnxm force to be added in this reduction + DeviceBuffer nbnxmForceToAdd_ = nullptr; + //! Rvec-format force to be added in this reduction + DeviceBuffer rvecForceToAdd_ = nullptr; + //! 
event to be marked when redcution launch has been completed + GpuEventSynchronizer* completionMarker_ = nullptr; +}; + +} // namespace gmx + +#endif diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index f8befd3ab7..eed67b4540 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -121,6 +121,8 @@ #include "gromacs/utility/strconvert.h" #include "gromacs/utility/sysinfo.h" +#include "gpuforcereduction.h" + using gmx::ArrayRef; using gmx::AtomLocality; using gmx::DomainLifetimeWorkload; @@ -1006,6 +1008,84 @@ static void reduceAndUpdateMuTot(DipoleData* dipoleData, } } + +/*! \brief Setup for the local and non-local GPU force reductions: + * reinitialization plus the registration of forces and dependencies. + * + * \param [in] runScheduleWork Schedule workload flag structure + * \param [in] cr Communication record object + * \param [in] fr Force record object + * \param [in] ddUsesGpuDirectCommunication Whether GPU direct communication is in use + */ +static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork, + const t_commrec* cr, + t_forcerec* fr, + bool ddUsesGpuDirectCommunication) +{ + + nonbonded_verlet_t* nbv = fr->nbv.get(); + gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu; + + // (re-)initialize local GPU force reduction + const bool accumulate = + runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr); + const int atomStart = 0; + fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit( + stateGpu->getForces(), nbv->getNumAtoms(AtomLocality::Local), nbv->getGridIndices(), + atomStart, accumulate, stateGpu->fReducedOnDevice()); + + // register forces and add dependencies + fr->gpuForceReduction[gmx::AtomLocality::Local]->registerNbnxmForce(nbv->getGpuForces()); + + if (runScheduleWork->simulationWork.useGpuPme + && (thisRankHasDuty(cr, DUTY_PME) || runScheduleWork->simulationWork.useGpuPmePpCommunication)) + { + void* forcePtr = thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata) + : // PME force buffer on same GPU + fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU + fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr); + + GpuEventSynchronizer* const pmeSynchronizer = + (thisRankHasDuty(cr, DUTY_PME) ? 
pme_gpu_get_f_ready_synchronizer(fr->pmedata) + : // PME force buffer on same GPU + fr->pmePpCommGpu->getForcesReadySynchronizer()); // buffer received from other GPU + fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(pmeSynchronizer); + } + + if ((runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr)) + && !ddUsesGpuDirectCommunication) + { + fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency( + stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::Local, true)); + } + + if (ddUsesGpuDirectCommunication) + { + fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency( + cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent()); + } + + if (havePPDomainDecomposition(cr)) + { + // (re-)initialize non-local GPU force reduction + const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork + || runScheduleWork->domainWork.haveFreeEnergyWork; + const int atomStart = dd_numHomeAtoms(*cr->dd); + fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit( + stateGpu->getForces(), nbv->getNumAtoms(AtomLocality::NonLocal), + nbv->getGridIndices(), atomStart, accumulate); + + // register forces and add dependencies + fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->registerNbnxmForce(nbv->getGpuForces()); + if (runScheduleWork->domainWork.haveCpuBondedWork || runScheduleWork->domainWork.haveFreeEnergyWork) + { + fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->addDependency( + stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::NonLocal, true)); + } + } +} + + void do_force(FILE* fplog, const t_commrec* cr, const gmx_multisim_t* ms, @@ -1191,6 +1271,8 @@ void do_force(FILE* fplog, launchPmeGpuSpread(fr->pmedata, box, stepWork, localXReadyOnDevice, lambda[efptCOUL], wcycle); } + const gmx::DomainLifetimeWorkload& domainWork = runScheduleWork->domainWork; + /* do gridding for pair search */ if (stepWork.doNeighborSearch) { @@ -1276,13 +1358,10 @@ void do_force(FILE* fplog, { nbv->atomdata_init_copy_x_to_nbat_x_gpu(); } - // For force buffer ops, we use the below conditon rather than - // useGpuFBufferOps to ensure that init is performed even if this - // NS step is also a virial step (on which f buf ops are deactivated). - if (GMX_GPU_CUDA && simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded) + + if (simulationWork.useGpuBufferOps) { - GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded"); - nbv->atomdata_init_add_nbat_f_to_f_gpu(stateGpu->fReducedOnDevice()); + setupGpuForceReductions(runScheduleWork, cr, fr, ddUsesGpuDirectCommunication); } } else if (!EI_TPI(inputrec->eI)) @@ -1306,8 +1385,6 @@ void do_force(FILE* fplog, } } - const gmx::DomainLifetimeWorkload& domainWork = runScheduleWork->domainWork; - if (simulationWork.useGpuNonbonded) { ddBalanceRegionHandler.openBeforeForceComputationGpu(); @@ -1686,8 +1763,6 @@ void do_force(FILE* fplog, if (stepWork.useGpuFBufferOps) { - gmx::FixedCapacityVector dependencyList; - // TODO: move this into DomainLifetimeWorkload, including the second part of the // condition The bonded and free energy CPU tasks can have non-local force // contributions which are a dependency for the GPU force reduction. 
@@ -1698,13 +1773,10 @@ void do_force(FILE* fplog, { stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), AtomLocality::NonLocal); - dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent( - AtomLocality::NonLocal, stepWork.useGpuFBufferOps)); } - nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::NonLocal, stateGpu->getForces(), - pme_gpu_get_device_f(fr->pmedata), dependencyList, - false, haveNonLocalForceContribInCpuBuffer); + fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->execute(); + if (!useGpuForcesHaloExchange) { // copy from GPU input for dd_move_f() @@ -1830,29 +1902,6 @@ void do_force(FILE* fplog, * on the non-alternating path. */ if (useOrEmulateGpuNb && !alternateGpuWait) { - // TODO simplify the below conditionals. Pass buffer and sync pointers at init stage rather than here. Unify getter fns for sameGPU/otherGPU cases. - void* pmeForcePtr = - stepWork.useGpuPmeFReduction - ? (thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata) - : // PME force buffer on same GPU - fr->pmePpCommGpu->getGpuForceStagingPtr()) // buffer received from other GPU - : nullptr; // PME reduction not active on GPU - - GpuEventSynchronizer* const pmeSynchronizer = - stepWork.useGpuPmeFReduction - ? (thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_f_ready_synchronizer(fr->pmedata) - : // PME force buffer on same GPU - static_cast( - fr->pmePpCommGpu->getForcesReadySynchronizer())) // buffer received from other GPU - : nullptr; // PME reduction not active on GPU - - gmx::FixedCapacityVector dependencyList; - - if (stepWork.useGpuPmeFReduction) - { - dependencyList.push_back(pmeSynchronizer); - } - gmx::ArrayRef forceWithShift = forceOut.forceWithShiftForces().force(); if (stepWork.useGpuFBufferOps) @@ -1881,16 +1930,10 @@ void do_force(FILE* fplog, auto locality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All; stateGpu->copyForcesToGpu(forceWithShift, locality); - dependencyList.push_back( - stateGpu->getForcesReadyOnDeviceEvent(locality, stepWork.useGpuFBufferOps)); - } - if (useGpuForcesHaloExchange) - { - dependencyList.push_back(cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent()); } - nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr, - dependencyList, stepWork.useGpuPmeFReduction, - haveLocalForceContribInCpuBuffer); + + fr->gpuForceReduction[gmx::AtomLocality::Local]->execute(); + // Copy forces to host if they are needed for update or if virtual sites are enabled. // If there are vsites, we need to copy forces every step to spread vsite forces on host. 
// TODO: When the output flags will be included in step workload, this copy can be combined with the diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index e774622f2d..c88d8adbea 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -95,6 +95,7 @@ #include "gromacs/mdlib/force.h" #include "gromacs/mdlib/forcerec.h" #include "gromacs/mdlib/gmx_omp_nthreads.h" +#include "gromacs/mdlib/gpuforcereduction.h" #include "gromacs/mdlib/makeconstraints.h" #include "gromacs/mdlib/md_support.h" #include "gromacs/mdlib/mdatoms.h" @@ -1672,6 +1673,16 @@ int Mdrunner::mdrunner() domdecOptions.checkBondedInteractions, fr->cginfo_mb); } + if (runScheduleWork.simulationWork.useGpuBufferOps) + { + fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique( + deviceStreamManager->context(), + deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal)); + fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique( + deviceStreamManager->context(), + deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal)); + } + std::unique_ptr stateGpu; if (gpusWereDetected && ((runScheduleWork.simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME)) diff --git a/src/gromacs/mdtypes/forcerec.h b/src/gromacs/mdtypes/forcerec.h index d5f084faed..4fa0a03f80 100644 --- a/src/gromacs/mdtypes/forcerec.h +++ b/src/gromacs/mdtypes/forcerec.h @@ -49,6 +49,8 @@ #include "gromacs/utility/basedefinitions.h" #include "gromacs/utility/real.h" +#include "locality.h" + /* Abstract type for PME that is defined only in the routine that use them. */ struct gmx_pme_t; struct nonbonded_verlet_t; @@ -64,6 +66,7 @@ namespace gmx { class DeviceStreamManager; class GpuBonded; +class GpuForceReduction; class ForceProviders; class StatePropagatorDataGpu; class PmePpCommGpu; @@ -325,6 +328,9 @@ struct t_forcerec /* For PME-PP GPU communication */ std::unique_ptr pmePpCommGpu; + + /* For GPU force reduction (on both local and non-local atoms) */ + gmx::EnumerationArray> gpuForceReduction; }; /* Important: Starting with Gromacs-4.6, the values of c6 and c12 in the nbfp array have diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp index 12003eb236..dd4eddd0ca 100644 --- a/src/gromacs/nbnxm/atomdata.cpp +++ b/src/gromacs/nbnxm/atomdata.cpp @@ -1468,31 +1468,6 @@ void reduceForces(nbnxn_atomdata_t* nbat, const gmx::AtomLocality locality, cons } } -/* Add the force array(s) from nbnxn_atomdata_t to f */ -void reduceForcesGpu(const gmx::AtomLocality locality, - DeviceBuffer totalForcesDevice, - const Nbnxm::GridSet& gridSet, - void* pmeForcesDevice, - gmx::ArrayRef dependencyList, - NbnxmGpu* gpu_nbv, - bool useGpuFPmeReduction, - bool accumulateForce) -{ - int atomsStart = 0; - int numAtoms = 0; - - nbnxn_get_atom_range(locality, gridSet, &atomsStart, &numAtoms); - - if (numAtoms == 0) - { - /* The are no atoms for this reduction, avoid some overhead */ - return; - } - - Nbnxm::nbnxn_gpu_add_nbat_f_to_f(locality, totalForcesDevice, gpu_nbv, pmeForcesDevice, dependencyList, - atomsStart, numAtoms, useGpuFPmeReduction, accumulateForce); -} - void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t& nbat, gmx::ArrayRef fshift) { gmx::ArrayRef outputBuffers = nbat.out; diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h index 007e2268ef..df3362b114 100644 --- a/src/gromacs/nbnxm/atomdata.h +++ b/src/gromacs/nbnxm/atomdata.h @@ -381,26 +381,6 @@ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet& gridSet, */ 
void reduceForces(nbnxn_atomdata_t* nbat, gmx::AtomLocality locality, const Nbnxm::GridSet& gridSet, rvec* totalForce); -/*! \brief Reduce forces on the GPU - * - * \param[in] locality If the reduction should be performed on local or non-local atoms. - * \param[out] totalForcesDevice Device buffer to accumulate resulting force. - * \param[in] gridSet The grids data. - * \param[in] pmeForcesDevice Device buffer with PME forces. - * \param[in] dependencyList List of synchronizers that represent the dependencies the reduction task needs to sync on. - * \param[in] gpu_nbv The NBNXM GPU data structure. - * \param[in] useGpuFPmeReduction Whether PME forces should be added. - * \param[in] accumulateForce Whether there are usefull data already in the total force buffer. - */ -void reduceForcesGpu(gmx::AtomLocality locality, - DeviceBuffer totalForcesDevice, - const Nbnxm::GridSet& gridSet, - void* pmeForcesDevice, - gmx::ArrayRef dependencyList, - NbnxmGpu* gpu_nbv, - bool useGpuFPmeReduction, - bool accumulateForce); - //! Add the fshift force stored in nbat to fshift void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t& nbat, gmx::ArrayRef fshift); diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 5f41fda5ef..48f02fde9e 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -894,78 +894,9 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, nbnxnInsertNonlocalGpuDependency(nb, interactionLoc); } -/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. - * - * NOTE: When the total force device buffer is reallocated and its size increases, it is cleared in - * Local stream. Hence, if accumulateForce is true, NonLocal stream should start accumulating - * forces only after Local stream already done so. - */ -void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality, - DeviceBuffer totalForcesDevice, - NbnxmGpu* nb, - void* pmeForcesDevice, - gmx::ArrayRef dependencyList, - int atomStart, - int numAtoms, - bool useGpuFPmeReduction, - bool accumulateForce) +void* getGpuForces(NbnxmGpu* nb) { - GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - GMX_ASSERT(numAtoms != 0, "Cannot call function with no atoms"); - GMX_ASSERT(totalForcesDevice, "Need a valid totalForcesDevice pointer"); - - const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality); - const DeviceStream& deviceStream = *nb->deviceStreams[iLocality]; - cu_atomdata_t* adat = nb->atdat; - - size_t gmx_used_in_debug numDependency = static_cast((useGpuFPmeReduction == true)) - + static_cast((accumulateForce == true)); - GMX_ASSERT(numDependency >= dependencyList.size(), - "Mismatching number of dependencies and call signature"); - - // Enqueue wait on all dependencies passed - for (auto const synchronizer : dependencyList) - { - synchronizer->enqueueWaitEvent(deviceStream); - } - - /* launch kernel */ - - KernelLaunchConfig config; - config.blockSize[0] = c_bufOpsThreadsPerBlock; - config.blockSize[1] = 1; - config.blockSize[2] = 1; - config.gridSize[0] = ((numAtoms + 1) + c_bufOpsThreadsPerBlock - 1) / c_bufOpsThreadsPerBlock; - config.gridSize[1] = 1; - config.gridSize[2] = 1; - config.sharedMemorySize = 0; - - auto kernelFn = accumulateForce ? nbnxn_gpu_add_nbat_f_to_f_kernel - : nbnxn_gpu_add_nbat_f_to_f_kernel; - - if (useGpuFPmeReduction) - { - GMX_ASSERT(pmeForcesDevice, "Need a valid pmeForcesDevice pointer"); - kernelFn = accumulateForce ? 
nbnxn_gpu_add_nbat_f_to_f_kernel - : nbnxn_gpu_add_nbat_f_to_f_kernel; - } - - const float3* d_fNB = adat->f; - const float3* d_fPme = static_cast(pmeForcesDevice); - float3* d_fTotal = asFloat3(totalForcesDevice); - const int* d_cell = nb->cell; - - const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &d_fNB, &d_fPme, &d_fTotal, - &d_cell, &atomStart, &numAtoms); - - launchGpuKernel(kernelFn, config, deviceStream, nullptr, "FbufferOps", kernelArgs); - - if (atomLocality == AtomLocality::Local) - { - GMX_ASSERT(nb->localFReductionDone != nullptr, - "localFReductionDone has to be a valid pointer"); - nb->localFReductionDone->markEvent(deviceStream); - } + return nb->atdat->f; } } // namespace Nbnxm diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index f754cc4795..b1d6774a26 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -321,8 +321,6 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, nb->ncxy_na_alloc = 0; nb->ncxy_ind = 0; nb->ncxy_ind_alloc = 0; - nb->ncell = 0; - nb->ncell_alloc = 0; if (debug) { @@ -659,27 +657,4 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv return; } -/* Initialization for F buffer operations on GPU. */ -void nbnxn_gpu_init_add_nbat_f_to_f(const int* cell, - NbnxmGpu* gpu_nbv, - int natoms_total, - GpuEventSynchronizer* const localReductionDone) -{ - - const DeviceStream& deviceStream = *gpu_nbv->deviceStreams[InteractionLocality::Local]; - - GMX_ASSERT(localReductionDone, "localReductionDone should be a valid pointer"); - gpu_nbv->localFReductionDone = localReductionDone; - - if (natoms_total > 0) - { - reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, - *gpu_nbv->deviceContext_); - copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, deviceStream, - GpuApiCallBehavior::Async, nullptr); - } - - return; -} - } // namespace Nbnxm diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h index 1044c10162..7c92a1abdc 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h @@ -158,12 +158,6 @@ struct NbnxmGpu bool bUseTwoStreams = false; /*! \brief atom data */ cu_atomdata_t* atdat = nullptr; - /*! \brief f buf ops cell index mapping */ - int* cell = nullptr; - /*! \brief number of indices in cell buffer */ - int ncell = 0; - /*! \brief number of indices allocated in cell buffer */ - int ncell_alloc = 0; /*! \brief array of atom indices */ int* atomIndices = nullptr; /*! \brief size of atom indices */ @@ -214,14 +208,6 @@ struct NbnxmGpu * will be true. */ gmx::EnumerationArray haveWork = { { false } }; - /*! \brief Pointer to event synchronizer triggered when the local - * GPU buffer ops / reduction is complete - * - * \note That the synchronizer is managed outside of this module - * in StatePropagatorDataGpu. - */ - GpuEventSynchronizer* localFReductionDone = nullptr; - /*! \brief Event triggered when non-local coordinate buffer * has been copied from device to host. 
*/ GpuEventSynchronizer* xNonLocalCopyD2HDone = nullptr; diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index 67890a9606..76e2eead9f 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -179,48 +179,27 @@ void nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const gmx::AtomLocality local wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); } -void nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const gmx::AtomLocality locality, - DeviceBuffer totalForcesDevice, - void* forcesPmeDevice, - gmx::ArrayRef dependencyList, - bool useGpuFPmeReduction, - bool accumulateForce) +int nonbonded_verlet_t::getNumAtoms(const gmx::AtomLocality locality) { - - GMX_ASSERT((useGpuFPmeReduction == (forcesPmeDevice != nullptr)), - "GPU PME force reduction is only valid when a non-null GPU PME force pointer is " - "available"); - - /* Skip the reduction if there was no short-range GPU work to do - * (either NB or both NB and bonded work). */ - if (!pairlistIsSimple() && !Nbnxm::haveGpuShortRangeWork(gpu_nbv, locality)) + int numAtoms = 0; + switch (locality) { - return; + case gmx::AtomLocality::All: numAtoms = pairSearch_->gridSet().numRealAtomsTotal(); break; + case gmx::AtomLocality::Local: numAtoms = pairSearch_->gridSet().numRealAtomsLocal(); break; + case gmx::AtomLocality::NonLocal: + numAtoms = pairSearch_->gridSet().numRealAtomsTotal() + - pairSearch_->gridSet().numRealAtomsLocal(); + break; + case gmx::AtomLocality::Count: + GMX_ASSERT(false, "Count is invalid locality specifier"); + break; } - - wallcycle_start(wcycle_, ewcLAUNCH_GPU); - wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS); - - reduceForcesGpu(locality, totalForcesDevice, pairSearch_->gridSet(), forcesPmeDevice, - dependencyList, gpu_nbv, useGpuFPmeReduction, accumulateForce); - - wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS); - wallcycle_stop(wcycle_, ewcLAUNCH_GPU); + return numAtoms; } -void nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* const localReductionDone) +void* nonbonded_verlet_t::getGpuForces() { - - wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); - wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS); - - const Nbnxm::GridSet& gridSet = pairSearch_->gridSet(); - - Nbnxm::nbnxn_gpu_init_add_nbat_f_to_f(gridSet.cells().data(), gpu_nbv, - gridSet.numRealAtomsTotal(), localReductionDone); - - wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS); - wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); + return Nbnxm::getGpuForces(gpu_nbv); } real nonbonded_verlet_t::pairlistInnerRadius() const diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index bcb3b34c33..2596350e52 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -358,14 +358,18 @@ public: bool useGpuFPmeReduction, bool accumulateForce); - /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. + /*! \brief Get the number of atoms for a given locality * - * \param localReductionDone Pointer to an event synchronizer that marks the completion of the local f buffer ops kernel. + * \param [in] locality Local or non-local + * \returns The number of atoms for given locality */ - void atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* localReductionDone); + int getNumAtoms(gmx::AtomLocality locality); - /*! \brief return GPU pointer to f in rvec format */ - void* get_gpu_frvec(); + /*! \brief Get the pointer to the GPU nonbonded force buffer + * + * \returns A pointer to the force buffer in GPU memory + */ + void* getGpuForces(); //! 
Return the kernel setup const Nbnxm::KernelSetup& kernelSetup() const { return kernelSetup_; } diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index eace699386..00e7ae11f5 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -340,45 +340,19 @@ GPU_FUNC_QUALIFIER bool haveGpuShortRangeWork(const NbnxmGpu gmx_unused* nb, gmx::AtomLocality gmx_unused aLocality) GPU_FUNC_TERM_WITH_RETURN(false); -/*! \brief Initialization for F buffer operations on GPU */ -CUDA_FUNC_QUALIFIER -void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused* cell, - NbnxmGpu gmx_unused* gpu_nbv, - int gmx_unused natoms_total, - GpuEventSynchronizer gmx_unused* localReductionDone) CUDA_FUNC_TERM; - -/*! \brief Force buffer operations on GPU. - * - * Transforms non-bonded forces into plain rvec format and add all the force components to the total - * force buffer - * - * \param[in] atomLocality If the reduction should be performed on local or non-local atoms. - * \param[in] totalForcesDevice Device buffer to accumulate resulting force. - * \param[in] gpu_nbv The NBNXM GPU data structure. - * \param[in] pmeForcesDevice Device buffer with PME forces. - * \param[in] dependencyList List of synchronizers that represent the dependencies the reduction task needs to sync on. - * \param[in] atomStart Index of the first atom to reduce forces for. - * \param[in] numAtoms Number of atoms to reduce forces for. - * \param[in] useGpuFPmeReduction Whether PME forces should be added. - * \param[in] accumulateForce Whether there are usefull data already in the total force buffer. - * - */ -CUDA_FUNC_QUALIFIER -void nbnxn_gpu_add_nbat_f_to_f(gmx::AtomLocality gmx_unused atomLocality, - DeviceBuffer gmx_unused totalForcesDevice, - NbnxmGpu gmx_unused* gpu_nbv, - void gmx_unused* pmeForcesDevice, - gmx::ArrayRef gmx_unused dependencyList, - int gmx_unused atomStart, - int gmx_unused numAtoms, - bool gmx_unused useGpuFPmeReduction, - bool gmx_unused accumulateForce) CUDA_FUNC_TERM; - /*! \brief sync CPU thread on coordinate copy to device * \param[in] nb The nonbonded data GPU structure */ CUDA_FUNC_QUALIFIER void nbnxn_wait_x_on_device(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM; +/*! \brief Get the pointer to the GPU nonbonded force buffer + * + * \param[in] nb The nonbonded data GPU structure + * \returns A pointer to the force buffer in GPU memory + */ +CUDA_FUNC_QUALIFIER +void* getGpuForces(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr); + } // namespace Nbnxm #endif -- 2.22.0
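Usage sketch (reviewer note, not part of the patch): the snippet below illustrates how a caller
is expected to drive the new GpuForceReduction class, following the pattern of
setupGpuForceReductions() in sim_util.cpp and the construction in runner.cpp above. Variable
names such as deviceContext, localStream, stateGpu, nbv, pmedata, haveCpuForceWork and
reductionDoneEvent are local stand-ins for this illustration only.

    // One reduction object per atom locality, bound to a device context and stream
    // (runner.cpp constructs these only when useGpuBufferOps is active).
    auto gpuForceReduction =
            std::make_unique<gmx::GpuForceReduction>(deviceContext, localStream);

    // Register the nbat-format nonbonded force buffer; the reduction kernel performs
    // the nbat->rvec conversion internally using the cell mapping passed to reinit().
    gpuForceReduction->registerNbnxmForce(nbv->getGpuForces());

    // Optionally register one rvec-format buffer (e.g. PME forces) plus the event
    // signalling that this buffer is ready to be read.
    gpuForceReduction->registerRvecForce(pme_gpu_get_device_f(pmedata));
    gpuForceReduction->addDependency(pme_gpu_get_f_ready_synchronizer(pmedata));

    // (Re-)initialize on search steps, when the atom set and cell mapping change.
    const int  atomStart  = 0;
    const bool accumulate = haveCpuForceWork; // add into, rather than overwrite, the base buffer
    gpuForceReduction->reinit(stateGpu->getForces(), nbv->getNumAtoms(gmx::AtomLocality::Local),
                              nbv->getGridIndices(), atomStart, accumulate, reductionDoneEvent);

    // Every step: enqueue waits on the registered dependencies, then launch the
    // reduction kernel on the stream given at construction.
    gpuForceReduction->execute();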