From 0a4ca2c479237e915d96bcff8500c282a6ec3050 Mon Sep 17 00:00:00 2001 From: Alan Gray Date: Fri, 10 May 2019 03:28:58 -0700 Subject: [PATCH] PME reduction for CUDA F buffer operations Enable with GMX_USE_GPU_BUFFER_OPS env variable. Provides functionality to perform reduction of PME forces in the F buffer ops kernel. Currently active when a single GPU performs both PME and PP (multi-GPU support will follow in a patch which performs PME/PP comms directly between GPUs). When active, the Device->Host copy of PME forces and the CPU-side reduction are disabled. Implements part of #3029, refs #2817 Change-Id: I3e66b6919c1e86bf0bed42b74136f8694626910b --- src/gromacs/ewald/pme.h | 27 ++++++- src/gromacs/ewald/pme_gpu.cpp | 41 +++++++++-- src/gromacs/ewald/pme_gpu_internal.cpp | 36 +++++++++- src/gromacs/ewald/pme_gpu_internal.h | 21 +++++- src/gromacs/ewald/pme_gpu_types_host_impl.h | 2 + src/gromacs/ewald/pme_only.cpp | 2 +- src/gromacs/ewald/tests/pmetestcommon.cpp | 2 +- src/gromacs/mdlib/sim_util.cpp | 71 +++++++++++-------- src/gromacs/nbnxm/atomdata.cpp | 29 +++++--- src/gromacs/nbnxm/atomdata.h | 19 +++-- .../nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh | 35 +++++---- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 49 +++++++++---- src/gromacs/nbnxm/nbnxm.cpp | 34 +++++++-- src/gromacs/nbnxm/nbnxm.h | 31 +++++--- src/gromacs/nbnxm/nbnxm_gpu.h | 6 +- 15 files changed, 300 insertions(+), 105 deletions(-) diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index 29cc106554..a660390135 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -73,6 +73,7 @@ struct NumPmeDomains; enum class GpuTaskCompletion; class PmeGpuProgram; +class GpuEventSynchronizer; //! Convenience name. using PmeGpuProgramHandle = const PmeGpuProgram *; @@ -382,10 +383,12 @@ GPU_FUNC_QUALIFIER void pme_gpu_launch_complex_transforms(gmx_pme_t *GPU_F * \param[in] forceTreatment Tells how data should be treated. The gathering kernel either stores * the output reciprocal forces into the host array, or copies its contents to the GPU first * and accumulates. The reduction is non-atomic. + * \param[in] useGpuFPmeReduction Whether PME forces are reduced on GPU */ GPU_FUNC_QUALIFIER void pme_gpu_launch_gather(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme), gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle), - PmeForceOutputHandling GPU_FUNC_ARGUMENT(forceTreatment)) GPU_FUNC_TERM; + PmeForceOutputHandling GPU_FUNC_ARGUMENT(forceTreatment), + bool GPU_FUNC_ARGUMENT(useGpuFPmeReduction)) GPU_FUNC_TERM; /*! \brief * Attempts to complete PME GPU tasks. @@ -426,13 +429,15 @@ GPU_FUNC_QUALIFIER bool * \param[in] wcycle The wallclock counter. * \param[out] forceWithVirial The output force and virial * \param[out] enerd The output energies + * \param[in] useGpuFPmeReduction Whether PME forces are reduced on GPU */ GPU_FUNC_QUALIFIER void pme_gpu_wait_and_reduce(gmx_pme_t *GPU_FUNC_ARGUMENT(pme), int GPU_FUNC_ARGUMENT(flags), gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle), gmx::ForceWithVirial *GPU_FUNC_ARGUMENT(forceWithVirial), - gmx_enerdata_t *GPU_FUNC_ARGUMENT(enerd)) GPU_FUNC_TERM; + gmx_enerdata_t *GPU_FUNC_ARGUMENT(enerd), + bool GPU_FUNC_ARGUMENT(useGpuFPmeReduction)) GPU_FUNC_TERM; /*! \brief * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing. @@ -451,6 +456,22 @@ GPU_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t *GPU_FUNC_ARG gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM; -/*! \brief Get pointer to device copy of coordinate data. */ +/*!
\brief Get pointer to device copy of coordinate data. + * \param[in] pme The PME data structure. + * \returns Pointer to coordinate data + */ GPU_FUNC_QUALIFIER void *pme_gpu_get_device_x(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr); + +/*! \brief Get pointer to device copy of force data. + * \param[in] pme The PME data structure. + * \returns Pointer to force data + */ +GPU_FUNC_QUALIFIER void *pme_gpu_get_device_f(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr); + +/*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion + * \param[in] pme The PME data structure. + * \returns Pointer to synchronizer + */ +GPU_FUNC_QUALIFIER GpuEventSynchronizer *pme_gpu_get_f_ready_synchronizer(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr); + #endif diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp index 07d15cdf1a..cd1ba956c4 100644 --- a/src/gromacs/ewald/pme_gpu.cpp +++ b/src/gromacs/ewald/pme_gpu.cpp @@ -249,7 +249,8 @@ void pme_gpu_launch_complex_transforms(gmx_pme_t *pme, void pme_gpu_launch_gather(const gmx_pme_t *pme, gmx_wallcycle gmx_unused *wcycle, - PmeForceOutputHandling forceTreatment) + PmeForceOutputHandling forceTreatment, + bool useGpuFPmeReduction) { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); @@ -262,7 +263,7 @@ void pme_gpu_launch_gather(const gmx_pme_t *pme, wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); const unsigned int gridIndex = 0; real *fftgrid = pme->fftgrid[gridIndex]; - pme_gpu_gather(pme->gpu, forceTreatment, reinterpret_cast<float *>(fftgrid)); + pme_gpu_gather(pme->gpu, forceTreatment, reinterpret_cast<float *>(fftgrid), useGpuFPmeReduction); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); wallcycle_stop(wcycle, ewcLAUNCH_GPU); } @@ -286,7 +287,8 @@ static void pme_gpu_reduce_outputs(const int flags, const PmeOutput &output, gmx_wallcycle *wcycle, gmx::ForceWithVirial *forceWithVirial, - gmx_enerdata_t *enerd) + gmx_enerdata_t *enerd, + bool useGpuFPmeReduction) { wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION); GMX_ASSERT(forceWithVirial, "Invalid force pointer"); @@ -297,7 +299,10 @@ static void pme_gpu_reduce_outputs(const int flags, forceWithVirial->addVirialContribution(output.coulombVirial_); enerd->term[F_COUL_RECIP] += output.coulombEnergy_; } - sum_forces(forceWithVirial->force_, output.forces_); + if (!useGpuFPmeReduction) + { + sum_forces(forceWithVirial->force_, output.forces_); + } wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION); } @@ -344,7 +349,7 @@ bool pme_gpu_try_finish_task(gmx_pme_t *pme, PmeOutput output = pme_gpu_getOutput(*pme, flags); wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER); - pme_gpu_reduce_outputs(flags, output, wcycle, forceWithVirial, enerd); + pme_gpu_reduce_outputs(flags, output, wcycle, forceWithVirial, enerd, false); return true; } @@ -357,7 +362,9 @@ PmeOutput pme_gpu_wait_finish_task(gmx_pme_t *pme, GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER); + // Synchronize the whole PME stream at once, including D2H result transfers.
+ // TODO: make this sync conditional with useGpuFPmeReduction to wait only for virial/energies pme_gpu_synchronize(pme->gpu); pme_gpu_update_timings(pme->gpu); @@ -371,10 +378,11 @@ void pme_gpu_wait_and_reduce(gmx_pme_t *pme, const int flags, gmx_wallcycle *wcycle, gmx::ForceWithVirial *forceWithVirial, - gmx_enerdata_t *enerd) + gmx_enerdata_t *enerd, + bool useGpuFPmeReduction) { PmeOutput output = pme_gpu_wait_finish_task(pme, flags, wcycle); - pme_gpu_reduce_outputs(flags, output, wcycle, forceWithVirial, enerd); + pme_gpu_reduce_outputs(flags, output, wcycle, forceWithVirial, enerd, useGpuFPmeReduction); } void pme_gpu_reinit_computation(const gmx_pme_t *pme, @@ -400,3 +408,22 @@ void *pme_gpu_get_device_x(const gmx_pme_t *pme) } return pme_gpu_get_kernelparam_coordinates(pme->gpu); } + +void *pme_gpu_get_device_f(const gmx_pme_t *pme) +{ + if (!pme || !pme_gpu_active(pme)) + { + return nullptr; + } + return pme_gpu_get_kernelparam_forces(pme->gpu); +} + +GpuEventSynchronizer * pme_gpu_get_f_ready_synchronizer(const gmx_pme_t *pme) +{ + if (!pme || !pme_gpu_active(pme)) + { + return nullptr; + } + + return pme_gpu_get_forces_ready_synchronizer(pme->gpu); +} diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index cad028b557..70c74c900c 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -239,7 +239,6 @@ void pme_gpu_copy_input_coordinates(const PmeGpu *pmeGpu, const rvec *h_coordina // FIXME: sync required since the copied data will be used by PP stream when using single GPU for both // Remove after adding the required event-based sync between the above H2D and the transform kernel pme_gpu_synchronize(pmeGpu); - #endif } @@ -1207,7 +1206,8 @@ void pme_gpu_solve(const PmeGpu *pmeGpu, t_complex *h_grid, void pme_gpu_gather(PmeGpu *pmeGpu, PmeForceOutputHandling forceTreatment, - const float *h_grid + const float *h_grid, + bool useGpuFPmeReduction ) { /* Copying the input CPU forces for reduction */ @@ -1269,7 +1269,14 @@ void pme_gpu_gather(PmeGpu *pmeGpu, launchGpuKernel(kernelPtr, config, timingEvent, "PME gather", kernelArgs); pme_gpu_stop_timing(pmeGpu, timingId); - pme_gpu_copy_output_forces(pmeGpu); + if (useGpuFPmeReduction) + { + pmeGpu->archSpecific->pmeForcesReady.markEvent(pmeGpu->archSpecific->pmeStream); + } + else + { + pme_gpu_copy_output_forces(pmeGpu); + } } void * pme_gpu_get_kernelparam_coordinates(const PmeGpu *pmeGpu) @@ -1282,5 +1289,28 @@ void * pme_gpu_get_kernelparam_coordinates(const PmeGpu *pmeGpu) { return nullptr; } +} +void * pme_gpu_get_kernelparam_forces(const PmeGpu *pmeGpu) +{ + if (pmeGpu && pmeGpu->kernelParams) + { + return pmeGpu->kernelParams->atoms.d_forces; + } + else + { + return nullptr; + } +} + +GpuEventSynchronizer *pme_gpu_get_forces_ready_synchronizer(const PmeGpu *pmeGpu) +{ + if (pmeGpu && pmeGpu->kernelParams) + { + return &pmeGpu->archSpecific->pmeForcesReady; + } + else + { + return nullptr; + } } diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h index e22fe64592..19d243af41 100644 --- a/src/gromacs/ewald/pme_gpu_internal.h +++ b/src/gromacs/ewald/pme_gpu_internal.h @@ -437,14 +437,31 @@ GPU_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu) * \param[in] forceTreatment Tells how data in h_forces should be treated. * TODO: determine efficiency/balance of host/device-side reductions. 
* \param[in] h_grid The host-side grid buffer (used only in testing mode) + * \param[in] useGpuFPmeReduction Whether forces are reduced on GPU */ GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu), PmeForceOutputHandling GPU_FUNC_ARGUMENT(forceTreatment), - const float *GPU_FUNC_ARGUMENT(h_grid)) GPU_FUNC_TERM; + const float *GPU_FUNC_ARGUMENT(h_grid), + bool GPU_FUNC_ARGUMENT(useGpuFPmeReduction)) GPU_FUNC_TERM; -/*! \brief Return pointer to device copy of coordinate data. */ +/*! \brief Return pointer to device copy of coordinate data. + * \param[in] pmeGpu The PME GPU structure. + * \returns Pointer to coordinate data + */ GPU_FUNC_QUALIFIER void * pme_gpu_get_kernelparam_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr); +/*! \brief Return pointer to device copy of force data. + * \param[in] pmeGpu The PME GPU structure. + * \returns Pointer to force data + */ +GPU_FUNC_QUALIFIER void * pme_gpu_get_kernelparam_forces(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr); + +/*! \brief Return pointer to the sync object triggered after the PME force calculation completion + * \param[in] pmeGpu The PME GPU structure. + * \returns Pointer to sync object + */ +GPU_FUNC_QUALIFIER GpuEventSynchronizer *pme_gpu_get_forces_ready_synchronizer(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr); + /* The inlined convenience PME GPU status getters */ /*! \libinternal \brief diff --git a/src/gromacs/ewald/pme_gpu_types_host_impl.h b/src/gromacs/ewald/pme_gpu_types_host_impl.h index 6949bde60f..be865c8778 100644 --- a/src/gromacs/ewald/pme_gpu_types_host_impl.h +++ b/src/gromacs/ewald/pme_gpu_types_host_impl.h @@ -79,6 +79,8 @@ struct PmeGpuSpecific Context context; /* Synchronization events */ + /*! \brief Triggered after the PME Force Calculations have been completed */ + GpuEventSynchronizer pmeForcesReady; /*! \brief Triggered after the grid has been copied to the host (after the spreading stage). 
*/ GpuEventSynchronizer syncSpreadGridD2H; diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp index 6f8ebe1723..e3e948a856 100644 --- a/src/gromacs/ewald/pme_only.cpp +++ b/src/gromacs/ewald/pme_only.cpp @@ -626,7 +626,7 @@ int gmx_pmeonly(struct gmx_pme_t *pme, pme_gpu_prepare_computation(pme, boxChanged, box, wcycle, pmeFlags); pme_gpu_launch_spread(pme, as_rvec_array(pme_pp->x.data()), wcycle); pme_gpu_launch_complex_transforms(pme, wcycle); - pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set); + pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set, false); output = pme_gpu_wait_finish_task(pme, pmeFlags, wcycle); pme_gpu_reinit_computation(pme, wcycle); } diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp index 55515294d2..763d79d682 100644 --- a/src/gromacs/ewald/tests/pmetestcommon.cpp +++ b/src/gromacs/ewald/tests/pmetestcommon.cpp @@ -418,7 +418,7 @@ void pmePerformGather(gmx_pme_t *pme, CodePath mode, { std::copy(std::begin(forces), std::end(forces), std::begin(output.forces_)); } - pme_gpu_gather(pme->gpu, inputTreatment, reinterpret_cast<float *>(fftgrid)); + pme_gpu_gather(pme->gpu, inputTreatment, reinterpret_cast<float *>(fftgrid), false); std::copy(std::begin(output.forces_), std::end(output.forces_), std::begin(forces)); } break; diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 11963c0de6..0c53b40c4d 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -623,12 +623,14 @@ static inline void launchPmeGpuSpread(gmx_pme_t *pmedata, * * \param[in] pmedata The PME structure * \param[in] wcycle The wallcycle structure + * \param[in] useGpuFPmeReduction Whether forces will be reduced on GPU */ static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata, - gmx_wallcycle_t wcycle) + gmx_wallcycle_t wcycle, + bool useGpuFPmeReduction) { pme_gpu_launch_complex_transforms(pmedata, wcycle); - pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set); + pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set, useGpuFPmeReduction); } /*! \brief @@ -695,9 +697,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, - as_rvec_array(force->unpaddedArrayRef().data()), - BufferOpsUseGpu::False, - GpuBufferOpsAccumulateForce::Null); + as_rvec_array(force->unpaddedArrayRef().data())); } } } @@ -907,12 +907,18 @@ void do_force(FILE *fplog, ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) | ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) | ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0); - const BufferOpsUseGpu useGpuFBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA)) - && !(flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) ? - BufferOpsUseGpu::True : BufferOpsUseGpu::False; + // Switches on whether to use GPU for position and force buffer operations + // TODO consider all possible combinations of triggers, and how to combine optimally in each case. const BufferOpsUseGpu useGpuXBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA)) ?
BufferOpsUseGpu::True : BufferOpsUseGpu::False;; + // GPU Force buffer ops are disabled on virial steps, because the virial calc is not yet ported to GPU + const BufferOpsUseGpu useGpuFBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA)) + && !(flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) ? + BufferOpsUseGpu::True : BufferOpsUseGpu::False; + // TODO: move / add this flag to the internal PME GPU data structures + const bool useGpuFPmeReduction = (useGpuFBufOps == BufferOpsUseGpu::True) && + thisRankHasDuty(cr, DUTY_PME) && useGpuPme; // only supported if this rank is performing PME on the GPU /* At a search step we need to start the first balancing region * somewhere early inside the step after communication during domain @@ -1141,7 +1147,7 @@ // X copy/transform to allow overlap as well as after the GPU NB // launch to avoid FFT launch overhead hijacking the CPU and delaying // the nonbonded kernel. - launchPmeGpuFftAndGather(fr->pmedata, wcycle); + launchPmeGpuFftAndGather(fr->pmedata, wcycle, useGpuFPmeReduction); } /* Communicate coordinates and sum dipole if necessary + @@ -1327,9 +1333,7 @@ void do_force(FILE *fplog, * communication with calculation with domain decomposition. */ wallcycle_stop(wcycle, ewcFORCE); - nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f(), - BufferOpsUseGpu::False, - GpuBufferOpsAccumulateForce::Null); + nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f()); wallcycle_start_nocount(wcycle, ewcFORCE); @@ -1365,20 +1369,8 @@ void do_force(FILE *fplog, flags, &forceOut.forceWithVirial(), enerd, ed, bNS); - // flag to specify if CPU force output is preset in force - // buffer. For now, this is true even when useGpuPme == true - // (because on-GPU PME-nonbonded reduction will be added in - // follow-up) - // TODO adapt the below when on-GPU PME-nonbonded reduction is available. - bool useCpuPmeReduction = true; - bool haveCpuForces = (ppForceWorkload->haveSpecialForces || ppForceWorkload->haveCpuListedForceWork || useCpuPmeReduction); - // flag to specify if forces should be accumulated in force buffer - // ops. For now, this is solely determined by above haveCpuForces - // flag, but in future developments it will also depend on - // e.g. whether the GPU force halo exchange is active. - GpuBufferOpsAccumulateForce accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) && - haveCpuForces ? GpuBufferOpsAccumulateForce::True : - GpuBufferOpsAccumulateForce::False; + bool useCpuFPmeReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuFPmeReduction; + bool haveCpuForces = (ppForceWorkload->haveSpecialForces || ppForceWorkload->haveCpuListedForceWork || useCpuFPmeReduction); // Will store the amount of cycles spent waiting for the GPU that // will be later used in the DLB accounting. @@ -1410,8 +1402,15 @@ void do_force(FILE *fplog, { nbv->launch_copy_f_to_gpu(forceOut.f(), Nbnxm::AtomLocality::NonLocal); } + + // flag to specify if forces should be accumulated in force buffer + // ops. For non-local part, this just depends on whether CPU forces are present.
+ bool accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) && haveCpuForces; nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal, - forceOut.f(), useGpuFBufOps, accumulateForce); + forceOut.f(), pme_gpu_get_device_f(fr->pmedata), + pme_gpu_get_f_ready_synchronizer(fr->pmedata), + useGpuFBufOps, useGpuFPmeReduction, accumulateForce); + if (useGpuFBufOps == BufferOpsUseGpu::True) { nbv->launch_copy_f_from_gpu(forceOut.f(), Nbnxm::AtomLocality::NonLocal); @@ -1456,7 +1455,7 @@ void do_force(FILE *fplog, if (!alternateGpuWait && useGpuPme) { - pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceOut.forceWithVirial(), enerd); + pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceOut.forceWithVirial(), enerd, useGpuFPmeReduction); } /* Wait for local GPU NB outputs on the non-alternating wait path */ @@ -1515,12 +1514,22 @@ void do_force(FILE *fplog, // - wait for force reduction does not need to block host (at least not here, it's sufficient to wait // before the next CPU task that consumes the forces: vsite spread or update) // - if (useGpuFBufOps == BufferOpsUseGpu::True && haveCpuForces) + if (useGpuFBufOps == BufferOpsUseGpu::True && (haveCpuForces || DOMAINDECOMP(cr))) { nbv->launch_copy_f_to_gpu(forceOut.f(), Nbnxm::AtomLocality::Local); } + // flag to specify if forces should be accumulated in force + // buffer ops. For local part, this depends on whether CPU + // forces are present, or if DD is active (in which case the + // halo exchange has resulted in contributions from the + // non-local part). + bool accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) && + (haveCpuForces || DOMAINDECOMP(cr)); nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, - forceOut.f(), useGpuFBufOps, accumulateForce); + forceOut.f(), pme_gpu_get_device_f(fr->pmedata), + pme_gpu_get_f_ready_synchronizer(fr->pmedata), + useGpuFBufOps, useGpuFPmeReduction, accumulateForce); + if (useGpuFBufOps == BufferOpsUseGpu::True) { nbv->launch_copy_f_from_gpu(forceOut.f(), Nbnxm::AtomLocality::Local); diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp index 8288a1d227..dd0ccb2ce7 100644 --- a/src/gromacs/nbnxm/atomdata.cpp +++ b/src/gromacs/nbnxm/atomdata.cpp @@ -1441,12 +1441,15 @@ static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbnxn_atomdata_t *nbat, /* Add the force array(s) from nbnxn_atomdata_t to f */ template <bool useGpu> -void reduceForces(nbnxn_atomdata_t *nbat, - const Nbnxm::AtomLocality locality, - const Nbnxm::GridSet &gridSet, - rvec *f, - gmx_nbnxn_gpu_t *gpu_nbv, - GpuBufferOpsAccumulateForce accumulateForce) +void reduceForces(nbnxn_atomdata_t *nbat, + const Nbnxm::AtomLocality locality, + const Nbnxm::GridSet &gridSet, + rvec *f, + void *pmeFDeviceBuffer, + GpuEventSynchronizer *pmeForcesReady, + gmx_nbnxn_gpu_t *gpu_nbv, + bool useGpuFPmeReduction, + bool accumulateForce) { int a0 = 0; int na = 0; @@ -1463,9 +1466,11 @@ void reduceForces(nbnxn_atomdata_t *nbat, { Nbnxm::nbnxn_gpu_add_nbat_f_to_f(locality, gpu_nbv, + pmeFDeviceBuffer, + pmeForcesReady, a0, na, + useGpuFPmeReduction, accumulateForce); - } else { @@ -1510,16 +1515,22 @@ void reduceForces<true>(nbnxn_atomdata_t *nbat, const Nbnxm::AtomLocality locality, const Nbnxm::GridSet &gridSet, rvec *f, + void *fpme, + GpuEventSynchronizer *pmeForcesReady, gmx_nbnxn_gpu_t *gpu_nbv, - GpuBufferOpsAccumulateForce accumulateForce); + bool useGpuFPmeReduction, + bool accumulateForce); template void reduceForces<false>(nbnxn_atomdata_t *nbat, const Nbnxm::AtomLocality locality, const Nbnxm::GridSet
&gridSet, rvec *f, + void *fpme, + GpuEventSynchronizer *pmeForcesReady, gmx_nbnxn_gpu_t *gpu_nbv, - GpuBufferOpsAccumulateForce accumulateForce); + bool useGpuFPmeReduction, + bool accumulateForce); void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat, rvec *fshift) diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h index d627732014..e34c8bd558 100644 --- a/src/gromacs/nbnxm/atomdata.h +++ b/src/gromacs/nbnxm/atomdata.h @@ -59,14 +59,14 @@ struct tMPI_Atomic; enum class BufferOpsUseGpu; +class GpuEventSynchronizer; + namespace Nbnxm { class GridSet; enum class KernelType; } -enum class GpuBufferOpsAccumulateForce; - /* Convenience type for vector with aligned memory */ template <class T> using AlignedVector = std::vector < T, gmx::AlignedAllocator < T>>; @@ -344,8 +344,11 @@ void reduceForces(nbnxn_atomdata_t *nbat, Nbnxm::AtomLocality locality, const Nbnxm::GridSet &gridSet, rvec *f, + void *pmeFDeviceBuffer, + GpuEventSynchronizer *pmeForcesReady, gmx_nbnxn_gpu_t *gpu_nbv, - GpuBufferOpsAccumulateForce accumulateForce); + bool useGpuFPmeReduction, + bool accumulateForce); extern template @@ -353,16 +356,22 @@ void reduceForces<true>(nbnxn_atomdata_t *nbat, const Nbnxm::AtomLocality locality, const Nbnxm::GridSet &gridSet, rvec *f, + void *pmeFDeviceBuffer, + GpuEventSynchronizer *pmeForcesReady, gmx_nbnxn_gpu_t *gpu_nbv, - GpuBufferOpsAccumulateForce accumulateForce); + bool useGpuFPmeReduction, + bool accumulateForce); extern template void reduceForces<false>(nbnxn_atomdata_t *nbat, const Nbnxm::AtomLocality locality, const Nbnxm::GridSet &gridSet, rvec *f, + void *pmeFDeviceBuffer, + GpuEventSynchronizer *pmeForcesReady, gmx_nbnxn_gpu_t *gpu_nbv, - GpuBufferOpsAccumulateForce accumulateForce); + bool useGpuFPmeReduction, + bool accumulateForce); /* Add the fshift force stored in nbat to fshift */ void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat, rvec *fshift) diff --git a/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh b/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh index 09d6e37243..bc0d2c2c5e 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh +++ b/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh @@ -139,24 +139,26 @@ __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int numColu /*!
\brief CUDA kernel to add part of the force array(s) from nbnxn_atomdata_t to f * - * \param[in] fnb Force in nbat format - * \param[in,out] f Force buffer to be reduced into - * \param[in] cell Cell index mapping - * \param[in] a0 start atom index - * \param[in] a1 end atom index - * \param[in] stride stride between atoms in memory + * \param[in] fnb Force in nbat format + * \param[in] fPmeDeviceBuffer PME force + * \param[in,out] f Force buffer to be reduced into + * \param[in] cell Cell index mapping + * \param[in] atomStart Start atom index + * \param[in] nAtoms Number of atoms */ -template <bool accumulateForce> +template <bool accumulateForce, bool addPmeF> __global__ void nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__ fnb, - rvec * f, + const float3 *__restrict__ fPmeDeviceBuffer, + float3 * f, const int *__restrict__ cell, const int atomStart, const int nAtoms); -template <bool accumulateForce> +template <bool accumulateForce, bool addPmeF> __global__ void nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__ fnb, - rvec * f, + const float3 *__restrict__ fPmeDeviceBuffer, + float3 * f, const int *__restrict__ cell, const int atomStart, const int nAtoms) @@ -170,16 +172,23 @@ nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__ fnb, { int i = cell[atomStart+threadIndex]; - float3 *f_dest = (float3 *)&f[atomStart+threadIndex][XX]; + float3 *fDest = (float3 *)&f[atomStart+threadIndex]; + float3 temp; if (accumulateForce) { - *f_dest += fnb[i]; + temp = *fDest; + temp += fnb[i]; } else { - *f_dest = fnb[i]; + temp = fnb[i]; } + if (addPmeF) + { + temp += fPmeDeviceBuffer[atomStart+threadIndex]; + } + *fDest = temp; } return; diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index ec16b1b01c..74925428ac 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -54,6 +54,7 @@ #include "nbnxm_cuda.h" #include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/gpu_utils/vectype_ops.cuh" #include "gromacs/mdlib/force_flags.h" #include "gromacs/nbnxm/atomdata.h" @@ -851,18 +852,27 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid, } /* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */ -void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality, - gmx_nbnxn_gpu_t *nb, - int atomStart, - int nAtoms, - GpuBufferOpsAccumulateForce accumulateForce) +void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality, - gmx_nbnxn_gpu_t *nb, + gmx_nbnxn_gpu_t *nb, + void *fPmeDevicePtr, + GpuEventSynchronizer *pmeForcesReady, + int atomStart, + int nAtoms, + bool useGpuFPmeReduction, + bool accumulateForce) { GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality); cudaStream_t stream = nb->stream[iLocality]; + cu_atomdata_t *adat = nb->atdat; + bool addPmeF = useGpuFPmeReduction; - cu_atomdata_t *adat = nb->atdat; + if (addPmeF) + { + // Stream must wait for PME force completion + pmeForcesReady->enqueueWaitEvent(stream); + } /* launch kernel */ @@ -876,16 +886,27 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality, config.sharedMemorySize = 0; config.stream = stream; - auto kernelFn = (accumulateForce == GpuBufferOpsAccumulateForce::True) ? - nbnxn_gpu_add_nbat_f_to_f_kernel<true> : nbnxn_gpu_add_nbat_f_to_f_kernel<false>; - const float3 *fPtr = adat->f; - rvec *frvec = nb->frvec; - const int *cell = nb->cell; + auto kernelFn = accumulateForce ?
+ nbnxn_gpu_add_nbat_f_to_f_kernel<true, false> : + nbnxn_gpu_add_nbat_f_to_f_kernel<false, false>; + + if (addPmeF) + { + kernelFn = accumulateForce ? + nbnxn_gpu_add_nbat_f_to_f_kernel<true, true> : + nbnxn_gpu_add_nbat_f_to_f_kernel<false, true>; + } + + const float3 *d_f = adat->f; + float3 *d_fNB = (float3*) nb->frvec; + const float3 *d_fPme = (float3*) fPmeDevicePtr; + const int *d_cell = nb->cell; const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, - &fPtr, - &frvec, - &cell, + &d_f, + &d_fPme, + &d_fNB, + &d_cell, &atomStart, &nAtoms); diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index 77119044cb..dfe8c4e936 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -158,17 +158,43 @@ gmx::ArrayRef<const int> nonbonded_verlet_t::getGridIndices() const return pairSearch_->gridSet().cells(); } +void +nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality locality, + rvec *f) +{ + + /* Skip the reduction if there was no short-range GPU work to do + * (either NB or both NB and bonded work). */ + if (!pairlistIsSimple() && !haveGpuShortRangeWork(locality)) + { + return; + } + + wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); + wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS); + + reduceForces<false>(nbat.get(), locality, pairSearch_->gridSet(), f, nullptr, nullptr, gpu_nbv, false, false); + + wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS); + wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); +} + void nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality locality, rvec *f, + void *fPmeDeviceBuffer, + GpuEventSynchronizer *pmeForcesReady, BufferOpsUseGpu useGpu, - GpuBufferOpsAccumulateForce accumulateForce) + bool useGpuFPmeReduction, + bool accumulateForce) { - GMX_ASSERT(!((useGpu == BufferOpsUseGpu::False) && - (accumulateForce == GpuBufferOpsAccumulateForce::True)), + GMX_ASSERT(!((useGpu == BufferOpsUseGpu::False) && accumulateForce), "Accumulatation of force is only valid when GPU buffer ops are active"); + GMX_ASSERT((useGpuFPmeReduction == (fPmeDeviceBuffer != nullptr)), + "GPU PME force reduction is only valid when a non-null GPU PME force pointer is available"); + /* Skip the reduction if there was no short-range GPU work to do * (either NB or both NB and bonded work). */ if (!pairlistIsSimple() && !haveGpuShortRangeWork(locality)) @@ -180,7 +206,7 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS); auto fn = useGpu == BufferOpsUseGpu::True ? reduceForces<true> : reduceForces<false>; - fn(nbat.get(), locality, pairSearch_->gridSet(), f, gpu_nbv, accumulateForce); + fn(nbat.get(), locality, pairSearch_->gridSet(), f, fPmeDeviceBuffer, pmeForcesReady, gpu_nbv, useGpuFPmeReduction, accumulateForce); wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS); wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index a749b074b9..cf0b2f62ab 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -135,14 +135,7 @@ enum class BufferOpsUseGpu { False }; -/*! \brief Switch for whether forces should accumulate in GPU buffer ops */ -enum class GpuBufferOpsAccumulateForce -{ - True, // Force should be accumulated and format converted - False, // Force should be not accumulated, just format converted - Null // GPU buffer ops are not in use, so this object is not applicable -}; - +class GpuEventSynchronizer; namespace gmx { @@ -310,11 +303,29 @@ struct nonbonded_verlet_t int forceFlags, t_nrnb *nrnb); - //!
Add the forces stored in nbat to f, zeros the forces in nbat */ + /*! \brief Add the forces stored in nbat to f, zeros the forces in nbat + * \param [in] locality Local or non-local + * \param [in,out] f Force to be added to + */ + void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality locality, + rvec *f); + + /*! \brief Add the forces stored in nbat to f, allowing for the possibility that GPU buffer ops are active + * \param [in] locality Local or non-local + * \param [in,out] f Force to be added to + * \param [in] fPme Force from PME calculation + * \param [in] pmeForcesReady Event triggered when PME force calculation has completed + * \param [in] useGpu Whether GPU buffer ops are active + * \param [in] useGpuFPmeReduction Whether PME force reduction is on GPU + * \param [in] accumulateForce Whether force should be accumulated or stored + */ void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality locality, rvec *f, + void *fPme, + GpuEventSynchronizer *pmeForcesReady, BufferOpsUseGpu useGpu, - GpuBufferOpsAccumulateForce accumulateForce); + bool useGpuFPmeReduction, + bool accumulateForce); /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */ void atomdata_init_add_nbat_f_to_f_gpu(); diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index b213ff4c07..0f0e8de218 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -54,7 +54,6 @@ struct nbnxn_atomdata_t; enum class GpuTaskCompletion; -enum class GpuBufferOpsAccumulateForce; namespace gmx { @@ -280,9 +279,12 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused *cell, CUDA_FUNC_QUALIFIER void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality gmx_unused atomLocality, gmx_nbnxn_gpu_t gmx_unused *gpu_nbv, + void gmx_unused *fPmeDevicePtr, + GpuEventSynchronizer gmx_unused *pmeForcesReady, int gmx_unused atomStart, int gmx_unused nAtoms, - GpuBufferOpsAccumulateForce gmx_unused accumulateForce) CUDA_FUNC_TERM; + bool gmx_unused useGpuFPmeReduction, + bool gmx_unused accumulateForce) CUDA_FUNC_TERM; /*! \brief Copy force buffer from CPU to GPU */ CUDA_FUNC_QUALIFIER -- 2.22.0
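Illustrative sketch (not part of the patch): the central dependency in this change is that the buffer-ops stream may only read the PME force buffer after the PME gather kernel has completed, which the patch expresses through the pmeForcesReady synchronizer (markEvent in pme_gpu_gather, enqueueWaitEvent in nbnxn_gpu_add_nbat_f_to_f). GROMACS's GpuEventSynchronizer provides exactly those two calls; the simplified stand-in class below, which is an assumption-level sketch and not the GROMACS implementation, shows the same producer/consumer ordering with raw CUDA events.

#include <cuda_runtime.h>

class StreamEventSync // simplified stand-in for GpuEventSynchronizer
{
    public:
        StreamEventSync()  { cudaEventCreateWithFlags(&event_, cudaEventDisableTiming); }
        ~StreamEventSync() { cudaEventDestroy(event_); }
        // Producer side: record completion of all work enqueued so far in this stream.
        // The PME stream would call this right after the gather kernel, instead of
        // copying the forces back to the host.
        void markEvent(cudaStream_t producerStream)
        {
            cudaEventRecord(event_, producerStream);
        }
        // Consumer side: make another stream wait on the device (without blocking the
        // host) until the marked work has finished. The buffer-ops stream would call
        // this before launching the reduction kernel that reads the PME forces.
        void enqueueWaitEvent(cudaStream_t consumerStream)
        {
            cudaStreamWaitEvent(consumerStream, event_, 0);
        }
    private:
        cudaEvent_t event_;
};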
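Illustrative sketch (not part of the patch): the F buffer ops reduction itself, reduced to a standalone CUDA example. All names here (reduceKernel, launchReduce) are simplified stand-ins rather than the GROMACS API, and the cell map, grid layout and kernel-launch plumbing of the real nbnxn_gpu_add_nbat_f_to_f are omitted; the point is the template dispatch on <accumulateForce, addPmeForce> and the stream-ordered wait before the kernel that consumes the PME forces.

#include <cuda_runtime.h>

// Compile-time switches mirror the kernel's template parameters: dead branches
// are removed by the compiler, so no per-atom runtime flag checks remain.
template<bool accumulateForce, bool addPmeForce>
__global__ void reduceKernel(const float3* fNbatFormat, // forces in nbat (cell) order
                             const float3* fPme,        // PME forces, atom order
                             float3*       f,           // output force buffer, atom order
                             const int*    cell,        // atom -> nbat index map
                             int           atomStart,
                             int           numAtoms)
{
    int threadIndex = blockIdx.x*blockDim.x + threadIdx.x;
    if (threadIndex < numAtoms)
    {
        int    i    = cell[atomStart + threadIndex];
        float3 temp = accumulateForce ? f[atomStart + threadIndex] : make_float3(0, 0, 0);
        temp.x += fNbatFormat[i].x;
        temp.y += fNbatFormat[i].y;
        temp.z += fNbatFormat[i].z;
        if (addPmeForce)
        {
            temp.x += fPme[atomStart + threadIndex].x;
            temp.y += fPme[atomStart + threadIndex].y;
            temp.z += fPme[atomStart + threadIndex].z;
        }
        f[atomStart + threadIndex] = temp;
    }
}

// Host-side dispatch, mirroring the kernelFn selection in nbnxm_cuda.cu: when the
// PME forces are reduced on the GPU, the reduction stream first waits on the event
// recorded by the PME stream, then the matching template instantiation is launched.
static void launchReduce(const float3* fNbatFormat, const float3* fPme, float3* f,
                         const int* cell, int atomStart, int numAtoms,
                         bool accumulate, bool addPme,
                         cudaStream_t reductionStream, cudaEvent_t pmeForcesReady)
{
    if (addPme)
    {
        // Equivalent of pmeForcesReady->enqueueWaitEvent(stream) in the patch
        cudaStreamWaitEvent(reductionStream, pmeForcesReady, 0);
    }
    auto kernel = addPme ? (accumulate ? reduceKernel<true, true> : reduceKernel<false, true>)
                         : (accumulate ? reduceKernel<true, false> : reduceKernel<false, false>);
    const int blockSize = 128;
    const int numBlocks = (numAtoms + blockSize - 1)/blockSize;
    kernel<<<numBlocks, blockSize, 0, reductionStream>>>(fNbatFormat, fPme, f, cell,
                                                         atomStart, numAtoms);
}

The same pattern scales to the multi-GPU case promised in the commit message: as long as the PME force buffer is resident on the local device and an event marks its readiness, only the wait/launch ordering above is needed, not a host-side synchronization.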