From 54c2472992b082d43bc020a0165d47a64165991e Mon Sep 17 00:00:00 2001
From: Alan Gray
Date: Tue, 3 Sep 2019 01:00:13 -0700
Subject: [PATCH] GPU Force Halo Exchange

Activate with GMX_GPU_DD_COMMS environment variable.

Extends the GPU halo exchange feature to provide GPU force halo
exchange functionality. Does not yet support virial steps, which
require an extra shift force reduction - these are currently performed
on the non-buffer-ops / non-direct-comm path. Also has the same
limitations as the coordinate halo exchange.

Performs part of #2890. Future work to improve synchronization towards
a more one-sided scheme (#3092) and to make dependencies more explicit
(#3093).

Change-Id: Ifc23cc8db2655f7258e68b34e7cdc7b71994e1e8
---
 src/gromacs/domdec/gpuhaloexchange.h        |  36 ++++--
 src/gromacs/domdec/gpuhaloexchange_impl.cpp |   9 +-
 src/gromacs/domdec/gpuhaloexchange_impl.cu  | 126 ++++++++++++++++++--
 src/gromacs/domdec/gpuhaloexchange_impl.cuh |  29 +++--
 src/gromacs/mdlib/sim_util.cpp              |  52 ++++++--
 src/gromacs/mdrun/runner.cpp                |   8 +-
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu        |  15 +++
 src/gromacs/nbnxm/nbnxm.cpp                 |  10 ++
 src/gromacs/nbnxm/nbnxm.h                   |   6 +
 src/gromacs/nbnxm/nbnxm_gpu.h               |  12 ++
 10 files changed, 258 insertions(+), 45 deletions(-)
 mode change 100755 => 100644 src/gromacs/domdec/gpuhaloexchange_impl.cpp

diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h
index 58f908b76a..4187e45cc2 100644
--- a/src/gromacs/domdec/gpuhaloexchange.h
+++ b/src/gromacs/domdec/gpuhaloexchange.h
@@ -60,22 +60,31 @@ class GpuHaloExchange
     public:
         /*! \brief Creates GPU Halo Exchange object.
          *
-         * Halo exchange will be performed in \c streamNonLocal, and
-         * the main communicateHaloCoordinates method must be called
-         * before any subsequent operations that access non-local
-         * parts of the coordinate buffer (such as the non-local
-         * non-bonded kernels). It also must be called after the local
-         * coordinates buffer operations (where the coordinates are
-         * copied to the device and hence the \c
-         * coordinatesOnDeviceEvent is recorded).
+         * Coordinate halo exchange will be performed in \c
+         * streamNonLocal, and the \c communicateHaloCoordinates
+         * method must be called before any subsequent operations that
+         * access non-local parts of the coordinate buffer (such as
+         * the non-local non-bonded kernels). It also must be called
+         * after the local coordinates buffer operations (where the
+         * coordinates are copied to the device and hence the \c
+         * coordinatesOnDeviceEvent is recorded). Force halo exchange
+         * will be performed in \c streamNonLocal (also potentially
+         * with buffer clearing in \c streamLocal) and the \c
+         * communicateHaloForces method must be called after the
+         * non-local buffer operations, after the local force buffer
+         * has been copied to the GPU (if CPU forces are present), and
+         * before the local buffer operations. The force halo exchange
+         * does not yet support virial steps.
          *
         * \param [inout] dd                       domdec structure
        * \param [in]    mpi_comm_mysim           communicator used for simulation
+        * \param [in]    streamLocal              local NB CUDA stream.
        * \param [in]    streamNonLocal           non-local NB CUDA stream.
        * \param [in]    coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
        */
        GpuHaloExchange(gmx_domdec_t *dd,
                        MPI_Comm      mpi_comm_mysim,
+                       void         *streamLocal,
                        void         *streamNonLocal,
                        void         *coordinatesOnDeviceEvent);
        ~GpuHaloExchange();
@@ -84,9 +93,10 @@ class GpuHaloExchange
         *
         * Initialization for GPU halo exchange of coordinates buffer
         * \param [in] d_coordinateBuffer   pointer to coordinates buffer in GPU memory
+        * \param [in] d_forcesBuffer       pointer to forces buffer in GPU memory
         */
-       void reinitHalo(rvec *d_coordinateBuffer);
-
+       void reinitHalo(rvec *d_coordinateBuffer,
+                       rvec *d_forcesBuffer);

        /*! \brief GPU halo exchange of coordinates buffer.
        *
@@ -97,6 +107,12 @@ class GpuHaloExchange
        */
        void communicateHaloCoordinates(const matrix box);

+       /*! \brief GPU halo exchange of force buffer.
+        * \param[in] accumulateForces  True if forces should accumulate, otherwise they are set
+        */
+       void communicateHaloForces(bool accumulateForces);
+
+
    private:
        class Impl;
        gmx::PrivateImplPointer<Impl> impl_;

diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
old mode 100755
new mode 100644
index 4b491eac36..741c79519c
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
@@ -62,6 +62,7 @@ class GpuHaloExchange::Impl
 /*!\brief Constructor stub. */
 GpuHaloExchange::GpuHaloExchange(gmx_domdec_t * /* dd */,
                                  MPI_Comm       /* mpi_comm_mysim */,
+                                 void         * /*streamLocal */,
                                  void         * /*streamNonLocal */,
                                  void         * /*coordinatesOnDeviceEvent*/)
    : impl_(nullptr)
@@ -72,7 +73,8 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t * /* dd */,
 GpuHaloExchange::~GpuHaloExchange() = default;

 /*!\brief init halo exhange stub. */
-void GpuHaloExchange::reinitHalo(rvec * /* d_coordinatesBuffer */)
+void GpuHaloExchange::reinitHalo(rvec * /* d_coordinatesBuffer */,
+                                 rvec * /* d_forcesBuffer */)
 {
    GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
 }
@@ -83,6 +85,11 @@ void GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */)
    GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange exchange was called insted of the correct implementation.");
 }

+/*!\brief apply F halo exchange stub. */
+void GpuHaloExchange::communicateHaloForces(bool gmx_unused accumulateForces)
+{
+    GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called instead of the correct implementation.");
+}

 } // namespace gmx

diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu
index 4e8ba35dd4..0b024fb0bd 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cu
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu
@@ -37,7 +37,7 @@
 * \brief Implements GPU halo exchange using CUDA.
 *
 *
- * \author Alan Gray
+ * \author Alan Gray
 *
 * \ingroup module_domdec
 */
@@ -95,10 +95,44 @@ __global__ void packSendBufKernel(float3 * __restrict__ dataPacked,
    return;
 }

-void GpuHaloExchange::Impl::reinitHalo(float3 *d_coordinatesBuffer)
+/*! \brief unpack non-local force data buffer on the GPU using pre-populated "map" containing index information
+ * \param[out] data        full array of force values
+ * \param[in]  dataPacked  packed array of force values to be transferred
+ * \param[in]  map         array of indices defining mapping from full to packed array
+ * \param[in]  mapSize     number of elements in map array
+ */
+template <bool accumulate>
+__global__ void unpackRecvBufKernel(float3 * __restrict__       data,
+                                    const float3 * __restrict__ dataPacked,
+                                    const int * __restrict__    map,
+                                    const int                   mapSize)
+{
+
+    int           threadIndex = blockIdx.x*blockDim.x+threadIdx.x;
+    const float3 *gm_dataSrc  = &dataPacked[threadIndex];
+    float3       *gm_dataDest = &data[map[threadIndex]];
+
+    if (threadIndex < mapSize)
+    {
+        if (accumulate)
+        {
+            *gm_dataDest += *gm_dataSrc;
+        }
+        else
+        {
+            *gm_dataDest = *gm_dataSrc;
+        }
+    }
+
+    return;
+}
+
+void GpuHaloExchange::Impl::reinitHalo(float3 *d_coordinatesBuffer,
+                                       float3 *d_forcesBuffer)
 {

    d_x_ = d_coordinatesBuffer;
+    d_f_ = d_forcesBuffer;

    cudaStream_t stream = nonLocalStream_;
    int          nzone  = 1;
@@ -128,7 +162,7 @@ void GpuHaloExchange::Impl::reinitHalo(float3 *d_coordinatesBuffer)
    fSendSize_ = xRecvSize_;
    fRecvSize_ = xSendSize_;

-    localOffset_  = comm.atomRanges.numHomeAtoms();  //offset for data recieved by this rank
+    numHomeAtoms_ = comm.atomRanges.numHomeAtoms();  //offset for data received by this rank

    GMX_ASSERT(ind.index.size() == h_indexMap_.size(), "Size mismatch");
    std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin());
@@ -141,7 +175,7 @@ void GpuHaloExchange::Impl::reinitHalo(float3 *d_coordinatesBuffer)
    // since the pointers will not change until the next NS step.

    //Coordinates buffer:
-    void* recvPtr = static_cast<void*> (&d_coordinatesBuffer[localOffset_]);
+    void* recvPtr = static_cast<void*> (&d_coordinatesBuffer[numHomeAtoms_]);
    MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankX_, 0,
                 &remoteXPtr_, sizeof(void*), MPI_BYTE, sendRankX_, 0,
                 mpi_comm_mysim_, MPI_STATUS_IGNORE);
@@ -208,6 +242,58 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box)
    return;
 }

+// The following method should be called after non-local buffer operations,
+// and before the local buffer operations. It operates in the non-local stream.
+void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
+{
+
+    // Communicate halo data (in non-local stream)
+    communicateHaloData(d_f_, HaloQuantity::HaloForces);
+
+    float3 *d_f = d_f_;
+
+    if (!accumulateForces)
+    {
+        //Clear local portion of force array (in local stream)
+        cudaMemsetAsync(d_f, 0, numHomeAtoms_*sizeof(rvec), localStream_);
+    }
+
+    // ensure non-local stream waits for local stream, due to dependence on
+    // the previous H2D copy of CPU forces (if accumulateForces is true)
+    // or the above clearing.
+    // TODO remove this dependency on localStream - Redmine issue #3093
+    GpuEventSynchronizer eventLocal;
+    eventLocal.markEvent(localStream_);
+    eventLocal.enqueueWaitEvent(nonLocalStream_);
+
+    //Unpack halo buffer into force array
+
+    KernelLaunchConfig config;
+    config.blockSize[0]     = c_threadsPerBlock;
+    config.blockSize[1]     = 1;
+    config.blockSize[2]     = 1;
+    config.gridSize[0]      = (fRecvSize_+c_threadsPerBlock-1)/c_threadsPerBlock;
+    config.gridSize[1]      = 1;
+    config.gridSize[2]      = 1;
+    config.sharedMemorySize = 0;
+    config.stream           = nonLocalStream_;
+
+    const float3 *recvBuf  = d_recvBuf_;
+    const int    *indexMap = d_indexMap_;
+    const int     size     = fRecvSize_;
+
+    if (size > 0)
+    {
+        auto kernelFn = accumulateForces ? unpackRecvBufKernel<true> : unpackRecvBufKernel<false>;
+
+        const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &d_f,
+                                                          &recvBuf, &indexMap,
+                                                          &size);
+
+        launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
+    }
+}
+
 void GpuHaloExchange::Impl::communicateHaloData(float3      * d_ptr,
                                                 HaloQuantity  haloQuantity)
@@ -218,6 +304,7 @@ void GpuHaloExchange::Impl::communicateHaloData(float3      * d_ptr,
    void * remotePtr;
    int    sendRank;
    int    recvRank;
+
    if (haloQuantity == HaloQuantity::HaloCoordinates)
    {
        sendPtr   = static_cast<void*> (d_sendBuf_);
        sendSize  = xSendSize_;
        remotePtr = remoteXPtr_;
        sendRank  = sendRankX_;
        recvRank  = recvRankX_;
+
+        //Wait for signal from receiving task that it is ready, and similarly send signal to task that will push data to this task
+        char thisTaskIsReady, remoteTaskIsReady;
+        MPI_Sendrecv(&thisTaskIsReady, sizeof(char), MPI_BYTE, recvRank, 0,
+                     &remoteTaskIsReady, sizeof(char), MPI_BYTE, sendRank, 0,
+                     mpi_comm_mysim_, MPI_STATUS_IGNORE);
    }
    else
    {
-        sendPtr   = static_cast<void*> (&(d_ptr[localOffset_]));
+        sendPtr   = static_cast<void*> (&(d_ptr[numHomeAtoms_]));
        sendSize  = fSendSize_;
        remotePtr = remoteFPtr_;
        sendRank  = sendRankF_;
@@ -238,7 +331,6 @@ void GpuHaloExchange::Impl::communicateHaloData(float3 * d_ptr,
    communicateHaloDataWithCudaDirect(sendPtr, sendSize, sendRank, remotePtr, recvRank);
 }

-
 void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void *sendPtr,
                                                               int   sendSize,
                                                               int   sendRank,
@@ -247,6 +339,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void *sendPtr,
 {

    cudaError_t  stat;
+    cudaStream_t stream = nonLocalStream_;

    // We asynchronously push data to remote rank. The remote
    // destination pointer has already been set in the init fn. We
@@ -257,7 +350,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void *sendPtr,
    // send data to neighbor, if any data exists to send
    if (sendSize > 0)
    {
-        stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize*DIM*sizeof(float), cudaMemcpyDeviceToDevice, nonLocalStream_);
+        stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize*DIM*sizeof(float), cudaMemcpyDeviceToDevice, stream);
        CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
    }

@@ -267,19 +360,20 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void *sendPtr,
    // its stream.
    GpuEventSynchronizer *haloDataTransferRemote;

-    haloDataTransferLaunched_->markEvent(nonLocalStream_);
+    haloDataTransferLaunched_->markEvent(stream);

    MPI_Sendrecv(&haloDataTransferLaunched_, sizeof(GpuEventSynchronizer*), MPI_BYTE, sendRank, 0,
                 &haloDataTransferRemote, sizeof(GpuEventSynchronizer*), MPI_BYTE, recvRank, 0,
                 mpi_comm_mysim_, MPI_STATUS_IGNORE);

-    haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_);
+    haloDataTransferRemote->enqueueWaitEvent(stream);
 }

 /*! \brief Create Domdec GPU object */
 GpuHaloExchange::Impl::Impl(gmx_domdec_t *dd,
                             MPI_Comm      mpi_comm_mysim,
+                            void         *localStream,
                             void         *nonLocalStream,
                             void         *coordinatesOnDeviceEvent)
    : dd_(dd),
@@ -290,6 +384,7 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t *dd,
      usePBC_(dd->ci[dd->dim[0]] == 0),
      haloDataTransferLaunched_(new GpuEventSynchronizer()),
      mpi_comm_mysim_(mpi_comm_mysim),
+      localStream_(*static_cast<cudaStream_t*> (localStream)),
      nonLocalStream_(*static_cast<cudaStream_t*> (nonLocalStream)),
      coordinatesOnDeviceEvent_(static_cast<GpuEventSynchronizer*> (coordinatesOnDeviceEvent))
 {
@@ -323,17 +418,19 @@ GpuHaloExchange::Impl::~Impl()

 GpuHaloExchange::GpuHaloExchange(gmx_domdec_t *dd,
                                  MPI_Comm      mpi_comm_mysim,
+                                 void         *localStream,
                                  void         *nonLocalStream,
                                  void         *coordinatesOnDeviceEvent)
-    : impl_(new Impl(dd, mpi_comm_mysim, nonLocalStream, coordinatesOnDeviceEvent))
+    : impl_(new Impl(dd, mpi_comm_mysim, localStream, nonLocalStream, coordinatesOnDeviceEvent))
 {
 }

 GpuHaloExchange::~GpuHaloExchange() = default;

-void GpuHaloExchange::reinitHalo(rvec *d_coordinatesBuffer)
+void GpuHaloExchange::reinitHalo(rvec *d_coordinatesBuffer,
+                                 rvec *d_forcesBuffer)
 {
-    impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer));
+    impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer), reinterpret_cast<float3*>(d_forcesBuffer));
 }

 void GpuHaloExchange::communicateHaloCoordinates(const matrix box)
@@ -341,4 +438,9 @@ void GpuHaloExchange::communicateHaloCoordinates(const matrix box)
    impl_->communicateHaloCoordinates(box);
 }

+void GpuHaloExchange::communicateHaloForces(bool accumulateForces)
+{
+    impl_->communicateHaloForces(accumulateForces);
+}
+
 } //namespace gmx

diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh
index bd125654ee..10d9118927 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh
@@ -68,11 +68,13 @@ class GpuHaloExchange::Impl
         *
         * \param [inout] dd                       domdec structure
         * \param [in]    mpi_comm_mysim           communicator used for simulation
+        * \param [in]    localStream              local NB CUDA stream
        * \param [in]    nonLocalStream           non-local NB CUDA stream
        * \param [in]    coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
        */
        Impl(gmx_domdec_t *dd,
             MPI_Comm      mpi_comm_mysim,
+            void         *localStream,
             void         *nonLocalStream,
             void         *coordinatesOnDeviceEvent);
        ~Impl();
@@ -80,8 +82,10 @@ class GpuHaloExchange::Impl
        /*! \brief
        * (Re-) Initialization for GPU halo exchange
        * \param [in] d_coordinatesBuffer  pointer to coordinates buffer in GPU memory
+        * \param [in] d_forcesBuffer       pointer to forces buffer in GPU memory
        */
-       void reinitHalo(float3 *d_coordinatesBuffer);
+       void reinitHalo(float3 *d_coordinatesBuffer,
+                       float3 *d_forcesBuffer);


        /*! \brief
@@ -90,6 +94,11 @@ class GpuHaloExchange::Impl
        */
        void communicateHaloCoordinates(const matrix box);

+       /*! \brief GPU halo exchange of force buffer
+        * \param[in] accumulateForces  True if forces should accumulate, otherwise they are set
+        */
+       void communicateHaloForces(bool accumulateForces);
+
    private:

        /*! \brief Data transfer wrapper for GPU halo exchange
@@ -106,11 +115,11 @@ class GpuHaloExchange::Impl
        * \param [inout] remotePtr  remote address to recv data
        * \param [in]    recvRank   rank to recv data from
        */
-       void communicateHaloDataWithCudaDirect(void *sendPtr,
-                                              int   sendSize,
-                                              int   sendRank,
-                                              void* remotePtr,
-                                              int   recvRank);
+       void communicateHaloDataWithCudaDirect(void  *sendPtr,
+                                              int    sendSize,
+                                              int    sendRank,
+                                              void  *remotePtr,
+                                              int    recvRank);

        //! Domain decomposition object
        gmx_domdec_t               *dd_ = nullptr;
@@ -152,8 +161,8 @@ class GpuHaloExchange::Impl
        int                         fSendSize_ = 0;
        //! recv copy size to this rank for F
        int                         fRecvSize_ = 0;
-       //! offset of local halo region
-       int                         localOffset_ = 0;
+       //! number of home atoms - offset of local halo region
+       int                         numHomeAtoms_ = 0;
        //! remote GPU coordinates buffer pointer for pushing data
        void                       *remoteXPtr_ = 0;
        //! remote GPU force buffer pointer for pushing data
@@ -166,12 +175,16 @@ class GpuHaloExchange::Impl
        GpuEventSynchronizer       *haloDataTransferLaunched_ = nullptr;
        //! MPI communicator used for simulation
        MPI_Comm                    mpi_comm_mysim_;
+       //! CUDA stream for local non-bonded calculations
+       cudaStream_t                localStream_ = nullptr;
        //! CUDA stream for non-local non-bonded calculations
        cudaStream_t                nonLocalStream_ = nullptr;
        //! Event triggered when coordinates have been copied to device
        GpuEventSynchronizer       *coordinatesOnDeviceEvent_ = nullptr;
        //! full coordinates buffer in GPU memory
        float3                     *d_x_ = nullptr;
+       //! full forces buffer in GPU memory
+       float3                     *d_f_ = nullptr;

 };

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index aba304222f..881b9974ac 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1210,7 +1210,8 @@ void do_force(FILE *fplog,
            if (ddUsesGpuDirectCommunication)
            {
                rvec* d_x = static_cast<rvec*> (nbv->get_gpu_xrvec());
-                gpuHaloExchange->reinitHalo(d_x);
+                rvec* d_f = static_cast<rvec*> (nbv->get_gpu_frvec());
+                gpuHaloExchange->reinitHalo(d_x, d_f);
            }
        }
        else
@@ -1519,6 +1520,12 @@ void do_force(FILE *fplog,
        }
    }

+    const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True);
+    const bool useCpuPmeFReduction      = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
+    // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
+    const bool haveCpuLocalForces       = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork || useCpuPmeFReduction ||
+                                           (fr->efep != efepNO));
+
    if (havePPDomainDecomposition(cr))
    {
        /* We are done with the CPU compute.
@@ -1530,11 +1537,27 @@
        if (forceFlags.computeForces)
        {
-            if (useGpuFBufOps == BufferOpsUseGpu::True)
+            gmx::ArrayRef<gmx::RVec> force = forceOut.forceWithShiftForces().force();
+            rvec                    *f     = as_rvec_array(force.data());
+
+            if (useGpuForcesHaloExchange)
            {
-                nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::NonLocal);
+                if (haveCpuLocalForces)
+                {
+                    nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::Local);
+                }
+                bool accumulateHaloForces = haveCpuLocalForces;
+                gpuHaloExchange->communicateHaloForces(accumulateHaloForces);
            }
-            dd_move_f(cr->dd, &forceOut.forceWithShiftForces(), wcycle);
+            else
+            {
+                if (useGpuFBufOps == BufferOpsUseGpu::True)
+                {
+                    nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::NonLocal);
+                }
+                dd_move_f(cr->dd, &forceOut.forceWithShiftForces(), wcycle);
+            }
+
        }
    }

@@ -1604,12 +1627,6 @@
    {
        gmx::ArrayRef<gmx::RVec> forceWithShift = forceOut.forceWithShiftForces().force();

-
-        const bool useCpuPmeFReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
-        // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
-        const bool haveCpuLocalForces  = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork || useCpuPmeFReduction ||
-                                          (fr->efep != efepNO));
-
        if (useGpuFBufOps == BufferOpsUseGpu::True)
        {
            // Flag to specify whether the CPU force buffer has contributions to
            // - CPU f H2D should be as soon as all CPU-side forces are done
            // - wait for force reduction does not need to block host (at least not here, it's sufficient to wait
            //   before the next CPU task that consumes the forces: vsite spread or update)
-            //
+            // - copy is not performed if GPU force halo exchange is active, because it would overwrite the result
+            //   of the halo exchange. In that case the copy is instead performed above, before the exchange.
+            //   These should be unified.
            rvec *f = as_rvec_array(forceWithShift.data());
-            if (haveLocalForceContribInCpuBuffer)
+            if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange)
            {
                nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::Local);
            }
+            if (useGpuForcesHaloExchange)
+            {
+                // Add a stream synchronization to satisfy a dependency
+                // for the local buffer ops on the result of GPU halo
+                // exchange, which operates in the non-local stream and
+                // writes to the local part of the force buffer.
+                // TODO improve this through use of an event - see Redmine #3093
+                nbv->stream_local_wait_for_nonlocal();
+            }
            nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::Local,
                                              nbv->getDeviceForces(),
                                              pme_gpu_get_device_f(fr->pmedata),

diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index acca236d2b..8d2f549c9d 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -1337,11 +1337,15 @@ int Mdrunner::mdrunner()

    // TODO Move this to happen during domain decomposition setup,
    // once stream and event handling works well with that.
+    // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
    if (havePPDomainDecomposition(cr) && c_enableGpuHaloExchange && useGpuForNonbonded)
    {
-        void *stream                   = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+        void *streamLocal              = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::Local);
+        void *streamNonLocal           =
+            Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
        void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
-        cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(cr->dd, cr->mpi_comm_mysim, stream, coordinatesOnDeviceEvent);
+        cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(cr->dd, cr->mpi_comm_mysim, streamLocal,
+                                                                    streamNonLocal, coordinatesOnDeviceEvent);
    }

    /* Initialize the mdAtoms structure.

diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 9715d2985f..05c0278abc 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -1074,6 +1074,11 @@ void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t *gpu_nbv)
    return static_cast<void*> (gpu_nbv->xrvec);
 }

+void* nbnxn_get_gpu_frvec(gmx_nbnxn_gpu_t *gpu_nbv)
+{
+    return static_cast<void*> (gpu_nbv->frvec);
+}
+
 void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t *nb)
 {
    return static_cast<void*> (nb->xAvailableOnDevice);
@@ -1084,4 +1089,14 @@ void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_cuda_t *nb)
    nb->xNonLocalCopyD2HDone->waitForEvent();
 }

+void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_cuda_t *nb)
+{
+    cudaStream_t localStream    = nb->stream[InteractionLocality::Local];
+    cudaStream_t nonLocalStream = nb->stream[InteractionLocality::NonLocal];
+
+    GpuEventSynchronizer event;
+    event.markEvent(nonLocalStream);
+    event.enqueueWaitEvent(localStream);
+}
+
 } // namespace Nbnxm

diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 031033dea9..892098ae4a 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -330,4 +330,14 @@ void nonbonded_verlet_t::wait_nonlocal_x_copy_D2H_done()
    Nbnxm::nbnxn_wait_nonlocal_x_copy_D2H_done(gpu_nbv);
 }

+void* nonbonded_verlet_t::get_gpu_frvec()
+{
+    return Nbnxm::nbnxn_get_gpu_frvec(gpu_nbv);
+}
+
+void nonbonded_verlet_t::stream_local_wait_for_nonlocal()
+{
+    Nbnxm::nbnxn_stream_local_wait_for_nonlocal(gpu_nbv);
+}
+
 /*! \endcond */

diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 7a27a5cb4e..caa21d9c7f 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -406,6 +406,12 @@ struct nonbonded_verlet_t
        /*! \brief Wait for non-local copy of coordinate buffer from device to host */
        void wait_nonlocal_x_copy_D2H_done();

+        /*! \brief return GPU pointer to f in rvec format */
+        void* get_gpu_frvec();
+
+        /*! \brief Ensure local stream waits for non-local stream */
+        void stream_local_wait_for_nonlocal();
+
        //! Return the kernel setup
        const Nbnxm::KernelSetup &kernelSetup() const
        {

diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index 743bcffa20..635b9d9790 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -411,5 +411,17 @@ void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
 CUDA_FUNC_QUALIFIER
 void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM;

+/*! \brief return GPU pointer to f in rvec format
+ * \param[in] nb  The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void* nbnxn_get_gpu_frvec(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+
+/*! \brief Ensure local stream waits for non-local stream
+ * \param[in] nb  The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM;
+
 } // namespace Nbnxm
 #endif
-- 
2.22.0
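
Note on the stream ordering used above: the correctness of communicateHaloForces() and of the local buffer ops that follow it rests on cross-stream dependencies. The non-local stream must wait for the local stream (which carries the force clearing or the H2D copy of CPU forces) before the unpack kernel runs, and the local stream must in turn wait for the non-local stream before atomdata_add_nbat_f_to_f_gpu reduces into the local part of the force buffer (nbnxn_stream_local_wait_for_nonlocal). The standalone sketch below is not GROMACS code; it uses plain CUDA events in place of GpuEventSynchronizer, and all names in it are illustrative, but it shows the same producer/consumer pattern between two streams that the patch relies on (and that #3093 proposes to make more explicit).

#include <cuda_runtime.h>
#include <cstdio>

#define CHECK(call)                                                         \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::fprintf(stderr, "CUDA error %s at %s:%d\n",                \
                         cudaGetErrorString(err_), __FILE__, __LINE__);     \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Stand-in for the halo unpack work that must not start before the buffer
// has been cleared (or the CPU forces copied in) by the other stream.
__global__ void addOne(float* data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        data[i] += 1.0f;
    }
}

int main()
{
    const int n     = 1 << 20;
    float*    d_buf = nullptr;
    CHECK(cudaMalloc(&d_buf, n * sizeof(float)));

    cudaStream_t localStream, nonLocalStream;
    cudaEvent_t  localWorkDone;
    CHECK(cudaStreamCreate(&localStream));
    CHECK(cudaStreamCreate(&nonLocalStream));
    CHECK(cudaEventCreateWithFlags(&localWorkDone, cudaEventDisableTiming));

    // "Local stream" work: clear the buffer (analogous to clearing the local
    // part of the force buffer when not accumulating).
    CHECK(cudaMemsetAsync(d_buf, 0, n * sizeof(float), localStream));

    // Record an event in the local stream and make the non-local stream wait
    // on it, mirroring markEvent()/enqueueWaitEvent() in the patch.
    CHECK(cudaEventRecord(localWorkDone, localStream));
    CHECK(cudaStreamWaitEvent(nonLocalStream, localWorkDone, 0));

    // "Non-local stream" work: only now is it safe to touch the buffer.
    addOne<<<(n + 255) / 256, 256, 0, nonLocalStream>>>(d_buf, n);

    CHECK(cudaStreamSynchronize(nonLocalStream));
    std::printf("done\n");

    CHECK(cudaEventDestroy(localWorkDone));
    CHECK(cudaStreamDestroy(localStream));
    CHECK(cudaStreamDestroy(nonLocalStream));
    CHECK(cudaFree(d_buf));
    return 0;
}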
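
Note on the unpack kernel: unpackRecvBufKernel is templated on a bool so that one kernel body can either accumulate the received halo forces into the home-atom range or overwrite it, with the instantiation picked at launch time (accumulateForces ? unpackRecvBufKernel<true> : unpackRecvBufKernel<false>). The standalone sketch below is not GROMACS code - the kernel name, index map and sizes are hypothetical - but it demonstrates the same scatter-through-a-map with compile-time selection of accumulate-versus-set semantics.

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

template<bool accumulate>
__global__ void unpackKernel(float3* data, const float3* dataPacked, const int* map, int mapSize)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < mapSize)
    {
        float3  src  = dataPacked[i];
        float3* dest = &data[map[i]];
        if (accumulate)
        {
            // add received halo contribution to the existing force
            dest->x += src.x;
            dest->y += src.y;
            dest->z += src.z;
        }
        else
        {
            // no prior contributions: overwrite
            *dest = src;
        }
    }
}

int main()
{
    const int           mapSize = 4, numAtoms = 8;
    std::vector<int>    h_map = { 1, 3, 5, 7 };  // hypothetical halo-to-home index map
    std::vector<float3> h_packed(mapSize, make_float3(1, 2, 3));

    float3* d_data   = nullptr;
    float3* d_packed = nullptr;
    int*    d_map    = nullptr;
    cudaMalloc(&d_data, numAtoms * sizeof(float3));
    cudaMalloc(&d_packed, mapSize * sizeof(float3));
    cudaMalloc(&d_map, mapSize * sizeof(int));
    cudaMemset(d_data, 0, numAtoms * sizeof(float3));
    cudaMemcpy(d_packed, h_packed.data(), mapSize * sizeof(float3), cudaMemcpyHostToDevice);
    cudaMemcpy(d_map, h_map.data(), mapSize * sizeof(int), cudaMemcpyHostToDevice);

    // Accumulate when forces are already present in the buffer (e.g. CPU
    // forces were copied in); otherwise set. Template chosen at launch time.
    bool accumulateForces = true;
    auto kernel = accumulateForces ? unpackKernel<true> : unpackKernel<false>;
    kernel<<<(mapSize + 63) / 64, 64>>>(d_data, d_packed, d_map, mapSize);
    cudaDeviceSynchronize();

    std::vector<float3> h_data(numAtoms);
    cudaMemcpy(h_data.data(), d_data, numAtoms * sizeof(float3), cudaMemcpyDeviceToHost);
    std::printf("atom 3 force: %g %g %g\n", h_data[3].x, h_data[3].y, h_data[3].z);

    cudaFree(d_data);
    cudaFree(d_packed);
    cudaFree(d_map);
    return 0;
}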