From: Alan Gray <alangray3@gmail.com>
Date: Wed, 6 Oct 2021 09:26:43 +0000 (+0000)
Subject: Rework GPU halo and state propagator streams and dependencies to get better overlap
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=ccd5bef5f2afb57b15efcdacd9b2b3fedbd5edef;p=alexxy%2Fgromacs.git

Rework GPU halo and state propagator streams and dependencies to get better overlap
---

diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp
index 8a634b7c07..f063f3b0e7 100644
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -3205,14 +3205,7 @@ void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
         for (int pulse = cr.dd->gpuHaloExchange[d].size(); pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
         {
             cr.dd->gpuHaloExchange[d].push_back(std::make_unique<gmx::GpuHaloExchange>(
-                    cr.dd,
-                    d,
-                    cr.mpi_comm_mygroup,
-                    deviceStreamManager.context(),
-                    deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
-                    deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal),
-                    pulse,
-                    wcycle));
+                    cr.dd, d, cr.mpi_comm_mygroup, deviceStreamManager.context(), pulse, wcycle));
         }
     }
 }
@@ -3230,26 +3223,31 @@ void reinitGpuHaloExchange(const t_commrec&              cr,
     }
 }
 
-void communicateGpuHaloCoordinates(const t_commrec&      cr,
-                                   const matrix          box,
-                                   GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+GpuEventSynchronizer* communicateGpuHaloCoordinates(const t_commrec&      cr,
+                                                    const matrix          box,
+                                                    GpuEventSynchronizer* dependencyEvent)
 {
+    GpuEventSynchronizer* eventPtr = dependencyEvent;
     for (int d = 0; d < cr.dd->ndim; d++)
     {
         for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
         {
-            cr.dd->gpuHaloExchange[d][pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
+            eventPtr = cr.dd->gpuHaloExchange[d][pulse]->communicateHaloCoordinates(box, eventPtr);
         }
     }
+    return eventPtr;
 }
 
-void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces)
+void communicateGpuHaloForces(const t_commrec&                                    cr,
+                              bool                                                accumulateForces,
+                              gmx::FixedCapacityVector<GpuEventSynchronizer*, 2>* dependencyEvents)
 {
     for (int d = cr.dd->ndim - 1; d >= 0; d--)
     {
         for (int pulse = cr.dd->comm->cd[d].numPulses() - 1; pulse >= 0; pulse--)
         {
-            cr.dd->gpuHaloExchange[d][pulse]->communicateHaloForces(accumulateForces);
+            cr.dd->gpuHaloExchange[d][pulse]->communicateHaloForces(accumulateForces, dependencyEvents);
+            dependencyEvents->push_back(cr.dd->gpuHaloExchange[d][pulse]->getForcesReadyOnDeviceEvent());
         }
     }
 }
diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h
index bb964de98e..c7d7483cbe 100644
--- a/src/gromacs/domdec/domdec.h
+++ b/src/gromacs/domdec/domdec.h
@@ -92,6 +92,8 @@ class RangePartitioning;
 class VirtualSitesHandler;
 template<typename>
 class ArrayRef;
+template<typename, size_t>
+class FixedCapacityVector;
 } // namespace gmx
 
 /*! \brief Returns the global topology atom number belonging to local atom index i.
@@ -265,20 +267,24 @@ void reinitGpuHaloExchange(const t_commrec&        cr,
 
 
 /*! \brief GPU halo exchange of coordinates buffer.
- * \param [in] cr                             The commrec object
- * \param [in] box                            Coordinate box (from which shifts will be constructed)
- * \param [in] coordinatesReadyOnDeviceEvent  event recorded when coordinates have been copied to device
+ * \param [in] cr                The commrec object
+ * \param [in] box               Coordinate box (from which shifts will be constructed)
+ * \param [in] dependencyEvent   Dependency event for this operation
+ * \returns                      Event recorded when this operation has been launched
  */
-void communicateGpuHaloCoordinates(const t_commrec&      cr,
-                                   const matrix          box,
-                                   GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
-
+GpuEventSynchronizer* communicateGpuHaloCoordinates(const t_commrec&      cr,
+                                                    const matrix          box,
+                                                    GpuEventSynchronizer* dependencyEvent);
 
-/*! \brief GPU halo exchange of force buffer.
- * \param [in] cr                The commrec object
+/*! \brief  Wait for copy of nonlocal part of coordinate array from GPU to CPU
+ * following coordinate halo exchange
+ * \param [in] cr   The commrec object
  * \param [in] accumulateForces  True if forces should accumulate, otherwise they are set
+ * \param [in] dependencyEvents  Dependency events for this operation
  */
-void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces);
+void communicateGpuHaloForces(const t_commrec&                                    cr,
+                              bool                                                accumulateForces,
+                              gmx::FixedCapacityVector<GpuEventSynchronizer*, 2>* dependencyEvents);
 
 /*! \brief Wraps the \c positions so that atoms from the same
  * update group share the same periodic image wrt \c box.
diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h
index adc8d66712..431407b6bc 100644
--- a/src/gromacs/domdec/gpuhaloexchange.h
+++ b/src/gromacs/domdec/gpuhaloexchange.h
@@ -46,7 +46,9 @@
 
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/math/vectypes.h"
+#include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/fixedcapacityvector.h"
 #include "gromacs/utility/gmxmpi.h"
 
 struct gmx_domdec_t;
@@ -66,27 +68,26 @@ class GpuHaloExchange
 public:
     /*! \brief Creates GPU Halo Exchange object.
      *
-     * Coordinate Halo exchange will be performed in \c
-     * StreamNonLocal, and the \c communicateHaloCoordinates
-     * method must be called before any subsequent operations that
-     * access non-local parts of the coordinate buffer (such as
-     * the non-local non-bonded kernels). It also must be called
-     * after the local coordinates buffer operations (where the
-     * coordinates are copied to the device and hence the \c
-     * coordinatesReadyOnDeviceEvent is recorded). Force Halo exchange
-     * will be performed in \c streamNonLocal and the \c
-     * communicateHaloForces method must be called after the
-     * non-local buffer operations, after the local force buffer
-     * has been copied to the GPU (if CPU forces are present), and
-     * before the local buffer operations. The force halo exchange
-     * does not yet support virial steps.
+     * Coordinate Halo exchange will be performed in its own stream
+     * with appropriate event-based synchronization, and the \c
+     * communicateHaloCoordinates method must be called before any
+     * subsequent operations that access non-local parts of the
+     * coordinate buffer (such as the non-local non-bonded
+     * kernels). It also must be called after the local coordinates
+     * buffer operations (where the coordinates are copied to the
+     * device and hence the \c coordinatesReadyOnDeviceEvent is
+     * recorded). Force Halo exchange will also be performed in its
+     * own stream with appropriate event-based synchronization, and
+     * the \c communicateHaloForces method must be called after the
+     * non-local buffer operations, after the local force buffer has
+     * been copied to the GPU (if CPU forces are present), and before
+     * the local buffer operations. The force halo exchange does not
+     * yet support virial steps.
      *
      * \param [inout] dd                       domdec structure
      * \param [in]    dimIndex                 the dimension index for this instance
      * \param [in]    mpi_comm_mysim           communicator used for simulation
      * \param [in]    deviceContext            GPU device context
-     * \param [in]    streamLocal              local NB CUDA stream.
-     * \param [in]    streamNonLocal           non-local NB CUDA stream.
      * \param [in]    pulse                    the communication pulse for this instance
      * \param [in]    wcycle                   The wallclock counter
      */
@@ -94,8 +95,6 @@ public:
                     int                  dimIndex,
                     MPI_Comm             mpi_comm_mysim,
                     const DeviceContext& deviceContext,
-                    const DeviceStream&  streamLocal,
-                    const DeviceStream&  streamNonLocal,
                     int                  pulse,
                     gmx_wallcycle*       wcycle);
     ~GpuHaloExchange();
@@ -116,15 +115,18 @@ public:
      * Must be called after local setCoordinates (which records an
      * event when the coordinate data has been copied to the
      * device).
-     * \param [in] box  Coordinate box (from which shifts will be constructed)
-     * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device
+     * \param [in] box               Coordinate box (from which shifts will be constructed)
+     * \param [in] dependencyEvent   Dependency event for this operation
+     * \returns                      Event recorded when this operation has been launched
      */
-    void communicateHaloCoordinates(const matrix box, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+    GpuEventSynchronizer* communicateHaloCoordinates(const matrix box, GpuEventSynchronizer* dependencyEvent);
 
     /*! \brief GPU halo exchange of force buffer.
      * \param[in] accumulateForces  True if forces should accumulate, otherwise they are set
+     * \param[in] dependencyEvents  Dependency events for this operation
      */
-    void communicateHaloForces(bool accumulateForces);
+    void communicateHaloForces(bool                                           accumulateForces,
+                               FixedCapacityVector<GpuEventSynchronizer*, 2>* dependencyEvents);
 
     /*! \brief Get the event synchronizer for the forces ready on device.
      *  \returns  The event to synchronize the stream that consumes forces on device.
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
index e00d4e3d60..5a839b27bf 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -67,8 +67,6 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */,
                                  int /* dimIndex */,
                                  MPI_Comm /* mpi_comm_mysim */,
                                  const DeviceContext& /* deviceContext */,
-                                 const DeviceStream& /*streamLocal */,
-                                 const DeviceStream& /*streamNonLocal */,
                                  int /*pulse */,
                                  gmx_wallcycle* /*wcycle*/) :
     impl_(nullptr)
@@ -96,16 +94,18 @@ void GpuHaloExchange::reinitHalo(DeviceBuffer<RVec> /* d_coordinatesBuffer */,
 }
 
 /*!\brief apply X halo exchange stub. */
-void GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */,
-                                                 GpuEventSynchronizer* /*coordinatesOnDeviceEvent*/)
+GpuEventSynchronizer* GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */,
+                                                                  GpuEventSynchronizer* /*dependencyEvent*/)
 {
     GMX_ASSERT(!impl_,
                "A CPU stub for GPU Halo Exchange exchange was called insted of the correct "
                "implementation.");
+    return nullptr;
 }
 
 /*!\brief apply F halo exchange stub. */
-void GpuHaloExchange::communicateHaloForces(bool gmx_unused accumulateForces)
+void GpuHaloExchange::communicateHaloForces(bool /* accumulateForces */,
+                                            FixedCapacityVector<GpuEventSynchronizer*, 2>* /*dependencyEvents*/)
 {
     GMX_ASSERT(!impl_,
                "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu
index 5796b96980..b0dab24e8a 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cu
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu
@@ -200,7 +200,7 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo
         std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin());
 
         copyToDeviceBuffer(
-                &d_indexMap_, h_indexMap_.data(), 0, newSize, nonLocalStream_, GpuApiCallBehavior::Async, nullptr);
+                &d_indexMap_, h_indexMap_.data(), 0, newSize, *haloStream_, GpuApiCallBehavior::Async, nullptr);
     }
 
 #if GMX_MPI
@@ -270,19 +270,16 @@ void GpuHaloExchange::Impl::enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynch
                  0,
                  mpi_comm_mysim_,
                  MPI_STATUS_IGNORE);
-    remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
+    remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(*haloStream_);
 }
 
-void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box,
-                                                       GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+GpuEventSynchronizer* GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box,
+                                                                        GpuEventSynchronizer* dependencyEvent)
 {
-
     wallcycle_start(wcycle_, WallCycleCounter::LaunchGpu);
-    if (pulse_ == 0)
-    {
-        // ensure stream waits until coordinate data is available on device
-        coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
-    }
+
+    // ensure stream waits until dependency has been satisfied
+    dependencyEvent->enqueueWaitEvent(*haloStream_);
 
     wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuMoveX);
 
@@ -318,8 +315,7 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box
         const auto kernelArgs = prepareGpuKernelArguments(
                 kernelFn, config, &sendBuf, &d_x, &indexMap, &size, &coordinateShift);
 
-        launchGpuKernel(
-                kernelFn, config, nonLocalStream_, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs);
+        launchGpuKernel(kernelFn, config, *haloStream_, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs);
     }
 
     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuMoveX);
@@ -331,28 +327,41 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box
 
     // wait for remote co-ordinates is implicit with process-MPI as non-local stream is synchronized before MPI calls
     // and MPI_Waitall call makes sure both neighboring ranks' non-local stream is synchronized before data transfer is initiated
-    if (GMX_THREAD_MPI && pulse_ == 0)
+    if (GMX_THREAD_MPI && dimIndex_ == 0 && pulse_ == 0)
     {
-        enqueueWaitRemoteCoordinatesReadyEvent(coordinatesReadyOnDeviceEvent);
+        enqueueWaitRemoteCoordinatesReadyEvent(dependencyEvent);
     }
 
     float3* recvPtr = GMX_THREAD_MPI ? remoteXPtr_ : &d_x_[atomOffset_];
     communicateHaloData(d_sendBuf_, xSendSize_, sendRankX_, recvPtr, xRecvSize_, recvRankX_);
 
+    coordinateHaloLaunched_.markEvent(*haloStream_);
+
     wallcycle_stop(wcycle_, WallCycleCounter::MoveX);
+
+    return &coordinateHaloLaunched_;
 }
 
 // The following method should be called after non-local buffer operations,
-// and before the local buffer operations. It operates in the non-local stream.
-void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
+// and before the local buffer operations.
+void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces,
+                                                  FixedCapacityVector<GpuEventSynchronizer*, 2>* dependencyEvents)
 {
+
     // Consider time spent in communicateHaloData as Comm.F counter
     // ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync
     wallcycle_start(wcycle_, WallCycleCounter::MoveF);
 
+    while (dependencyEvents->size() > 0)
+    {
+        auto dependency = dependencyEvents->back();
+        dependency->enqueueWaitEvent(*haloStream_);
+        dependencyEvents->pop_back();
+    }
+
     float3* recvPtr = GMX_THREAD_MPI ? remoteFPtr_ : d_recvBuf_;
 
-    // Communicate halo data (in non-local stream)
+    // Communicate halo data
     communicateHaloData(&(d_f_[atomOffset_]), fSendSize_, sendRankF_, recvPtr, fRecvSize_, recvRankF_);
 
     wallcycle_stop(wcycle_, WallCycleCounter::MoveF);
@@ -361,19 +370,6 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
     wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuMoveF);
 
     float3* d_f = d_f_;
-    // If this is the last pulse and index (noting the force halo
-    // exchanges across multiple pulses and indices are called in
-    // reverse order) then perform the following preparation
-    // activities
-    if ((pulse_ == (dd_->comm->cd[dimIndex_].numPulses() - 1)) && (dimIndex_ == (dd_->ndim - 1)))
-    {
-        // ensure non-local stream waits for local stream, due to dependence on
-        // the previous H2D copy of CPU forces (if accumulateForces is true)
-        // or local force clearing.
-        GpuEventSynchronizer eventLocal;
-        eventLocal.markEvent(localStream_);
-        eventLocal.enqueueWaitEvent(nonLocalStream_);
-    }
 
     // Unpack halo buffer into force array
 
@@ -405,14 +401,10 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
         const auto kernelArgs =
                 prepareGpuKernelArguments(kernelFn, config, &d_f, &recvBuf, &indexMap, &size);
 
-        launchGpuKernel(
-                kernelFn, config, nonLocalStream_, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
+        launchGpuKernel(kernelFn, config, *haloStream_, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
     }
 
-    if (pulse_ == 0)
-    {
-        fReadyOnDevice_.markEvent(nonLocalStream_);
-    }
+    fReadyOnDevice_.markEvent(*haloStream_);
 
     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuMoveF);
     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
@@ -447,12 +439,12 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaMPI(float3* sendPtr,
     // no need to wait for haloDataReadyOnDevice event if this rank is not sending any data
     if (sendSize > 0)
     {
-        // wait for non local stream to complete all outstanding
+        // wait for halo stream to complete all outstanding
         // activities, to ensure that buffer is up-to-date in GPU memory
         // before transferring to remote rank
 
         // ToDo: Replace stream synchronize with event synchronize
-        nonLocalStream_.synchronize();
+        haloStream_->synchronize();
     }
 
     // perform halo exchange directly in device buffers
@@ -491,7 +483,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(float3* sendPtr,
                                sendPtr,
                                sendSize * DIM * sizeof(float),
                                cudaMemcpyDeviceToDevice,
-                               nonLocalStream_.stream());
+                               haloStream_->stream());
 
         CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
     }
@@ -506,7 +498,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(float3* sendPtr,
     GMX_ASSERT(haloDataTransferLaunched_ != nullptr,
                "Halo exchange requires valid event to synchronize data transfer initiated in "
                "remote rank");
-    haloDataTransferLaunched_->markEvent(nonLocalStream_);
+    haloDataTransferLaunched_->markEvent(*haloStream_);
 
     MPI_Sendrecv(&haloDataTransferLaunched_,
                  sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression)
@@ -521,7 +513,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(float3* sendPtr,
                  mpi_comm_mysim_,
                  MPI_STATUS_IGNORE);
 
-    haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_);
+    haloDataTransferRemote->enqueueWaitEvent(*haloStream_);
 #else
     GMX_UNUSED_VALUE(sendRank);
     GMX_UNUSED_VALUE(recvRank);
@@ -538,8 +530,6 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t*        dd,
                             int                  dimIndex,
                             MPI_Comm             mpi_comm_mysim,
                             const DeviceContext& deviceContext,
-                            const DeviceStream&  localStream,
-                            const DeviceStream&  nonLocalStream,
                             int                  pulse,
                             gmx_wallcycle*       wcycle) :
     dd_(dd),
@@ -551,8 +541,7 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t*        dd,
     haloDataTransferLaunched_(GMX_THREAD_MPI ? new GpuEventSynchronizer() : nullptr),
     mpi_comm_mysim_(mpi_comm_mysim),
     deviceContext_(deviceContext),
-    localStream_(localStream),
-    nonLocalStream_(nonLocalStream),
+    haloStream_(new DeviceStream(deviceContext, DeviceStreamPriority::High, false)),
     dimIndex_(dimIndex),
     pulse_(pulse),
     wcycle_(wcycle)
@@ -580,11 +569,9 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t*        dd,
                                  int                  dimIndex,
                                  MPI_Comm             mpi_comm_mysim,
                                  const DeviceContext& deviceContext,
-                                 const DeviceStream&  localStream,
-                                 const DeviceStream&  nonLocalStream,
                                  int                  pulse,
                                  gmx_wallcycle*       wcycle) :
-    impl_(new Impl(dd, dimIndex, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse, wcycle))
+    impl_(new Impl(dd, dimIndex, mpi_comm_mysim, deviceContext, pulse, wcycle))
 {
 }
 
@@ -603,15 +590,16 @@ void GpuHaloExchange::reinitHalo(DeviceBuffer<RVec> d_coordinatesBuffer, DeviceB
     impl_->reinitHalo(asFloat3(d_coordinatesBuffer), asFloat3(d_forcesBuffer));
 }
 
-void GpuHaloExchange::communicateHaloCoordinates(const matrix          box,
-                                                 GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+GpuEventSynchronizer* GpuHaloExchange::communicateHaloCoordinates(const matrix          box,
+                                                                  GpuEventSynchronizer* dependencyEvent)
 {
-    impl_->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
+    return impl_->communicateHaloCoordinates(box, dependencyEvent);
 }
 
-void GpuHaloExchange::communicateHaloForces(bool accumulateForces)
+void GpuHaloExchange::communicateHaloForces(bool accumulateForces,
+                                            FixedCapacityVector<GpuEventSynchronizer*, 2>* dependencyEvents)
 {
-    impl_->communicateHaloForces(accumulateForces);
+    impl_->communicateHaloForces(accumulateForces, dependencyEvents);
 }
 
 GpuEventSynchronizer* GpuHaloExchange::getForcesReadyOnDeviceEvent()
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh
index 6190a56b95..c29834616a 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh
@@ -75,8 +75,6 @@ public:
      * \param [in]    dimIndex                 the dimension index for this instance
      * \param [in]    mpi_comm_mysim           communicator used for simulation
      * \param [in]    deviceContext            GPU device context
-     * \param [in]    localStream              local NB CUDA stream
-     * \param [in]    nonLocalStream           non-local NB CUDA stream
      * \param [in]    pulse                    the communication pulse for this instance
      * \param [in]    wcycle                   The wallclock counter
      */
@@ -84,8 +82,6 @@ public:
          int                  dimIndex,
          MPI_Comm             mpi_comm_mysim,
          const DeviceContext& deviceContext,
-         const DeviceStream&  localStream,
-         const DeviceStream&  nonLocalStream,
          int                  pulse,
          gmx_wallcycle*       wcycle);
     ~Impl();
@@ -101,14 +97,17 @@ public:
     /*! \brief
      * GPU halo exchange of coordinates buffer
      * \param [in] box  Coordinate box (from which shifts will be constructed)
-     * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device
+     * \param [in] dependencyEvent   Dependency event for this operation
+     * \returns                      Event recorded when this operation has been launched
      */
-    void communicateHaloCoordinates(const matrix box, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+    GpuEventSynchronizer* communicateHaloCoordinates(const matrix box, GpuEventSynchronizer* dependencyEvent);
 
     /*! \brief  GPU halo exchange of force buffer
-     * \param[in] accumulateForces  True if forces should accumulate, otherwise they are set
+     * \param [in] accumulateForces  True if forces should accumulate, otherwise they are set
+     * \param [in] dependencyEvents  Dependency events for this operation
      */
-    void communicateHaloForces(bool accumulateForces);
+    void communicateHaloForces(bool                                           accumulateForces,
+                               FixedCapacityVector<GpuEventSynchronizer*, 2>* dependencyEvents);
 
     /*! \brief Get the event synchronizer for the forces ready on device.
      *  \returns  The event to synchronize the stream that consumes forces on device.
@@ -150,8 +149,8 @@ private:
                                         int     recvSize,
                                         int     recvRank);
 
-    /*! \brief Exchange coordinate-ready event with neighbor ranks and enqueue wait in non-local
-     * stream \param [in] eventSync    event recorded when coordinates/forces are ready to device
+    /*! \brief Exchange coordinate-ready event with neighbor ranks and enqueue wait in halo stream
+     * \param [in] eventSync    event recorded when coordinates/forces are ready to device
      */
     void enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
 
@@ -211,10 +210,8 @@ private:
     MPI_Comm mpi_comm_mysim_;
     //! GPU context object
     const DeviceContext& deviceContext_;
-    //! CUDA stream for local non-bonded calculations
-    const DeviceStream& localStream_;
-    //! CUDA stream for non-local non-bonded calculations
-    const DeviceStream& nonLocalStream_;
+    //! CUDA stream for this halo exchange
+    DeviceStream* haloStream_;
     //! full coordinates buffer in GPU memory
     float3* d_x_ = nullptr;
     //! full forces buffer in GPU memory
@@ -229,6 +226,8 @@ private:
     gmx_wallcycle* wcycle_ = nullptr;
     //! The atom offset for receive (x) or send (f) for dimension index and pulse corresponding to this halo exchange instance
     int atomOffset_ = 0;
+    //! Event triggered when coordinate halo has been launched
+    GpuEventSynchronizer coordinateHaloLaunched_;
 };
 
 } // namespace gmx
diff --git a/src/gromacs/domdec/tests/haloexchange_mpi.cpp b/src/gromacs/domdec/tests/haloexchange_mpi.cpp
index 1f1138fb36..63659998ef 100644
--- a/src/gromacs/domdec/tests/haloexchange_mpi.cpp
+++ b/src/gromacs/domdec/tests/haloexchange_mpi.cpp
@@ -150,8 +150,8 @@ void gpuHalo(gmx_domdec_t* dd, matrix box, HostVector<RVec>* h_x, int numAtomsTo
     {
         for (int pulse = 0; pulse < dd->comm->cd[d].numPulses(); pulse++)
         {
-            gpuHaloExchange[d].push_back(GpuHaloExchange(
-                    dd, d, MPI_COMM_WORLD, deviceContext, deviceStream, deviceStream, pulse, nullptr));
+            gpuHaloExchange[d].push_back(
+                    GpuHaloExchange(dd, d, MPI_COMM_WORLD, deviceContext, pulse, nullptr));
         }
     }
 
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index ee3e8cdd46..241cc3b9fc 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1132,12 +1132,13 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
     const bool accumulate = runScheduleWork->domainWork.haveCpuLocalForceWork
                             || runScheduleWork->simulationWork.havePpDomainDecomposition;
     const int atomStart = 0;
-    fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(stateGpu->getForces(),
-                                                            nbv->getNumAtoms(AtomLocality::Local),
-                                                            nbv->getGridIndices(),
-                                                            atomStart,
-                                                            accumulate,
-                                                            stateGpu->fReducedOnDevice());
+    fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(
+            stateGpu->getForces(),
+            nbv->getNumAtoms(AtomLocality::Local),
+            nbv->getGridIndices(),
+            atomStart,
+            accumulate,
+            stateGpu->fReducedOnDevice(AtomLocality::Local));
 
     // register forces and add dependencies
     fr->gpuForceReduction[gmx::AtomLocality::Local]->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv));
@@ -1174,15 +1175,12 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
         }
     }
 
-    if (runScheduleWork->domainWork.haveCpuLocalForceWork && !runScheduleWork->simulationWork.useGpuHaloExchange)
+    if (runScheduleWork->domainWork.haveCpuLocalForceWork
+        || (runScheduleWork->simulationWork.havePpDomainDecomposition
+            && !runScheduleWork->simulationWork.useGpuHaloExchange))
     {
-        // in the DD case we use the same stream for H2D and reduction, hence no explicit dependency needed
-        if (!runScheduleWork->simulationWork.havePpDomainDecomposition)
-        {
-            const bool useGpuForceBufferOps = true;
-            fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
-                    stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::All, useGpuForceBufferOps));
-        }
+        fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
+                stateGpu->fReadyOnDevice(AtomLocality::Local));
     }
 
     if (runScheduleWork->simulationWork.useGpuHaloExchange)
@@ -1197,16 +1195,23 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
         const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
                                 || runScheduleWork->domainWork.haveFreeEnergyWork;
         const int atomStart = dd_numHomeAtoms(*cr->dd);
-        fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit(stateGpu->getForces(),
-                                                                   nbv->getNumAtoms(AtomLocality::NonLocal),
-                                                                   nbv->getGridIndices(),
-                                                                   atomStart,
-                                                                   accumulate);
+        fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit(
+                stateGpu->getForces(),
+                nbv->getNumAtoms(AtomLocality::NonLocal),
+                nbv->getGridIndices(),
+                atomStart,
+                accumulate,
+                stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
 
         // register forces and add dependencies
-        // in the DD case we use the same stream for H2D and reduction, hence no explicit dependency needed
         fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->registerNbnxmForce(
                 Nbnxm::gpu_get_f(nbv->gpu_nbv));
+
+        if (runScheduleWork->domainWork.haveNonLocalForceContribInCpuBuffer)
+        {
+            fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->addDependency(
+                    stateGpu->fReadyOnDevice(AtomLocality::NonLocal));
+        }
     }
 }
 
@@ -1266,6 +1271,17 @@ void do_force(FILE*                               fplog,
     runScheduleWork->stepWork = setupStepWorkload(legacyFlags, inputrec.mtsLevels, step, simulationWork);
     const StepWorkload& stepWork = runScheduleWork->stepWork;
 
+    if (stepWork.useGpuFHalo && !runScheduleWork->domainWork.haveCpuLocalForceWork)
+    {
+        // GPU Force halo exchange will set a subset of local atoms with remote non-local data
+        // First clear local portion of force array, so that untouched atoms are zero.
+        // The dependency for this is that forces from previous timestep have been consumed,
+        // which is satisfied when getCoordinatesReadyOnDeviceEvent has been marked.
+        stateGpu->clearForcesOnGpu(AtomLocality::Local,
+                                   stateGpu->getCoordinatesReadyOnDeviceEvent(
+                                           AtomLocality::Local, simulationWork, stepWork));
+    }
+
     /* At a search step we need to start the first balancing region
      * somewhere early inside the step after communication during domain
      * decomposition (and not during the previous step as usual).
@@ -1579,16 +1595,18 @@ void do_force(FILE*                               fplog,
         }
         else
         {
+            GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr;
             if (stepWork.useGpuXHalo)
             {
                 // The following must be called after local setCoordinates (which records an event
                 // when the coordinate data has been copied to the device).
-                communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
+                gpuCoordinateHaloLaunched = communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
 
                 if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
                 {
                     // non-local part of coordinate buffer must be copied back to host for CPU work
-                    stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
+                    stateGpu->copyCoordinatesFromGpu(
+                            x.unpaddedArrayRef(), AtomLocality::NonLocal, gpuCoordinateHaloLaunched);
                 }
             }
             else
@@ -1608,10 +1626,11 @@ void do_force(FILE*                               fplog,
                 {
                     stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
                 }
-                nbv->convertCoordinatesGpu(AtomLocality::NonLocal,
-                                           stateGpu->getCoordinates(),
-                                           stateGpu->getCoordinatesReadyOnDeviceEvent(
-                                                   AtomLocality::NonLocal, simulationWork, stepWork));
+                nbv->convertCoordinatesGpu(
+                        AtomLocality::NonLocal,
+                        stateGpu->getCoordinates(),
+                        stateGpu->getCoordinatesReadyOnDeviceEvent(
+                                AtomLocality::NonLocal, simulationWork, stepWork, gpuCoordinateHaloLaunched));
             }
             else
             {
@@ -2085,13 +2104,11 @@ void do_force(FILE*                               fplog,
             {
                 // If there exist CPU forces, data from halo exchange should accumulate into these
                 bool accumulateForces = domainWork.haveCpuLocalForceWork;
-                if (!accumulateForces)
-                {
-                    // Force halo exchange will set a subset of local atoms with remote non-local data
-                    // First clear local portion of force array, so that untouched atoms are zero
-                    stateGpu->clearForcesOnGpu(AtomLocality::Local);
-                }
-                communicateGpuHaloForces(*cr, accumulateForces);
+                gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> gpuForceHaloDependencies;
+                gpuForceHaloDependencies.push_back(stateGpu->fReadyOnDevice(AtomLocality::Local));
+                gpuForceHaloDependencies.push_back(stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
+
+                communicateGpuHaloForces(*cr, accumulateForces, &gpuForceHaloDependencies);
             }
             else
             {
@@ -2232,15 +2249,7 @@ void do_force(FILE*                               fplog,
             //   These should be unified.
             if (domainWork.haveLocalForceContribInCpuBuffer && !stepWork.useGpuFHalo)
             {
-                // Note: AtomLocality::All is used for the non-DD case because, as in this
-                // case copyForcesToGpu() uses a separate stream, it allows overlap of
-                // CPU force H2D with GPU force tasks on all streams including those in the
-                // local stream which would otherwise be implicit dependencies for the
-                // transfer and would not overlap.
-                auto locality = simulationWork.havePpDomainDecomposition ? AtomLocality::Local
-                                                                         : AtomLocality::All;
-
-                stateGpu->copyForcesToGpu(forceWithShift, locality);
+                stateGpu->copyForcesToGpu(forceWithShift, AtomLocality::Local);
             }
 
             if (stepWork.computeNonbondedForces)
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp
index 970bcd6781..85c29398e0 100644
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -1522,18 +1522,17 @@ void gmx::LegacySimulator::do_md()
                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
 
                 // This applies Leap-Frog, LINCS and SETTLE in succession
-                integrator->integrate(
-                        stateGpu->getForcesReadyOnDeviceEvent(
-                                AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
-                        ir->delta_t,
-                        true,
-                        bCalcVir,
-                        shake_vir,
-                        doTemperatureScaling,
-                        ekind->tcstat,
-                        doParrinelloRahman,
-                        ir->nstpcouple * ir->delta_t,
-                        M);
+                integrator->integrate(stateGpu->getLocalForcesReadyOnDeviceEvent(
+                                              runScheduleWork->stepWork, runScheduleWork->simulationWork),
+                                      ir->delta_t,
+                                      true,
+                                      bCalcVir,
+                                      shake_vir,
+                                      doTemperatureScaling,
+                                      ekind->tcstat,
+                                      doParrinelloRahman,
+                                      ir->nstpcouple * ir->delta_t,
+                                      M);
 
                 // Copy velocities D2H after update if:
                 // - Globals are computed this step (includes the energy output steps).
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h
index 9e8dc576a6..ad07df2c13 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h
@@ -188,15 +188,17 @@ public:
      * steps and if update is not offloaded, the coordinates are provided by the H2D copy and the
      * returned synchronizer indicates that the copy is complete.
      *
-     *  \param[in] atomLocality    Locality of the particles to wait for.
-     *  \param[in] simulationWork  The simulation lifetime flags.
-     *  \param[in] stepWork        The step lifetime flags.
+     *  \param[in] atomLocality              Locality of the particles to wait for.
+     *  \param[in] simulationWork            The simulation lifetime flags.
+     *  \param[in] stepWork                  The step lifetime flags.
+     *  \param[in] gpuCoordinateHaloLaunched Event recorded when GPU coordinate halo has been launched.
      *
      *  \returns  The event to synchronize the stream that consumes coordinates on device.
      */
     GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
                                                            const SimulationWorkload& simulationWork,
-                                                           const StepWorkload&       stepWork);
+                                                           const StepWorkload&       stepWork,
+                                                           GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr);
 
     /*! \brief Blocking wait until coordinates are copied to the device.
      *
@@ -212,12 +214,15 @@ public:
      */
     void setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent);
 
-    /*! \brief Copy positions from the GPU memory.
+    /*! \brief Copy positions from the GPU memory, with an optional explicit dependency.
      *
      *  \param[in] h_x           Positions buffer in the host memory.
      *  \param[in] atomLocality  Locality of the particles to copy.
+     *  \param[in] dependency    Dependency event for this operation.
      */
-    void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality);
+    void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
+                                AtomLocality             atomLocality,
+                                GpuEventSynchronizer*    dependency = nullptr);
 
     /*! \brief Wait until coordinates are available on the host.
      *
@@ -271,8 +276,9 @@ public:
     /*! \brief Clear forces in the GPU memory.
      *
      *  \param[in] atomLocality  Locality of the particles to clear.
+     *  \param[in] dependency    Dependency event for this operation.
      */
-    void clearForcesOnGpu(AtomLocality atomLocality);
+    void clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency);
 
     /*! \brief Get the event synchronizer for the forces ready on device.
      *
@@ -281,20 +287,27 @@ public:
      *  1. The forces are copied to the device (when GPU buffer ops are off)
      *  2. The forces are reduced on the device (GPU buffer ops are on)
      *
-     *  \todo Pass step workload instead of the useGpuFBufferOps boolean.
-     *
-     *  \param[in] atomLocality      Locality of the particles to wait for.
-     *  \param[in] useGpuFBufferOps  If the force buffer ops are offloaded to the GPU.
+     *  \param[in] stepWork        Step workload flags
+     *  \param[in] simulationWork  Simulation workload flags
      *
      *  \returns  The event to synchronize the stream that consumes forces on device.
      */
-    GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality atomLocality, bool useGpuFBufferOps);
+    GpuEventSynchronizer* getLocalForcesReadyOnDeviceEvent(StepWorkload       stepWork,
+                                                           SimulationWorkload simulationWork);
 
     /*! \brief Getter for the event synchronizer for the forces are reduced on the GPU.
      *
-     *  \returns  The event to mark when forces are reduced on the GPU.
+     *  \param[in] atomLocality      Locality of the particles to wait for.
+     *  \returns                     The event to mark when forces are reduced on the GPU.
+     */
+    GpuEventSynchronizer* fReducedOnDevice(AtomLocality atomLocality);
+
+    /*! \brief Getter for the event synchronizer for the forces are ready on the GPU.
+     *
+     *  \param[in] atomLocality      Locality of the particles to wait for.
+     *  \returns                     The event to mark when forces are ready on the GPU.
      */
-    GpuEventSynchronizer* fReducedOnDevice();
+    GpuEventSynchronizer* fReadyOnDevice(AtomLocality atomLocality);
 
     /*! \brief Copy forces from the GPU memory.
      *
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
index d4fc488720..618856d0ce 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
@@ -103,7 +103,8 @@ DeviceBuffer<RVec> StatePropagatorDataGpu::getCoordinates()
 GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(
         AtomLocality /* atomLocality */,
         const SimulationWorkload& /* simulationWork */,
-        const StepWorkload& /* stepWork       */)
+        const StepWorkload& /* stepWork       */,
+        GpuEventSynchronizer* /* gpuCoordinateHaloLaunched */)
 {
     GMX_ASSERT(!impl_,
                "A CPU stub method from GPU state propagator data was called instead of one from "
@@ -141,14 +142,14 @@ void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality /* atomLoca
 }
 
 void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> /* h_x          */,
-                                                    AtomLocality /* atomLocality */)
+                                                    AtomLocality /* atomLocality */,
+                                                    GpuEventSynchronizer* /*dependency */)
 {
     GMX_ASSERT(!impl_,
                "A CPU stub method from GPU state propagator data was called instead of one from "
                "GPU implementation.");
 }
 
-
 DeviceBuffer<RVec> StatePropagatorDataGpu::getVelocities()
 {
     GMX_ASSERT(!impl_,
@@ -197,15 +198,24 @@ void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec
                "GPU implementation.");
 }
 
-void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality /* atomLocality */)
+void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality /* atomLocality */,
+                                              GpuEventSynchronizer* /* dependency */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub method from GPU state propagator data was called instead of one from "
+               "GPU implementation.");
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::getLocalForcesReadyOnDeviceEvent(StepWorkload /* stepWork */,
+                                                                               SimulationWorkload /* simulationWork */)
 {
     GMX_ASSERT(!impl_,
                "A CPU stub method from GPU state propagator data was called instead of one from "
                "GPU implementation.");
+    return nullptr;
 }
 
-GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality /* atomLocality */,
-                                                                          bool /* useGpuFBufferOps */)
+GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice(AtomLocality /*atomLocality*/)
 {
     GMX_ASSERT(!impl_,
                "A CPU stub method from GPU state propagator data was called instead of one from "
@@ -213,7 +223,7 @@ GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLo
     return nullptr;
 }
 
-GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice()
+GpuEventSynchronizer* StatePropagatorDataGpu::fReadyOnDevice(AtomLocality /*atomLocality*/)
 {
     GMX_ASSERT(!impl_,
                "A CPU stub method from GPU state propagator data was called instead of one from "
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
index 4353eecf4e..dbe435d508 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
@@ -175,12 +175,14 @@ public:
      *  \param[in] atomLocality    Locality of the particles to wait for.
      *  \param[in] simulationWork  The simulation lifetime flags.
      *  \param[in] stepWork        The step lifetime flags.
+     *  \param[in] gpuCoordinateHaloLaunched Event recorded when GPU coordinate halo has been launched.
      *
      *  \returns  The event to synchronize the stream that consumes coordinates on device.
      */
     GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
                                                            const SimulationWorkload& simulationWork,
-                                                           const StepWorkload&       stepWork);
+                                                           const StepWorkload&       stepWork,
+                                                           GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr);
 
     /*! \brief Blocking wait until coordinates are copied to the device.
      *
@@ -196,12 +198,15 @@ public:
      */
     void setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent);
 
-    /*! \brief Copy positions from the GPU memory.
+    /*! \brief Copy positions from the GPU memory, with an optional explicit dependency.
      *
      *  \param[in] h_x           Positions buffer in the host memory.
      *  \param[in] atomLocality  Locality of the particles to copy.
+     *  \param[in] dependency    Dependency event for this operation.
      */
-    void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality);
+    void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
+                                AtomLocality             atomLocality,
+                                GpuEventSynchronizer*    dependency = nullptr);
 
     /*! \brief Wait until coordinates are available on the host.
      *
@@ -253,8 +258,9 @@ public:
     /*! \brief Clear forces in the GPU memory.
      *
      *  \param[in] atomLocality  Locality of the particles to clear.
+     *  \param[in] dependency    Dependency event for this operation.
      */
-    void clearForcesOnGpu(AtomLocality atomLocality);
+    void clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency);
 
     /*! \brief Get the event synchronizer for the forces ready on device.
      *
@@ -263,20 +269,27 @@ public:
      *  1. The forces are copied to the device (when GPU buffer ops are off)
      *  2. The forces are reduced on the device (GPU buffer ops are on)
      *
-     *  \todo Pass step workload instead of the useGpuFBufferOps boolean.
-     *
-     *  \param[in] atomLocality      Locality of the particles to wait for.
-     *  \param[in] useGpuFBufferOps  If the force buffer ops are offloaded to the GPU.
+     *  \param[in] stepWork        Step workload flags
+     *  \param[in] simulationWork  Simulation workload flags
      *
      *  \returns  The event to synchronize the stream that consumes forces on device.
      */
-    GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality atomLocality, bool useGpuFBufferOps);
+    GpuEventSynchronizer* getLocalForcesReadyOnDeviceEvent(StepWorkload       stepWork,
+                                                           SimulationWorkload simulationWork);
 
-    /*! \brief Getter for the event synchronizer for the forces are reduced on the GPU.
+    /*! \brief Getter for the event synchronizer for when forces are reduced on the GPU.
      *
-     *  \returns  The event to mark when forces are reduced on the GPU.
+     *  \param[in] atomLocality      Locality of the particles to wait for.
+     *  \returns                     The event to mark when forces are reduced on the GPU.
+     */
+    GpuEventSynchronizer* fReducedOnDevice(AtomLocality atomLocality);
+
+    /*! \brief Getter for the event synchronizer for the forces are ready for GPU update.
+     *
+     *  \param[in] atomLocality      Locality of the particles to wait for.
+     *  \returns                     The event to mark when forces are ready for GPU update.
      */
-    GpuEventSynchronizer* fReducedOnDevice();
+    GpuEventSynchronizer* fReadyOnDevice(AtomLocality atomLocality);
 
     /*! \brief Copy forces from the GPU memory.
      *
@@ -327,6 +340,9 @@ private:
     EnumerationArray<AtomLocality, const DeviceStream*> vCopyStreams_ = { { nullptr } };
     // Streams to use for forces H2D and D2H copies (one event for each atom locality)
     EnumerationArray<AtomLocality, const DeviceStream*> fCopyStreams_ = { { nullptr } };
+    // Streams internal to this module
+    std::unique_ptr<DeviceStream> copyInStream_;
+    std::unique_ptr<DeviceStream> memsetStream_;
 
     /*! \brief An array of events that indicate H2D copy is complete (one event for each atom locality)
      *
@@ -343,8 +359,8 @@ private:
 
     //! An array of events that indicate H2D copy of forces is complete (one event for each atom locality)
     EnumerationArray<AtomLocality, GpuEventSynchronizer> fReadyOnDevice_;
-    //! An event that the forces were reduced on the GPU
-    GpuEventSynchronizer fReducedOnDevice_;
+    //! An array of events that indicate the forces were reduced on the GPU (one event for each atom locality)
+    EnumerationArray<AtomLocality, GpuEventSynchronizer> fReducedOnDevice_;
     //! An array of events that indicate D2H copy of forces is complete (one event for each atom locality)
     EnumerationArray<AtomLocality, GpuEventSynchronizer> fReadyOnHost_;
 
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
index aa63099ffa..30478ef046 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -95,6 +95,9 @@ StatePropagatorDataGpu::Impl::Impl(const DeviceStreamManager& deviceStreamManage
     fCopyStreams_[AtomLocality::Local]    = localStream_;
     fCopyStreams_[AtomLocality::NonLocal] = nonLocalStream_;
     fCopyStreams_[AtomLocality::All]      = updateStream_;
+
+    copyInStream_ = std::make_unique<DeviceStream>(deviceContext_, DeviceStreamPriority::Normal, false);
+    memsetStream_ = std::make_unique<DeviceStream>(deviceContext_, DeviceStreamPriority::Normal, false);
 }
 
 StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
@@ -340,23 +343,28 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<cons
     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
-GpuEventSynchronizer*
-StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality,
-                                                               const SimulationWorkload& simulationWork,
-                                                               const StepWorkload&       stepWork)
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceEvent(
+        AtomLocality              atomLocality,
+        const SimulationWorkload& simulationWork,
+        const StepWorkload&       stepWork,
+        GpuEventSynchronizer*     gpuCoordinateHaloLaunched)
 {
     // The provider of the coordinates may be different for local atoms. If the update is offloaded
     // and this is not a neighbor search step, then the consumer needs to wait for the update
     // to complete. Otherwise, the coordinates are copied from the host and we need to wait for
-    // the copy event. Non-local coordinates are always provided by the H2D copy.
-    //
-    // TODO: This should be reconsidered to support the halo exchange.
+    // the copy event. Non-local coordinates are provided by the GPU halo exchange (if active), otherwise by H2D copy.
     //
     // In OpenCL no events are used as coordinate sync is not necessary
     if (GMX_GPU_OPENCL)
     {
         return nullptr;
     }
+    if (atomLocality == AtomLocality::NonLocal && stepWork.useGpuXHalo)
+    {
+        GMX_ASSERT(gpuCoordinateHaloLaunched != nullptr,
+                   "GPU halo exchange is active but its completion event is null.");
+        return gpuCoordinateHaloLaunched;
+    }
     if (atomLocality == AtomLocality::Local && simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
     {
         GMX_ASSERT(xUpdatedOnDeviceEvent_ != nullptr, "The event synchronizer can not be nullptr.");
@@ -382,7 +390,9 @@ void StatePropagatorDataGpu::Impl::setXUpdatedOnDeviceEvent(GpuEventSynchronizer
     xUpdatedOnDeviceEvent_ = xUpdatedOnDeviceEvent;
 }
 
-void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality)
+void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
+                                                          AtomLocality             atomLocality,
+                                                          GpuEventSynchronizer*    dependency)
 {
     GMX_ASSERT(atomLocality < AtomLocality::All,
                formatString("Wrong atom locality. Only Local and NonLocal are allowed for "
@@ -393,6 +403,11 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVe
     GMX_ASSERT(deviceStream != nullptr,
                "No stream is valid for copying positions with given atom locality.");
 
+    if (dependency != nullptr)
+    {
+        dependency->enqueueWaitEvent(*deviceStream);
+    }
+
     wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
     wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
 
@@ -476,11 +491,13 @@ DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getForces()
     return d_f_;
 }
 
+// Copy CPU forces to GPU using stream internal to this module to allow overlap
+// with GPU force calculations.
 void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> h_f,
                                                    AtomLocality atomLocality)
 {
     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
-    const DeviceStream* deviceStream = fCopyStreams_[atomLocality];
+    DeviceStream* deviceStream = copyInStream_.get();
     GMX_ASSERT(deviceStream != nullptr,
                "No stream is valid for copying forces with given atom locality.");
 
@@ -494,10 +511,14 @@ void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx
     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
-void StatePropagatorDataGpu::Impl::clearForcesOnGpu(AtomLocality atomLocality)
+void StatePropagatorDataGpu::Impl::clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency)
 {
     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
-    const DeviceStream* deviceStream = fCopyStreams_[atomLocality];
+    DeviceStream* deviceStream = memsetStream_.get();
+
+    GMX_ASSERT(dependency != nullptr, "Dependency is not valid for clearing forces.");
+    dependency->enqueueWaitEvent(*deviceStream);
+
     GMX_ASSERT(deviceStream != nullptr,
                "No stream is valid for clearing forces with given atom locality.");
 
@@ -506,26 +527,33 @@ void StatePropagatorDataGpu::Impl::clearForcesOnGpu(AtomLocality atomLocality)
 
     clearOnDevice(d_f_, d_fSize_, atomLocality, *deviceStream);
 
+    fReadyOnDevice_[atomLocality].markEvent(*deviceStream);
+
     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
-GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
-                                                                                bool useGpuFBufferOps)
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getLocalForcesReadyOnDeviceEvent(StepWorkload stepWork,
+                                                                                     SimulationWorkload simulationWork)
 {
-    if ((atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal) && useGpuFBufferOps)
+    if (stepWork.useGpuFBufferOps && !simulationWork.useCpuPmePpCommunication)
     {
-        return &fReducedOnDevice_;
+        return &fReducedOnDevice_[AtomLocality::Local];
     }
     else
     {
-        return &fReadyOnDevice_[atomLocality];
+        return &fReadyOnDevice_[AtomLocality::Local];
     }
 }
 
-GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReducedOnDevice()
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReducedOnDevice(AtomLocality atomLocality)
+{
+    return &fReducedOnDevice_[atomLocality];
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReadyOnDevice(AtomLocality atomLocality)
 {
-    return &fReducedOnDevice_;
+    return &fReadyOnDevice_[atomLocality];
 }
 
 void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f, AtomLocality atomLocality)
@@ -617,9 +645,11 @@ void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx:
 GpuEventSynchronizer*
 StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
                                                          const SimulationWorkload& simulationWork,
-                                                         const StepWorkload&       stepWork)
+                                                         const StepWorkload&       stepWork,
+                                                         GpuEventSynchronizer* gpuCoordinateHaloLaunched)
 {
-    return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality, simulationWork, stepWork);
+    return impl_->getCoordinatesReadyOnDeviceEvent(
+            atomLocality, simulationWork, stepWork, gpuCoordinateHaloLaunched);
 }
 
 void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
@@ -632,9 +662,11 @@ void StatePropagatorDataGpu::setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpd
     impl_->setXUpdatedOnDeviceEvent(xUpdatedOnDeviceEvent);
 }
 
-void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<RVec> h_x, AtomLocality atomLocality)
+void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<RVec>   h_x,
+                                                    AtomLocality          atomLocality,
+                                                    GpuEventSynchronizer* dependency)
 {
-    return impl_->copyCoordinatesFromGpu(h_x, atomLocality);
+    return impl_->copyCoordinatesFromGpu(h_x, atomLocality, dependency);
 }
 
 void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
@@ -675,20 +707,25 @@ void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec
     return impl_->copyForcesToGpu(h_f, atomLocality);
 }
 
-void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality atomLocality)
+void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency)
+{
+    return impl_->clearForcesOnGpu(atomLocality, dependency);
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::getLocalForcesReadyOnDeviceEvent(StepWorkload stepWork,
+                                                                               SimulationWorkload simulationWork)
 {
-    return impl_->clearForcesOnGpu(atomLocality);
+    return impl_->getLocalForcesReadyOnDeviceEvent(stepWork, simulationWork);
 }
 
-GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
-                                                                          bool useGpuFBufferOps)
+GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice(AtomLocality atomLocality)
 {
-    return impl_->getForcesReadyOnDeviceEvent(atomLocality, useGpuFBufferOps);
+    return impl_->fReducedOnDevice(atomLocality);
 }
 
-GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice()
+GpuEventSynchronizer* StatePropagatorDataGpu::fReadyOnDevice(AtomLocality atomLocality)
 {
-    return impl_->fReducedOnDevice();
+    return impl_->fReadyOnDevice(atomLocality);
 }
 
 void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<RVec> h_f, AtomLocality atomLocality)