From: Alan Gray Date: Thu, 14 Oct 2021 13:29:40 +0000 (+0000) Subject: Pipeline GPU PME Spline/Spread with PP Comms X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?p=alexxy%2Fgromacs.git;a=commitdiff_plain;h=c088e63019ebc68760d35c3bc916015864aa8e89 Pipeline GPU PME Spline/Spread with PP Comms --- diff --git a/docs/release-notes/2022/major/performance.rst b/docs/release-notes/2022/major/performance.rst index 580ef631a3..247715ff23 100644 --- a/docs/release-notes/2022/major/performance.rst +++ b/docs/release-notes/2022/major/performance.rst @@ -26,3 +26,19 @@ up to a factor of 3. :issue:`2875` :issue:`742` + + +PME-PP GPU Direct Communication Pipelining +"""""""""""""""""""""""""""""""""""""""""" + +For multi-GPU runs with direct PME-PP GPU communication enabled, the +PME rank can now pipeline the coordinate transfers with computation in +the PME Spread and Spline kernel (where the coordinates are +consumed). The data from each transfer is handled separately, allowing +computation and communication to be overlapped. This is expected to +be of most benefit on systems where hardware communication interfaces +are shared between multiple GPUs, e.g. PCIe within multi-GPU servers +or InfiniBand across multiple nodes. + +:issue:`3969` + diff --git a/src/gromacs/ewald/pme.cuh b/src/gromacs/ewald/pme.cuh index af0e258ae9..d166b22653 100644 --- a/src/gromacs/ewald/pme.cuh +++ b/src/gromacs/ewald/pme.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index 93145d4e1e..f8653e20dd 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -119,6 +119,7 @@ private: std::vector<std::string> reasons_; }; +class PmeCoordinateReceiverGpu; } // namespace gmx enum @@ -373,17 +374,24 @@ GPU_FUNC_QUALIFIER void pme_gpu_prepare_computation(gmx_pme_t* GPU_FUNC_ARGU /*! \brief * Launches first stage of PME on GPU - spreading kernel. * - * \param[in] pme The PME data structure. - * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates - * are ready in the device memory; nullptr allowed only on separate PME ranks. - * \param[in] wcycle The wallclock counter. - * \param[in] lambdaQ The Coulomb lambda of the current state of the - * system. Only used if FEP of Coulomb is active. + * \param[in] pme The PME data structure. + * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates + * are ready in the device memory; nullptr allowed only + * on separate PME ranks. + * \param[in] wcycle The wallclock counter. + * \param[in] lambdaQ The Coulomb lambda of the current state of the + * system. Only used if FEP of Coulomb is active. 
+ * \param[in] useGpuDirectComm Whether direct GPU PME-PP communication is active + * \param[in] pmeCoordinateReceiverGpu Coordinate receiver object, which must be valid when + * direct GPU PME-PP communication is active */ -GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t* GPU_FUNC_ARGUMENT(pme), - GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice), - gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle), - real GPU_FUNC_ARGUMENT(lambdaQ)) GPU_FUNC_TERM; +GPU_FUNC_QUALIFIER void pme_gpu_launch_spread( + gmx_pme_t* GPU_FUNC_ARGUMENT(pme), + GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice), + gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle), + real GPU_FUNC_ARGUMENT(lambdaQ), + const bool GPU_FUNC_ARGUMENT(useGpuDirectComm), + gmx::PmeCoordinateReceiverGpu* GPU_FUNC_ARGUMENT(pmeCoordinateReceiverGpu)) GPU_FUNC_TERM; /*! \brief * Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode. diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h index 81f640df40..d285c19954 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h @@ -49,6 +49,8 @@ #include "gromacs/utility/gmxmpi.h" class DeviceStream; +class DeviceContext; + struct PpRanks; namespace gmx @@ -62,18 +64,30 @@ class PmeCoordinateReceiverGpu public: /*! \brief Creates PME GPU coordinate receiver object - * \param[in] pmeStream CUDA stream used for PME computations + * + * For multi-GPU runs, the PME GPU can receive coordinates from + * multiple PP GPUs. Data from these distinct communications can + * be handled separately in the PME spline/spread kernel, allowing + * pipelining which overlaps computation and communication. The + * class methods are designed to be called separately for each remote + * PP rank, and internally a different stream is used for each + * remote PP rank to allow overlapping. + * * \param[in] comm Communicator used for simulation + * \param[in] deviceContext GPU context * \param[in] ppRanks List of PP ranks */ - PmeCoordinateReceiverGpu(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks); + PmeCoordinateReceiverGpu(MPI_Comm comm, const DeviceContext& deviceContext, gmx::ArrayRef<PpRanks> ppRanks); ~PmeCoordinateReceiverGpu(); /*! \brief + * Re-initialize: set atom ranges and, for thread-MPI case, * send coordinates buffer address to PP rank + * This is required after repartitioning since atom ranges and + * buffer allocations may have changed. * \param[in] d_x coordinates buffer in GPU memory */ - void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x); + void reinitCoordinateReceiver(DeviceBuffer<RVec> d_x); /*! \brief @@ -92,10 +106,37 @@ public: void launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf, int numAtoms, int numBytes, int ppRank); /*! \brief - * For lib MPI, wait for coordinates from PP ranks - * For thread MPI, enqueue PP co-ordinate transfer event into PME stream + * For lib MPI, wait for coordinates from any PP rank + * For thread MPI, enqueue PP coordinate transfer event received from PP + * rank determined from pipeline stage into given stream + * \param[in] pipelineStage stage of pipeline corresponding to this transfer + * \param[in] deviceStream stream in which to enqueue the wait event. + * \returns rank of sending PP task + */ + int synchronizeOnCoordinatesFromPpRank(int pipelineStage, const DeviceStream& deviceStream); + + /*! 
\brief Perform the above synchronizeOnCoordinatesFromPpRank for all PP ranks, + * enqueueing all events to a single stream + * \param[in] deviceStream stream in which to enqueue the wait events. + */ + void synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream); + + /*! \brief + * Return pointer to stream associated with specific PP rank sender index + * \param[in] senderIndex Index of sender PP rank. + */ + DeviceStream* ppCommStream(int senderIndex); + + /*! \brief + * Returns range of atoms involved in communication associated with specific PP rank sender index + * \param[in] senderIndex Index of sender PP rank. + */ + std::tuple<int, int> ppCommAtomRange(int senderIndex); + + /*! \brief + * Return number of PP ranks involved in PME-PP communication */ - void synchronizeOnCoordinatesFromPpRanks(); + int ppCommNumSenderRanks(); private: class Impl; diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp index 4e997d319b..9d727d14b6 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp @@ -62,8 +62,8 @@ class PmeCoordinateReceiverGpu::Impl }; /*!\brief Constructor stub. */ -PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const DeviceStream& /* pmeStream */, - MPI_Comm /* comm */, +PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(MPI_Comm /* comm */, + const DeviceContext& /* deviceContext */, gmx::ArrayRef<PpRanks> /* ppRanks */) : impl_(nullptr) { @@ -75,7 +75,7 @@ PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const DeviceStream& /* pmeStr PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default; /*!\brief init PME-PP GPU communication stub */ -void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> /* d_x */) +void PmeCoordinateReceiverGpu::reinitCoordinateReceiver(DeviceBuffer<RVec> /* d_x */) { GMX_ASSERT(!impl_, "A CPU stub for PME-PP GPU communication initialization was called instead of the " @@ -99,13 +99,47 @@ void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffe "implementation."); } -void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRanks() +int PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRank(int /* pipelineStage */, + const DeviceStream& /* deviceStream */) { GMX_ASSERT(!impl_, "A CPU stub for PME-PP GPU communication was called instead of the correct " "implementation."); + return 0; } +void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& /* deviceStream */) +{ + GMX_ASSERT(!impl_, + "A CPU stub for PME-PP GPU communication was called instead of the correct " + "implementation."); +} + +DeviceStream* PmeCoordinateReceiverGpu::ppCommStream(int /* senderIndex */) +{ + GMX_ASSERT(!impl_, + "A CPU stub for PME-PP GPU communication was called instead of the correct " + "implementation."); + return nullptr; +} + +std::tuple<int, int> PmeCoordinateReceiverGpu::ppCommAtomRange(int /* senderIndex */) +{ + GMX_ASSERT(!impl_, + "A CPU stub for PME-PP GPU communication was called instead of the correct " + "implementation."); + return std::make_tuple(0, 0); +} + +int PmeCoordinateReceiverGpu::ppCommNumSenderRanks() +{ + GMX_ASSERT(!impl_, + "A CPU stub for PME-PP GPU communication was called instead of the correct " + "implementation."); + return 0; +} + + } // namespace gmx #endif // !GMX_GPU_CUDA diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu index 
9de7c6676f..5b1fa48ade 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu @@ -56,33 +56,42 @@ namespace gmx { -PmeCoordinateReceiverGpu::Impl::Impl(const DeviceStream& pmeStream, - MPI_Comm comm, - gmx::ArrayRef<PpRanks> ppRanks) : - pmeStream_(pmeStream), comm_(comm), ppRanks_(ppRanks) +PmeCoordinateReceiverGpu::Impl::Impl(MPI_Comm comm, + const DeviceContext& deviceContext, + gmx::ArrayRef<PpRanks> ppRanks) : + comm_(comm), requests_(ppRanks.size(), MPI_REQUEST_NULL), deviceContext_(deviceContext) { - request_.resize(ppRanks.size()); - ppSync_.resize(ppRanks.size()); + // Create streams to manage pipelining + ppCommManagers_.reserve(ppRanks.size()); + for (auto& ppRank : ppRanks) + { + ppCommManagers_.emplace_back(PpCommManager{ + ppRank, + std::make_unique<DeviceStream>(deviceContext_, DeviceStreamPriority::High, false), + nullptr, + { 0, 0 } }); + } } PmeCoordinateReceiverGpu::Impl::~Impl() = default; -void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x) +void PmeCoordinateReceiverGpu::Impl::reinitCoordinateReceiver(DeviceBuffer<RVec> d_x) { - // Need to send address to PP rank only for thread-MPI as PP rank pushes data using cudamemcpy - if (GMX_THREAD_MPI) + int indEnd = 0; + for (auto& ppCommManager : ppCommManagers_) { - int ind_start = 0; - int ind_end = 0; - for (const auto& receiver : ppRanks_) - { - ind_start = ind_end; - ind_end = ind_start + receiver.numAtoms; + int indStart = indEnd; + indEnd = indStart + ppCommManager.ppRank.numAtoms; + ppCommManager.atomRange = std::make_tuple(indStart, indEnd); + + // Need to send address to PP rank only for thread-MPI, as the PP rank pushes data using cudaMemcpy + if (GMX_THREAD_MPI) + { // Data will be transferred directly from GPU. 
- void* sendBuf = reinterpret_cast(&d_x[ind_start]); + void* sendBuf = reinterpret_cast(&d_x[indStart]); #if GMX_MPI - MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_); + MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, ppCommManager.ppRank.rankId, 0, comm_); #else GMX_UNUSED_VALUE(sendBuf); #endif @@ -102,8 +111,13 @@ void PmeCoordinateReceiverGpu::Impl::receiveCoordinatesSynchronizerFromPpCudaDir #if GMX_MPI // Receive event from PP task // NOLINTNEXTLINE(bugprone-sizeof-expression) - MPI_Irecv(&ppSync_[recvCount_], sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_, &request_[recvCount_]); - recvCount_++; + MPI_Irecv(&ppCommManagers_[ppRank].sync, + sizeof(GpuEventSynchronizer*), + MPI_BYTE, + ppRank, + 0, + comm_, + &(requests_[ppRank])); #else GMX_UNUSED_VALUE(ppRank); #endif @@ -119,7 +133,7 @@ void PmeCoordinateReceiverGpu::Impl::launchReceiveCoordinatesFromPpCudaMpi(Devic "launchReceiveCoordinatesFromPpCudaMpi is expected to be called only for Lib-MPI"); #if GMX_MPI - MPI_Irecv(&recvbuf[numAtoms], numBytes, MPI_BYTE, ppRank, eCommType_COORD_GPU, comm_, &request_[recvCount_++]); + MPI_Irecv(&recvbuf[numAtoms], numBytes, MPI_BYTE, ppRank, eCommType_COORD_GPU, comm_, &(requests_[ppRank])); #else GMX_UNUSED_VALUE(recvbuf); GMX_UNUSED_VALUE(numAtoms); @@ -128,42 +142,70 @@ void PmeCoordinateReceiverGpu::Impl::launchReceiveCoordinatesFromPpCudaMpi(Devic #endif } -void PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromPpRanks() +int PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromPpRank(int pipelineStage, + const DeviceStream& deviceStream) { - if (recvCount_ > 0) - { - // ensure PME calculation doesn't commence until coordinate data/remote events - // has been transferred #if GMX_MPI - MPI_Waitall(recvCount_, request_.data(), MPI_STATUS_IGNORE); + int senderRank = -1; // Rank of PP task that is associated with this invocation. +# if (!GMX_THREAD_MPI) + // Wait on data from any one of the PP sender GPUs + MPI_Waitany(requests_.size(), requests_.data(), &senderRank, MPI_STATUS_IGNORE); + GMX_ASSERT(senderRank >= 0, "Rank of sending PP task must be 0 or greater"); + GMX_UNUSED_VALUE(pipelineStage); + GMX_UNUSED_VALUE(deviceStream); +# else + // MPI_Waitany is not available in thread-MPI. However, the + // MPI_Wait here is not associated with data but is host-side + // scheduling code to receive a CUDA event, and will be executed + // in advance of the actual data transfer. Therefore we can + // receive in order of pipeline stage, still allowing the + // scheduled GPU-direct comms to initiate out-of-order in their + // respective streams. 
For cases with CPU force computations, the + // scheduling is less asynchronous (done on a per-step basis), so + // host-side improvements should be investigated as tracked in + // issue #4047 + senderRank = pipelineStage; + MPI_Wait(&(requests_[senderRank]), MPI_STATUS_IGNORE); + ppCommManagers_[senderRank].sync->enqueueWaitEvent(deviceStream); +# endif + return senderRank; #endif +} - // Make PME stream wait on PP to PME data trasnfer events - if (GMX_THREAD_MPI) - { - for (int i = 0; i < recvCount_; i++) - { - ppSync_[i]->enqueueWaitEvent(pmeStream_); - } - } - - // reset receive counter - recvCount_ = 0; +void PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream) +{ + for (int i = 0; i < static_cast(ppCommManagers_.size()); i++) + { + synchronizeOnCoordinatesFromPpRank(i, deviceStream); } } +DeviceStream* PmeCoordinateReceiverGpu::Impl::ppCommStream(int senderIndex) +{ + return ppCommManagers_[senderIndex].stream.get(); +} -PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const DeviceStream& pmeStream, - MPI_Comm comm, +std::tuple PmeCoordinateReceiverGpu::Impl::ppCommAtomRange(int senderIndex) +{ + return ppCommManagers_[senderIndex].atomRange; +} + +int PmeCoordinateReceiverGpu::Impl::ppCommNumSenderRanks() +{ + return ppCommManagers_.size(); +} + +PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(MPI_Comm comm, + const DeviceContext& deviceContext, gmx::ArrayRef ppRanks) : - impl_(new Impl(pmeStream, comm, ppRanks)) + impl_(new Impl(comm, deviceContext, ppRanks)) { } PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default; -void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer d_x) +void PmeCoordinateReceiverGpu::reinitCoordinateReceiver(DeviceBuffer d_x) { - impl_->sendCoordinateBufferAddressToPpRanks(d_x); + impl_->reinitCoordinateReceiver(d_x); } void PmeCoordinateReceiverGpu::receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank) @@ -179,9 +221,31 @@ void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffe impl_->launchReceiveCoordinatesFromPpCudaMpi(recvbuf, numAtoms, numBytes, ppRank); } -void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRanks() +int PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRank(int senderIndex, + const DeviceStream& deviceStream) { - impl_->synchronizeOnCoordinatesFromPpRanks(); + return impl_->synchronizeOnCoordinatesFromPpRank(senderIndex, deviceStream); } +void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream) +{ + impl_->synchronizeOnCoordinatesFromAllPpRanks(deviceStream); +} + +DeviceStream* PmeCoordinateReceiverGpu::ppCommStream(int senderIndex) +{ + return impl_->ppCommStream(senderIndex); +} + +std::tuple PmeCoordinateReceiverGpu::ppCommAtomRange(int senderIndex) +{ + return impl_->ppCommAtomRange(senderIndex); +} + +int PmeCoordinateReceiverGpu::ppCommNumSenderRanks() +{ + return impl_->ppCommNumSenderRanks(); +} + + } // namespace gmx diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h index 604079c0b0..d268091771 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h @@ -52,25 +52,41 @@ class GpuEventSynchronizer; namespace gmx { -/*! \internal \brief Class with interfaces and data for CUDA version of PME coordinate receiving functionality */ +/*! 
\brief Object to manage communications with a specific PP rank */ +struct PpCommManager +{ + //! Details of PP rank that may be updated after repartitioning + const PpRanks& ppRank; + //! Stream used for communication with PP rank + std::unique_ptr<DeviceStream> stream; + //! Synchronization event to receive from PP rank + GpuEventSynchronizer* sync = nullptr; + //! Range of atoms corresponding to PP rank + std::tuple<int, int> atomRange = { 0, 0 }; +}; + +/*! \internal \brief Class with interfaces and data for CUDA version of PME coordinate receiving functionality */ class PmeCoordinateReceiverGpu::Impl { public: /*! \brief Creates PME GPU coordinate receiver object - * \param[in] pmeStream CUDA stream used for PME computations * \param[in] comm Communicator used for simulation + * \param[in] deviceContext GPU context * \param[in] ppRanks List of PP ranks */ - Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks); + Impl(MPI_Comm comm, const DeviceContext& deviceContext, gmx::ArrayRef<PpRanks> ppRanks); ~Impl(); /*! \brief - * send coordinates buffer address to PP rank + * Re-initialize: set atom ranges and, for thread-MPI case, + * send coordinates buffer address to PP rank. + * This is required after repartitioning since atom ranges and + * buffer allocations may have changed. * \param[in] d_x coordinates buffer in GPU memory */ - void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x); + void reinitCoordinateReceiver(DeviceBuffer<RVec> d_x); /*! \brief * Receive coordinate synchronizer pointer from the PP ranks. @@ -88,24 +104,47 @@ public: void launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf, int numAtoms, int numBytes, int ppRank); /*! \brief - * For lib MPI, wait for coordinates from PP ranks - * For thread MPI, enqueue PP co-ordinate transfer event into PME stream + * For lib MPI, wait for coordinates from any PP rank + * For thread MPI, enqueue PP coordinate transfer event received from PP + * rank determined from pipeline stage into given stream + * \param[in] pipelineStage stage of pipeline corresponding to this transfer + * \param[in] deviceStream stream in which to enqueue the wait event. + * \returns rank of sending PP task + */ + int synchronizeOnCoordinatesFromPpRank(int pipelineStage, const DeviceStream& deviceStream); + + /*! \brief Perform the above synchronizeOnCoordinatesFromPpRank for all PP ranks, + * enqueueing all events to a single stream + * \param[in] deviceStream stream in which to enqueue the wait events. + */ + void synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream); + + /*! \brief + * Return pointer to stream associated with specific PP rank sender index + * \param[in] senderIndex Index of sender PP rank. + */ + DeviceStream* ppCommStream(int senderIndex); + + /*! \brief + * Returns range of atoms involved in communication associated with specific PP rank sender index + * \param[in] senderIndex Index of sender PP rank. + */ + std::tuple<int, int> ppCommAtomRange(int senderIndex); + + /*! \brief + * Return number of PP ranks involved in PME-PP communication */ - void synchronizeOnCoordinatesFromPpRanks(); + int ppCommNumSenderRanks(); private: - //! CUDA stream for PME operations - const DeviceStream& pmeStream_; //! communicator for simulation MPI_Comm comm_; - //! list of PP ranks - gmx::ArrayRef<PpRanks> ppRanks_; - //! vector of MPI requests - std::vector<MPI_Request> request_; - //! vector of synchronization events to receive from PP tasks - std::vector<GpuEventSynchronizer*> ppSync_; - //! counter of messages to receive - int recvCount_ = 0; + //! 
MPI requests, one per PP rank + std::vector requests_; + //! GPU context handle (not used in CUDA) + const DeviceContext& deviceContext_; + //! Communication manager objects corresponding to multiple sending PP ranks + std::vector ppCommManagers_; }; } // namespace gmx diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp index 084bb3f6cc..680fcec0fc 100644 --- a/src/gromacs/ewald/pme_gpu.cpp +++ b/src/gromacs/ewald/pme_gpu.cpp @@ -59,6 +59,7 @@ #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/gmxassert.h" #include "gromacs/utility/stringutil.h" +#include "gromacs/ewald/pme_coordinate_receiver_gpu.h" #include "pme_gpu_internal.h" #include "pme_gpu_settings.h" @@ -189,10 +190,12 @@ void pme_gpu_prepare_computation(gmx_pme_t* pme, } } -void pme_gpu_launch_spread(gmx_pme_t* pme, - GpuEventSynchronizer* xReadyOnDevice, - gmx_wallcycle* wcycle, - const real lambdaQ) +void pme_gpu_launch_spread(gmx_pme_t* pme, + GpuEventSynchronizer* xReadyOnDevice, + gmx_wallcycle* wcycle, + const real lambdaQ, + const bool useGpuDirectComm, + gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu) { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); GMX_ASSERT(!GMX_GPU_CUDA || xReadyOnDevice || !pme->bPPnode, @@ -215,7 +218,8 @@ void pme_gpu_launch_spread(gmx_pme_t* pme, const bool spreadCharges = true; wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu); wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme); - pme_gpu_spread(pmeGpu, xReadyOnDevice, fftgrids, computeSplines, spreadCharges, lambdaQ); + pme_gpu_spread( + pmeGpu, xReadyOnDevice, fftgrids, computeSplines, spreadCharges, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu); wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme); wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu); } diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index 96dc1f4db9..2f7da67a3a 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -75,6 +75,7 @@ #include "gromacs/utility/logger.h" #include "gromacs/utility/stringutil.h" #include "gromacs/ewald/pme.h" +#include "gromacs/ewald/pme_coordinate_receiver_gpu.h" #if GMX_GPU_CUDA # include "pme.cuh" @@ -593,7 +594,10 @@ static void pme_gpu_init_internal(PmeGpu* pmeGpu, const DeviceContext& deviceCon */ #if GMX_GPU_CUDA - pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0]; + pmeGpu->kernelParams->usePipeline = false; + pmeGpu->kernelParams->pipelineAtomStart = 0; + pmeGpu->kernelParams->pipelineAtomEnd = 0; + pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0]; #else // Use this path for any non-CUDA GPU acceleration // TODO: is there no really global work size limit in OpenCL? 
@@ -1276,12 +1280,14 @@ static auto selectSpreadKernelPtr(const PmeGpu* pmeGpu, return kernelPtr; } -void pme_gpu_spread(const PmeGpu* pmeGpu, - GpuEventSynchronizer* xReadyOnDevice, - real** h_grids, - bool computeSplines, - bool spreadCharges, - const real lambda) +void pme_gpu_spread(const PmeGpu* pmeGpu, + GpuEventSynchronizer* xReadyOnDevice, + real** h_grids, + bool computeSplines, + bool spreadCharges, + const real lambda, + const bool useGpuDirectComm, + gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu) { GMX_ASSERT( pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2, @@ -1350,6 +1356,7 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, PmeStage timingId; PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr; + const bool writeGlobalOrSaveSplines = writeGlobal || (!recalculateSplines); if (computeSplines) { if (spreadCharges) @@ -1357,7 +1364,7 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, timingId = PmeStage::SplineAndSpread; kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom, - writeGlobal || (!recalculateSplines), + writeGlobalOrSaveSplines, pmeGpu->common->ngrids); } else @@ -1365,43 +1372,116 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, timingId = PmeStage::Spline; kernelPtr = selectSplineKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom, - writeGlobal || (!recalculateSplines), + writeGlobalOrSaveSplines, pmeGpu->common->ngrids); } } else { timingId = PmeStage::Spread; - kernelPtr = selectSpreadKernelPtr(pmeGpu, - pmeGpu->settings.threadsPerAtom, - writeGlobal || (!recalculateSplines), - pmeGpu->common->ngrids); + kernelPtr = selectSpreadKernelPtr( + pmeGpu, pmeGpu->settings.threadsPerAtom, writeGlobalOrSaveSplines, pmeGpu->common->ngrids); } pme_gpu_start_timing(pmeGpu, timingId); auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId); + + kernelParamsPtr->usePipeline = computeSplines && spreadCharges && useGpuDirectComm + && (pmeCoordinateReceiverGpu->ppCommNumSenderRanks() > 1) + && !writeGlobalOrSaveSplines; + if (kernelParamsPtr->usePipeline) + { + int numStagesInPipeline = pmeCoordinateReceiverGpu->ppCommNumSenderRanks(); + + for (int i = 0; i < numStagesInPipeline; i++) + { + int senderRank; + if (useGpuDirectComm) + { + senderRank = pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromPpRank( + i, *(pmeCoordinateReceiverGpu->ppCommStream(i))); + } + else + { + senderRank = i; + } + + // set kernel configuration options specific to this stage of the pipeline + std::tie(kernelParamsPtr->pipelineAtomStart, kernelParamsPtr->pipelineAtomEnd) = + pmeCoordinateReceiverGpu->ppCommAtomRange(senderRank); + const int blockCount = static_cast(std::ceil( + static_cast(kernelParamsPtr->pipelineAtomEnd - kernelParamsPtr->pipelineAtomStart) + / atomsPerBlock)); + auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount); + config.gridSize[0] = dimGrid.first; + config.gridSize[1] = dimGrid.second; + DeviceStream* launchStream = pmeCoordinateReceiverGpu->ppCommStream(senderRank); + + #if c_canEmbedBuffers - const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr); + const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr); #else - const auto kernelArgs = - prepareGpuKernelArguments(kernelPtr, - config, - kernelParamsPtr, - &kernelParamsPtr->atoms.d_theta, - &kernelParamsPtr->atoms.d_dtheta, - &kernelParamsPtr->atoms.d_gridlineIndices, - &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A], - &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B], - &kernelParamsPtr->grid.d_fractShiftsTable, - 
&kernelParamsPtr->grid.d_gridlineIndicesTable, - &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A], - &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B], - &kernelParamsPtr->atoms.d_coordinates); + const auto kernelArgs = + prepareGpuKernelArguments(kernelPtr, + config, + kernelParamsPtr, + &kernelParamsPtr->atoms.d_theta, + &kernelParamsPtr->atoms.d_dtheta, + &kernelParamsPtr->atoms.d_gridlineIndices, + &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A], + &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B], + &kernelParamsPtr->grid.d_fractShiftsTable, + &kernelParamsPtr->grid.d_gridlineIndicesTable, + &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A], + &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B], + &kernelParamsPtr->atoms.d_coordinates); #endif - launchGpuKernel( - kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME spline/spread", kernelArgs); + launchGpuKernel(kernelPtr, config, *launchStream, timingEvent, "PME spline/spread", kernelArgs); + } + // Set dependencies for PME stream on all pipeline streams + for (int i = 0; i < pmeCoordinateReceiverGpu->ppCommNumSenderRanks(); i++) + { + GpuEventSynchronizer event; + event.markEvent(*(pmeCoordinateReceiverGpu->ppCommStream(i))); + event.enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_); + } + } + else // pipelining is not in use + { + if (useGpuDirectComm) // Sync all PME-PP communications to PME stream + { + pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromAllPpRanks(pmeGpu->archSpecific->pmeStream_); + } + +#if c_canEmbedBuffers + const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr); +#else + const auto kernelArgs = + prepareGpuKernelArguments(kernelPtr, + config, + kernelParamsPtr, + &kernelParamsPtr->atoms.d_theta, + &kernelParamsPtr->atoms.d_dtheta, + &kernelParamsPtr->atoms.d_gridlineIndices, + &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A], + &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B], + &kernelParamsPtr->grid.d_fractShiftsTable, + &kernelParamsPtr->grid.d_gridlineIndicesTable, + &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A], + &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B], + &kernelParamsPtr->atoms.d_coordinates); +#endif + + launchGpuKernel(kernelPtr, + config, + pmeGpu->archSpecific->pmeStream_, + timingEvent, + "PME spline/spread", + kernelArgs); + } + pme_gpu_stop_timing(pmeGpu, timingId); const auto& settings = pmeGpu->settings; diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h index 7baa6bd347..0a6ee2a5d2 100644 --- a/src/gromacs/ewald/pme_gpu_internal.h +++ b/src/gromacs/ewald/pme_gpu_internal.h @@ -340,21 +340,30 @@ void pme_gpu_destroy_3dfft(const PmeGpu* pmeGpu); /*! \libinternal \brief * A GPU spline computation and charge spreading function. * - * \param[in] pmeGpu The PME GPU structure. - * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory; - * can be nullptr when invoked on a separate PME rank or from PME tests. - * \param[out] h_grids The host-side grid buffers (used only if the result of the spread is expected on the host, - * e.g. testing or host-side FFT) - * \param[in] computeSplines Should the computation of spline parameters and gridline indices be performed. - * \param[in] spreadCharges Should the charges/coefficients be spread on the grid. - * \param[in] lambda The lambda value of the current system state. 
- */ -GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu), - GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice), - float** GPU_FUNC_ARGUMENT(h_grids), - bool GPU_FUNC_ARGUMENT(computeSplines), - bool GPU_FUNC_ARGUMENT(spreadCharges), - real GPU_FUNC_ARGUMENT(lambda)) GPU_FUNC_TERM; + * \param[in] pmeGpu The PME GPU structure. + * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are + * ready in the device memory; can be nullptr when invoked + * on a separate PME rank or from PME tests. + * \param[out] h_grids The host-side grid buffers (used only if the result + * of the spread is expected on the host, e.g. testing + * or host-side FFT) + * \param[in] computeSplines Should the computation of spline parameters and gridline + * indices be performed. + * \param[in] spreadCharges Should the charges/coefficients be spread on the grid. + * \param[in] lambda The lambda value of the current system state. + * \param[in] useGpuDirectComm Whether direct GPU PME-PP communication is active + * \param[in] pmeCoordinateReceiverGpu Coordinate receiver object, which must be valid when + * direct GPU PME-PP communication is active + */ +GPU_FUNC_QUALIFIER void +pme_gpu_spread(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu), + GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice), + float** GPU_FUNC_ARGUMENT(h_grids), + bool GPU_FUNC_ARGUMENT(computeSplines), + bool GPU_FUNC_ARGUMENT(spreadCharges), + real GPU_FUNC_ARGUMENT(lambda), + const bool GPU_FUNC_ARGUMENT(useGpuDirectComm), + gmx::PmeCoordinateReceiverGpu* GPU_FUNC_ARGUMENT(pmeCoordinateReceiverGpu)) GPU_FUNC_TERM; /*! \libinternal \brief * 3D FFT R2C/C2R routine. diff --git a/src/gromacs/ewald/pme_gpu_types.h b/src/gromacs/ewald/pme_gpu_types.h index e2c067390a..76e111e765 100644 --- a/src/gromacs/ewald/pme_gpu_types.h +++ b/src/gromacs/ewald/pme_gpu_types.h @@ -224,6 +224,15 @@ struct PmeGpuKernelParamsBase * before launching spreading. */ struct PmeGpuDynamicParams current; + + /*! \brief Whether pipelining with PP communications is active + * char rather than bool to avoid problem with OpenCL compiler */ + char usePipeline; + /*! \brief Start atom for this stage of pipeline */ + int pipelineAtomStart; + /*! \brief End atom for this stage of pipeline */ + int pipelineAtomEnd; + /* These texture objects are only used in CUDA and are related to the grid size. */ /*! \brief Texture object for accessing grid.d_fractShiftsTable */ HIDE_FROM_OPENCL_COMPILER(DeviceTexture) fractShiftsTableTexture; diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp index 3151ec1ca7..56002966be 100644 --- a/src/gromacs/ewald/pme_only.cpp +++ b/src/gromacs/ewald/pme_only.cpp @@ -218,6 +218,9 @@ static gmx_pme_t* gmx_pmeonly_switch(std::vector* pmedata, } /*! \brief Called by PME-only ranks to receive coefficients and coordinates + * + * Note that with GPU direct communication the transfer is only initiated, it is the responsibility + * of the caller to synchronize prior to launching spread. * * \param[in] pme PME data structure. * \param[in,out] pme_pp PME-PP communication structure. @@ -438,9 +441,8 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t* pme, "GPU Direct PME-PP communication has been enabled, " "but PME run mode is not PmeRunMode::GPU\n"); - // This rank will have its data accessed directly by PP rank, so needs to send the remote addresses. 
- pme_pp->pmeCoordinateReceiverGpu->sendCoordinateBufferAddressToPpRanks( - stateGpu->getCoordinates()); + // This rank will have its data accessed directly by PP rank, so needs to send the remote addresses and re-set atom ranges associated with transfers. + pme_pp->pmeCoordinateReceiverGpu->reinitCoordinateReceiver(stateGpu->getCoordinates()); pme_pp->pmeForceSenderGpu->setForceSendBuffer(pme_gpu_get_device_f(pme)); } } @@ -495,11 +497,6 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t* pme, } } - if (pme_pp->useGpuDirectComm) - { - pme_pp->pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromPpRanks(); - } - status = pmerecvqxX; } @@ -673,9 +670,7 @@ int gmx_pmeonly(struct gmx_pme_t* pme, if (useGpuPmePpCommunication) { pme_pp->pmeCoordinateReceiverGpu = std::make_unique( - deviceStreamManager->stream(gmx::DeviceStreamType::Pme), - pme_pp->mpi_comm_mysim, - pme_pp->ppRanks); + pme_pp->mpi_comm_mysim, deviceStreamManager->context(), pme_pp->ppRanks); pme_pp->pmeForceSenderGpu = std::make_unique(pme_gpu_get_f_ready_synchronizer(pme), pme_pp->mpi_comm_mysim, @@ -775,7 +770,12 @@ int gmx_pmeonly(struct gmx_pme_t* pme, // TODO: with pme on GPU the receive should make a list of synchronizers and pass it here #3157 auto xReadyOnDevice = nullptr; - pme_gpu_launch_spread(pme, xReadyOnDevice, wcycle, lambda_q); + pme_gpu_launch_spread(pme, + xReadyOnDevice, + wcycle, + lambda_q, + pme_pp->useGpuDirectComm, + pme_pp->pmeCoordinateReceiverGpu.get()); pme_gpu_launch_complex_transforms(pme, wcycle, stepWork); pme_gpu_launch_gather(pme, wcycle, lambda_q); output = pme_gpu_wait_finish_task(pme, computeEnergyAndVirial, lambda_q, wcycle); diff --git a/src/gromacs/ewald/pme_spread.cu b/src/gromacs/ewald/pme_spread.cu index d0856602a9..f5ba345181 100644 --- a/src/gromacs/ewald/pme_spread.cu +++ b/src/gromacs/ewald/pme_spread.cu @@ -200,7 +200,7 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU float atomCharge; const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x; - const int atomIndexOffset = blockIndex * atomsPerBlock; + const int atomIndexOffset = blockIndex * atomsPerBlock + kernelParams.pipelineAtomStart; /* Thread index w.r.t. 
block */ const int threadLocalId = @@ -225,8 +225,8 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU /* Charges, required for both spline and spread */ if (c_useAtomDataPrefetch) { - pme_gpu_stage_atom_data(sm_coefficients, - kernelParams.atoms.d_coefficients[0]); + pme_gpu_stage_atom_data( + sm_coefficients, &kernelParams.atoms.d_coefficients[0][kernelParams.pipelineAtomStart]); __syncthreads(); atomCharge = sm_coefficients[atomIndexLocal]; } @@ -237,7 +237,8 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU if (computeSplines) { - const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates); + const float3* __restrict__ gm_coordinates = + asFloat3(&kernelParams.atoms.d_coordinates[kernelParams.pipelineAtomStart]); if (c_useAtomDataPrefetch) { // Coordinates @@ -274,8 +275,12 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU /* Spreading */ if (spreadCharges) { - spread_charges( - kernelParams, &atomCharge, sm_gridlineIndices, sm_theta); + + if (!kernelParams.usePipeline || (atomIndexGlobal < kernelParams.pipelineAtomEnd)) + { + spread_charges( + kernelParams, &atomCharge, sm_gridlineIndices, sm_theta); + } } if (numGrids == 2) { @@ -293,8 +298,11 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU } if (spreadCharges) { - spread_charges( - kernelParams, &atomCharge, sm_gridlineIndices, sm_theta); + if (!kernelParams.usePipeline || (atomIndexGlobal < kernelParams.pipelineAtomEnd)) + { + spread_charges( + kernelParams, &atomCharge, sm_gridlineIndices, sm_theta); + } } } } diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp index a3c4409fca..0016cd8024 100644 --- a/src/gromacs/ewald/tests/pmetestcommon.cpp +++ b/src/gromacs/ewald/tests/pmetestcommon.cpp @@ -70,6 +70,7 @@ #include "gromacs/utility/gmxassert.h" #include "gromacs/utility/logger.h" #include "gromacs/utility/stringutil.h" +#include "gromacs/ewald/pme_coordinate_receiver_gpu.h" #include "testutils/test_hardware_environment.h" #include "testutils/testasserts.h" @@ -342,7 +343,18 @@ void pmePerformSplineAndSpread(gmx_pme_t* pme, const real lambdaQ = 1.0; // no synchronization needed as x is transferred in the PME stream GpuEventSynchronizer* xReadyOnDevice = nullptr; - pme_gpu_spread(pme->gpu, xReadyOnDevice, fftgrid, computeSplines, spreadCharges, lambdaQ); + + bool useGpuDirectComm = false; + gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr; + + pme_gpu_spread(pme->gpu, + xReadyOnDevice, + fftgrid, + computeSplines, + spreadCharges, + lambdaQ, + useGpuDirectComm, + pmeCoordinateReceiverGpu); } break; #endif diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 241cc3b9fc..a2ccaed47b 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -54,6 +54,7 @@ #include "gromacs/domdec/partition.h" #include "gromacs/essentialdynamics/edsam.h" #include "gromacs/ewald/pme.h" +#include "gromacs/ewald/pme_coordinate_receiver_gpu.h" #include "gromacs/ewald/pme_pp.h" #include "gromacs/ewald/pme_pp_comm_gpu.h" #include "gromacs/gmxlib/network.h" @@ -746,7 +747,10 @@ static inline void launchPmeGpuSpread(gmx_pme_t* pmedata, gmx_wallcycle* wcycle) { pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork); - pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle, lambdaQ); + bool useGpuDirectComm = false; + gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = 
nullptr; + pme_gpu_launch_spread( + pmedata, xReadyOnDevice, wcycle, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu); } /*! \brief Launch the FFT and gather stages of PME GPU
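
For readers who want the scheduling pattern of this patch in isolation: below is a minimal standalone CUDA sketch, not GROMACS code, of the pipelining implemented in pme_gpu_spread() and PmeCoordinateReceiverGpu. All names here (processChunk, numSenders, etc.) are hypothetical, and an ordinary cudaMemcpyAsync stands in for the GPU-direct PME-PP receive. Each sender owns a chunk of the coordinate buffer and a dedicated high-priority stream; the chunk's transfer and the kernel that consumes it run back-to-back in that stream, so different chunks overlap, and events then make the main PME stream wait on all pipeline streams, mirroring the GpuEventSynchronizer loop at the end of the pipelined branch above.

// pipelined_chunks.cu -- illustrative sketch only, compile with nvcc
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

__global__ void processChunk(const float3* x, float* out, int atomStart, int atomEnd)
{
    // Stand-in for spline/spread work on the [atomStart, atomEnd) chunk,
    // guarded by the chunk end as the patched kernel guards with pipelineAtomEnd
    const int i = atomStart + blockIdx.x * blockDim.x + threadIdx.x;
    if (i < atomEnd)
    {
        out[i] = x[i].x + x[i].y + x[i].z;
    }
}

int main()
{
    const int numSenders = 4, atomsPerSender = 1 << 16, numAtoms = numSenders * atomsPerSender;
    std::vector<float3> h_x(numAtoms, make_float3(1.f, 2.f, 3.f));
    float3* d_x;
    float*  d_out;
    cudaMalloc(&d_x, numAtoms * sizeof(float3));
    cudaMalloc(&d_out, numAtoms * sizeof(float));

    // One high-priority stream per sender, as the patch creates per PP rank
    int leastPriority = 0, greatestPriority = 0;
    cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
    std::vector<cudaStream_t> ppStream(numSenders);
    std::vector<cudaEvent_t>  chunkDone(numSenders);
    for (int s = 0; s < numSenders; s++)
    {
        cudaStreamCreateWithPriority(&ppStream[s], cudaStreamNonBlocking, greatestPriority);
        cudaEventCreateWithFlags(&chunkDone[s], cudaEventDisableTiming);
    }
    cudaStream_t pmeStream;
    cudaStreamCreate(&pmeStream);

    const int threads = 256;
    for (int s = 0; s < numSenders; s++)
    {
        const int atomStart = s * atomsPerSender;
        const int atomEnd   = atomStart + atomsPerSender;
        // Per-sender transfer: stands in for the GPU-direct PME-PP receive
        cudaMemcpyAsync(d_x + atomStart, h_x.data() + atomStart,
                        atomsPerSender * sizeof(float3), cudaMemcpyHostToDevice, ppStream[s]);
        // The chunk's kernel launches in the same stream, so it starts as soon
        // as this chunk has arrived, independently of the other chunks
        const int blocks = (atomEnd - atomStart + threads - 1) / threads;
        processChunk<<<blocks, threads, 0, ppStream[s]>>>(d_x, d_out, atomStart, atomEnd);
        cudaEventRecord(chunkDone[s], ppStream[s]);
    }
    // Downstream PME work (FFT/solve/gather) needs all chunks: make the main
    // stream wait on every pipeline stream before proceeding
    for (int s = 0; s < numSenders; s++)
    {
        cudaStreamWaitEvent(pmeStream, chunkDone[s], 0);
    }
    cudaStreamSynchronize(pmeStream);
    printf("pipelined %d chunks of %d atoms each\n", numSenders, atomsPerSender);

    for (int s = 0; s < numSenders; s++)
    {
        cudaEventDestroy(chunkDone[s]);
        cudaStreamDestroy(ppStream[s]);
    }
    cudaStreamDestroy(pmeStream);
    cudaFree(d_x);
    cudaFree(d_out);
    return 0;
}

Giving the per-sender streams high priority mirrors DeviceStreamPriority::High in the patch: spread work for a chunk that has already arrived can run ahead while later chunks are still in flight.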
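The lib-MPI branch of synchronizeOnCoordinatesFromPpRank() drains the posted receives in completion order rather than posting order. A minimal MPI-only sketch of that idea, under the assumption of one float chunk per PP rank (rank 0 plays the PME rank; buffer sizes and tags are arbitrary):

// waitany_order.cpp -- illustrative sketch only, compile with mpicxx, run with mpirun
#include <cstdio>
#include <vector>
#include <mpi.h>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    const int chunk = 1024;
    if (rank == 0) // "PME rank": post one receive per "PP rank" up front
    {
        std::vector<float>       buf(chunk * (size - 1));
        std::vector<MPI_Request> requests(size - 1, MPI_REQUEST_NULL);
        for (int s = 1; s < size; s++)
        {
            MPI_Irecv(buf.data() + (s - 1) * chunk, chunk, MPI_FLOAT, s, 0, MPI_COMM_WORLD,
                      &requests[s - 1]);
        }
        for (int stage = 0; stage < size - 1; stage++)
        {
            // Drain in completion order: whichever sender's data lands first
            // can have its pipeline stage (kernel launch) handled first
            int senderIndex = -1;
            MPI_Waitany(size - 1, requests.data(), &senderIndex, MPI_STATUS_IGNORE);
            printf("stage %d: chunk from sender index %d ready\n", stage, senderIndex);
        }
    }
    else // "PP rank": send our chunk to the PME rank
    {
        std::vector<float> x(chunk, 1.0f);
        MPI_Send(x.data(), chunk, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);
    }
    MPI_Finalize();
    return 0;
}

With thread-MPI, MPI_Waitany is unavailable, which is why the patch instead waits per pipeline stage and lets the enqueued CUDA events provide the out-of-order overlap on the device.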