From: Alan Gray
Date: Sun, 1 Aug 2021 08:24:19 +0000 (+0000)
Subject: Remove MPI comm from GPU PME-PP force transfer initiation
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=4a1ed1360c1e5bb28a9f00c06ce35e88e6ea3cf9;p=alexxy%2Fgromacs.git

Remove MPI comm from GPU PME-PP force transfer initiation
---

diff --git a/src/gromacs/ewald/pme_force_sender_gpu.h b/src/gromacs/ewald/pme_force_sender_gpu.h
index 64c981ccb6..44933df9cc 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu.h
+++ b/src/gromacs/ewald/pme_force_sender_gpu.h
@@ -94,10 +94,11 @@ public:
     /*! \brief
      * Send force to PP rank (used with Thread-MPI)
-     * \param[in] ppRank PP rank to receive data
-     * \param[in] numAtoms number of atoms to send
+     * \param[in] ppRank                  PP rank to receive data
+     * \param[in] numAtoms                number of atoms to send
+     * \param[in] sendForcesDirectToPpGpu whether forces are transferred direct to remote GPU memory
      */
-    void sendFToPpCudaDirect(int ppRank, int numAtoms);
+    void sendFToPpCudaDirect(int ppRank, int numAtoms, bool sendForcesDirectToPpGpu);

     /*! \brief
      * Send force to PP rank (used with Lib-MPI)
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
index 5475c665da..1516d72d79 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
+++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
@@ -84,7 +84,9 @@ void PmeForceSenderGpu::setForceSendBuffer(DeviceBuffer<RVec> /* d_f */)
                "correct implementation.");
 }

-void PmeForceSenderGpu::sendFToPpCudaDirect(int /* ppRank */, int /* numAtoms */)
+void PmeForceSenderGpu::sendFToPpCudaDirect(int /* ppRank */,
+                                            int /* numAtoms */,
+                                            bool /* sendForcesDirectToPpGpu */)
 {
     GMX_ASSERT(!impl_,
                "A CPU stub for PME-PP GPU communication was called instead of the correct "
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
index 5356683083..64b3440d2e 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
+++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
@@ -59,14 +59,19 @@ PmeForceSenderGpu::Impl::Impl(GpuEventSynchronizer* pmeForcesReady,
                               MPI_Comm comm,
                               const DeviceContext& deviceContext,
                               gmx::ArrayRef<PpRanks> ppRanks) :
-    pmeForcesReady_(pmeForcesReady), comm_(comm), ppRanks_(ppRanks), deviceContext_(deviceContext)
+    pmeForcesReady_(pmeForcesReady),
+    comm_(comm),
+    ppRanks_(ppRanks),
+    deviceContext_(deviceContext),
+    ppCommStream_(ppRanks.size()),
+    ppCommEvent_(ppRanks.size()),
+    pmeRemoteGpuForcePtr_(ppRanks.size()),
+    pmeRemoteCpuForcePtr_(ppRanks.size())
 {
     // Create streams and events to manage pushing of force buffers to remote PP ranks
     std::unique_ptr<DeviceStream> stream;
     std::unique_ptr<GpuEventSynchronizer> event;
     size_t i = 0;
-    ppCommStream_.resize(ppRanks_.size());
-    ppCommEvent_.resize(ppRanks_.size());
     for (i = 0; i < ppRanks_.size(); i++)
     {
         stream = std::make_unique<DeviceStream>(deviceContext_, DeviceStreamPriority::High, false);
@@ -103,8 +108,14 @@ void PmeForceSenderGpu::Impl::setForceSendBuffer(DeviceBuffer<RVec> d_f)
         ind_start = ind_end;
         ind_end = ind_start + receiver.numAtoms;
-        localForcePtr_[i++] = &d_f[ind_start];
+        localForcePtr_[i] = &d_f[ind_start];
+        // NOLINTNEXTLINE(bugprone-sizeof-expression)
+        MPI_Recv(&pmeRemoteGpuForcePtr_[i], sizeof(float3*), MPI_BYTE, receiver.rankId, 0, comm_, MPI_STATUS_IGNORE);
+        // NOLINTNEXTLINE(bugprone-sizeof-expression)
+        MPI_Recv(&pmeRemoteCpuForcePtr_[i], sizeof(float3*), MPI_BYTE, receiver.rankId, 0, comm_, MPI_STATUS_IGNORE);
+        i++;
     }
+
 #else
     GMX_UNUSED_VALUE(d_f);
 #endif
@@ -112,16 +123,15 @@ void PmeForceSenderGpu::Impl::setForceSendBuffer(DeviceBuffer<RVec> d_f)

 /*! \brief Send PME synchronizer directly using CUDA memory copy */
-void PmeForceSenderGpu::Impl::sendFToPpCudaDirect(int ppRank, int numAtoms)
+void PmeForceSenderGpu::Impl::sendFToPpCudaDirect(int ppRank, int numAtoms, bool sendForcesDirectToPpGpu)
 {
     GMX_ASSERT(GMX_THREAD_MPI,
                "sendFToPpCudaDirect is expected to be called only for Thread-MPI");

 #if GMX_MPI
-    void* pmeRemoteForcePtr;
-    // NOLINTNEXTLINE(bugprone-sizeof-expression)
-    MPI_Recv(&pmeRemoteForcePtr, sizeof(void*), MPI_BYTE, ppRank, 0, comm_, MPI_STATUS_IGNORE);
+    float3* pmeRemoteForcePtr =
+            sendForcesDirectToPpGpu ? pmeRemoteGpuForcePtr_[ppRank] : pmeRemoteCpuForcePtr_[ppRank];

     pmeForcesReady_->enqueueWaitEvent(*ppCommStream_[ppRank]);
@@ -190,9 +200,9 @@ void PmeForceSenderGpu::sendFToPpCudaMpi(DeviceBuffer<RVec> sendbuf,
     impl_->sendFToPpCudaMpi(sendbuf, offset, numBytes, ppRank, request);
 }

-void PmeForceSenderGpu::sendFToPpCudaDirect(int ppRank, int numAtoms)
+void PmeForceSenderGpu::sendFToPpCudaDirect(int ppRank, int numAtoms, bool sendForcesDirectToPpGpu)
 {
-    impl_->sendFToPpCudaDirect(ppRank, numAtoms);
+    impl_->sendFToPpCudaDirect(ppRank, numAtoms, sendForcesDirectToPpGpu);
 }
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.h b/src/gromacs/ewald/pme_force_sender_gpu_impl.h
index 74954bdf1f..5575f03b07 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu_impl.h
+++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.h
@@ -80,10 +80,11 @@ public:
     /*! \brief
      * Send force to PP rank (used with Thread-MPI)
-     * \param[in] ppRank PP rank to receive data
-     * \param[in] numAtoms number of atoms to send
+     * \param[in] ppRank                  PP rank to receive data
+     * \param[in] numAtoms                number of atoms to send
+     * \param[in] sendForcesDirectToPpGpu whether forces are transferred direct to remote GPU memory
      */
-    void sendFToPpCudaDirect(int ppRank, int numAtoms);
+    void sendFToPpCudaDirect(int ppRank, int numAtoms, bool sendForcesDirectToPpGpu);

     /*! \brief
      * Send force to PP rank (used with Lib-MPI)
@@ -110,6 +111,10 @@ private:
     std::vector<DeviceBuffer<RVec>> localForcePtr_;
     //! GPU context handle (not used in CUDA)
     const DeviceContext& deviceContext_;
+    //! Vector of CPU force buffer pointers for multiple remote PP tasks
+    std::vector<float3*> pmeRemoteCpuForcePtr_;
+    //! Vector of GPU force buffer pointers for multiple remote PP tasks
+    std::vector<float3*> pmeRemoteGpuForcePtr_;
 };

 } // namespace gmx
diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp
index 50f5b48878..0e67991867 100644
--- a/src/gromacs/ewald/pme_only.cpp
+++ b/src/gromacs/ewald/pme_only.cpp
@@ -134,6 +134,8 @@ struct gmx_pme_pp
     /*! \brief whether GPU direct communications are active for PME-PP transfers */
     bool useGpuDirectComm = false;
+    /*! \brief whether GPU direct communications should send forces directly to remote GPU memory */
+    bool sendForcesDirectToPpGpu = false;
 };
 /*! \brief Initialize the PME-only side of the PME <-> PP communication */
@@ -288,6 +290,7 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t* pme,
     GMX_ASSERT(!pme_pp->useGpuDirectComm || (pme_pp->pmeForceSenderGpu != nullptr),
                "The use of GPU direct communication for PME-PP is enabled, "
                "but the PME GPU force reciever object does not exist");
+    pme_pp->sendForcesDirectToPpGpu = ((cnb.flags & PP_PME_RECVFTOGPU) != 0);

     if (cnb.flags & PP_PME_FINISH)
     {
@@ -558,7 +561,8 @@ static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
         if (GMX_THREAD_MPI)
         {
-            pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(receiver.rankId, receiver.numAtoms);
+            pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(
+                    receiver.rankId, receiver.numAtoms, pme_pp->sendForcesDirectToPpGpu);
         }
         else
         {
diff --git a/src/gromacs/ewald/pme_pp.cpp b/src/gromacs/ewald/pme_pp.cpp
index e8433399a2..3061b0afce 100644
--- a/src/gromacs/ewald/pme_pp.cpp
+++ b/src/gromacs/ewald/pme_pp.cpp
@@ -113,6 +113,7 @@ static void gmx_pme_send_coeffs_coords(t_forcerec* fr,
                                        bool useGpuPmePpComms,
                                        bool reinitGpuPmePpComms,
                                        bool sendCoordinatesFromGpu,
+                                       bool receiveForcesToGpu,
                                        GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
 {
     gmx_domdec_t* dd;
@@ -138,6 +139,10 @@ static void gmx_pme_send_coeffs_coords(t_forcerec* fr,
     if (useGpuPmePpComms)
     {
         flags |= PP_PME_GPUCOMMS;
+        if (receiveForcesToGpu)
+        {
+            flags |= PP_PME_RECVFTOGPU;
+        }
     }

     if (c_useDelayedWait)
@@ -257,6 +262,8 @@ static void gmx_pme_send_coeffs_coords(t_forcerec* fr,
     {
         if (reinitGpuPmePpComms)
         {
+            std::vector<gmx::RVec>& buffer = cr->dd->pmeForceReceiveBuffer;
+            buffer.resize(n);
             fr->pmePpCommGpu->reinit(n);
         }
@@ -357,6 +364,7 @@ void gmx_pme_send_parameters(const t_commrec* cr,
                                false,
                                false,
                                false,
+                               false,
                                nullptr);
 }
@@ -371,6 +379,7 @@ void gmx_pme_send_coordinates(t_forcerec* fr,
                               bool useGpuPmePpComms,
                               bool receiveCoordinateAddressFromPme,
                               bool sendCoordinatesFromGpu,
+                              bool receiveForcesToGpu,
                               GpuEventSynchronizer* coordinatesReadyOnDeviceEvent,
                               gmx_wallcycle* wcycle)
 {
@@ -400,6 +409,7 @@ void gmx_pme_send_coordinates(t_forcerec* fr,
                               useGpuPmePpComms,
                               receiveCoordinateAddressFromPme,
                               sendCoordinatesFromGpu,
+                              receiveForcesToGpu,
                               coordinatesReadyOnDeviceEvent);
     wallcycle_stop(wcycle, WallCycleCounter::PpPmeSendX);
@@ -410,7 +420,7 @@ void gmx_pme_send_finish(const t_commrec* cr)
     unsigned int flags = PP_PME_FINISH;
     gmx_pme_send_coeffs_coords(
-            nullptr, cr, flags, {}, {}, {}, {}, {}, {}, nullptr, gmx::ArrayRef<gmx::RVec>(), 0, 0, 0, 0, -1, false, false, false, nullptr);
+            nullptr, cr, flags, {}, {}, {}, {}, {}, {}, nullptr, gmx::ArrayRef<gmx::RVec>(), 0, 0, 0, 0, -1, false, false, false, false, nullptr);
 }

 void gmx_pme_send_switchgrid(const t_commrec* cr, ivec grid_size, real ewaldcoeff_q, real ewaldcoeff_lj)
diff --git a/src/gromacs/ewald/pme_pp.h b/src/gromacs/ewald/pme_pp.h
index 683dfc379d..6c8402311f 100644
--- a/src/gromacs/ewald/pme_pp.h
+++ b/src/gromacs/ewald/pme_pp.h
@@ -88,6 +88,7 @@ void gmx_pme_send_coordinates(t_forcerec* fr,
                               bool useGpuPmePpComms,
                               bool reinitGpuPmePpComms,
                               bool sendCoordinatesFromGpu,
+                              bool receiveForcesToGpu,
                               GpuEventSynchronizer* coordinatesReadyOnDeviceEvent,
                               gmx_wallcycle* wcycle);
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu.h b/src/gromacs/ewald/pme_pp_comm_gpu.h
index 454da6d659..a11f185a12 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu.h
+++ b/src/gromacs/ewald/pme_pp_comm_gpu.h
@@ -43,6 +43,7 @@ #define GMX_PME_PP_COMM_GPU_H
 #include <memory>
+#include <vector>

 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/math/vectypes.h"
@@ -66,12 +67,17 @@ class PmePpCommGpu
 public:
     /*! \brief Creates PME-PP GPU communication object
-     * \param[in] comm Communicator used for simulation
-     * \param[in] pmeRank Rank of PME task
-     * \param[in] deviceContext GPU context.
-     * \param[in] deviceStream GPU stream.
+     * \param[in] comm              Communicator used for simulation
+     * \param[in] pmeRank           Rank of PME task
+     * \param[in] pmeCpuForceBuffer Buffer for PME force in CPU memory
+     * \param[in] deviceContext     GPU context.
+     * \param[in] deviceStream      GPU stream.
      */
-    PmePpCommGpu(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext, const DeviceStream& deviceStream);
+    PmePpCommGpu(MPI_Comm comm,
+                 int pmeRank,
+                 std::vector<gmx::RVec>& pmeCpuForceBuffer,
+                 const DeviceContext& deviceContext,
+                 const DeviceStream& deviceStream);
     ~PmePpCommGpu();

     /*! \brief Perform steps required when buffer size changes
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
index e7339f9c7e..0a5c60a4dd 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
@@ -64,6 +64,7 @@ class PmePpCommGpu::Impl
 /*!\brief Constructor stub. */
 PmePpCommGpu::PmePpCommGpu(MPI_Comm /* comm */,
                            int /* pmeRank */,
+                           std::vector<gmx::RVec>& /* pmeCpuForceBuffer */,
                            const DeviceContext& /* deviceContext */,
                            const DeviceStream& /* deviceStream */) :
     impl_(nullptr)
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
index 8acb6aa671..50e9a4189f 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
@@ -59,14 +59,16 @@ namespace gmx
 {

-PmePpCommGpu::Impl::Impl(MPI_Comm comm,
-                         int pmeRank,
-                         const DeviceContext& deviceContext,
-                         const DeviceStream& deviceStream) :
+PmePpCommGpu::Impl::Impl(MPI_Comm comm,
+                         int pmeRank,
+                         std::vector<gmx::RVec>& pmeCpuForceBuffer,
+                         const DeviceContext& deviceContext,
+                         const DeviceStream& deviceStream) :
     deviceContext_(deviceContext),
     pmePpCommStream_(deviceStream),
     comm_(comm),
     pmeRank_(pmeRank),
+    pmeCpuForceBuffer_(pmeCpuForceBuffer),
     d_pmeForces_(nullptr)
 {
 }
@@ -75,36 +77,32 @@ PmePpCommGpu::Impl::~Impl() = default;

 void PmePpCommGpu::Impl::reinit(int size)
 {
+    // Reallocate device buffer used for staging PME force
+    reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, deviceContext_);
+    // This rank will access PME rank memory directly, so needs to receive the remote PME buffer addresses.
 #if GMX_MPI
     if (GMX_THREAD_MPI)
     {
-        // receive device buffer address from PME rank
+        // receive device coordinate buffer address from PME rank
         MPI_Recv(&remotePmeXBuffer_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
+        // send host and device force buffer addresses to PME rank
+        MPI_Send(&d_pmeForces_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_);
+        RVec* pmeCpuForceBufferData = pmeCpuForceBuffer_.data();
+        MPI_Send(&pmeCpuForceBufferData, sizeof(RVec*), MPI_BYTE, pmeRank_, 0, comm_);
     }
 #endif
-
-    // Reallocate buffer used for staging PME force on GPU
-    reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, deviceContext_);
 }

 // TODO make this asynchronous by splitting into this into
 // launchRecvForceFromPmeCudaDirect() and sycnRecvForceFromPmeCudaDirect()
-void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(float3* recvPtr, bool receivePmeForceToGpu)
+void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(bool receivePmeForceToGpu)
 {
 #if GMX_MPI
     // Remote PME task pushes GPU data directly data to this PP task.
-    void* localForcePtr = receivePmeForceToGpu ? static_cast<void*>(d_pmeForces_) : recvPtr;
-
-    // Send destination pointer to PME task. Do this every step since
-    // PME task is agostic as to whether destination is PP CPU or
-    // GPU.
-    // NOLINTNEXTLINE(bugprone-sizeof-expression)
-    MPI_Send(&localForcePtr, sizeof(void*), MPI_BYTE, pmeRank_, 0, comm_);
-
     // Recieve event from PME task after PME->PP force data push has
     // been scheduled and enqueue this to PP stream.
     GpuEventSynchronizer* eventptr;
@@ -143,7 +141,7 @@ void PmePpCommGpu::Impl::receiveForceFromPme(float3* recvPtr, int recvSize, bool
     float3* pmeForcePtr = receivePmeForceToGpu ? asFloat3(d_pmeForces_) : recvPtr;
     if (GMX_THREAD_MPI)
     {
-        receiveForceFromPmeCudaDirect(pmeForcePtr, receivePmeForceToGpu);
+        receiveForceFromPmeCudaDirect(receivePmeForceToGpu);
     }
     else
     {
@@ -221,11 +219,12 @@ GpuEventSynchronizer* PmePpCommGpu::Impl::getForcesReadySynchronizer()
     }
 }

-PmePpCommGpu::PmePpCommGpu(MPI_Comm comm,
-                           int pmeRank,
-                           const DeviceContext& deviceContext,
-                           const DeviceStream& deviceStream) :
-    impl_(new Impl(comm, pmeRank, deviceContext, deviceStream))
+PmePpCommGpu::PmePpCommGpu(MPI_Comm comm,
+                           int pmeRank,
+                           std::vector<gmx::RVec>& pmeCpuForceBuffer,
+                           const DeviceContext& deviceContext,
+                           const DeviceStream& deviceStream) :
+    impl_(new Impl(comm, pmeRank, pmeCpuForceBuffer, deviceContext, deviceStream))
 {
 }
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
index 3d3039db6d..f62faea933 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
@@ -58,12 +58,17 @@ class PmePpCommGpu::Impl
 public:
     /*! \brief Creates PME-PP GPU communication object.
      *
-     * \param[in] comm Communicator used for simulation
-     * \param[in] pmeRank Rank of PME task
-     * \param[in] deviceContext GPU context.
-     * \param[in] deviceStream GPU stream.
+     * \param[in] comm              Communicator used for simulation
+     * \param[in] pmeRank           Rank of PME task
+     * \param[in] pmeCpuForceBuffer Buffer for PME force in CPU memory
+     * \param[in] deviceContext     GPU context.
+     * \param[in] deviceStream      GPU stream.
      */
-    Impl(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext, const DeviceStream& deviceStream);
+    Impl(MPI_Comm comm,
+         int pmeRank,
+         std::vector<gmx::RVec>& pmeCpuForceBuffer,
+         const DeviceContext& deviceContext,
+         const DeviceStream& deviceStream);
     ~Impl();

     /*! \brief Perform steps required when buffer size changes
@@ -115,10 +120,9 @@ private:
     /*! \brief Pull force buffer directly from GPU memory on PME
      * rank to either GPU or CPU memory on PP task using CUDA
      * Memory copy. This method is used with Thread-MPI.
-     * \param[out] recvPtr CPU buffer to receive PME force data
      * \param[in] receivePmeForceToGpu Whether receive is to GPU, otherwise CPU
      */
-    void receiveForceFromPmeCudaDirect(float3* recvPtr, bool receivePmeForceToGpu);
+    void receiveForceFromPmeCudaDirect(bool receivePmeForceToGpu);

     /*! \brief Pull force buffer directly from GPU memory on PME
      * rank to either GPU or CPU memory on PP task using CUDA-aware
@@ -160,6 +164,8 @@ private:
     MPI_Comm comm_;
     //! Rank of PME task
     int pmeRank_ = -1;
+    //! Buffer for PME force on CPU
+    std::vector<gmx::RVec>& pmeCpuForceBuffer_;
     //! Buffer for staging PME force on GPU
     DeviceBuffer<RVec> d_pmeForces_;
     //! number of atoms in PME force staging array
diff --git a/src/gromacs/ewald/pme_pp_communication.h b/src/gromacs/ewald/pme_pp_communication.h
index d5fbd960d9..184ae94865 100644
--- a/src/gromacs/ewald/pme_pp_communication.h
+++ b/src/gromacs/ewald/pme_pp_communication.h
@@ -85,6 +85,8 @@ enum
 #define PP_PME_SWITCHGRID (1 << 11)
 #define PP_PME_RESETCOUNTERS (1 << 12)
 #define PP_PME_GPUCOMMS (1 << 13)
+// Whether PME forces are transferred directly to remote PP GPU memory in a specific step
+#define PP_PME_RECVFTOGPU (1 << 14)
 //@}

 /*! \brief Return values for gmx_pme_recv_q_x */
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 4210f696cd..563a36239e 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1374,6 +1374,7 @@ void do_force(FILE* fplog,
                                  simulationWork.useGpuPmePpCommunication,
                                  reinitGpuPmePpComms,
                                  pmeSendCoordinatesFromGpu,
+                                 stepWork.useGpuPmeFReduction,
                                  localXReadyOnDevice,
                                  wcycle);
     }
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index 9da0886029..42bab51cb8 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -1649,6 +1649,7 @@ int Mdrunner::mdrunner()
         fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(
                 cr->mpi_comm_mysim,
                 cr->dd->pme_nodeid,
+                cr->dd->pmeForceReceiveBuffer,
                 deviceStreamManager->context(),
                 deviceStreamManager->stream(DeviceStreamType::PmePpTransfer));
     }
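---
Illustrative note (not part of the patch): the change replaces a per-step MPI pointer exchange with a one-time exchange of buffer addresses. Previously the PP rank MPI_Send'ed its destination force pointer to the PME rank every step; now the PP rank sends its host and device force-buffer addresses once, in PmePpCommGpu::Impl::reinit(), the PME rank caches them per PP rank in setForceSendBuffer(), and sendFToPpCudaDirect() only selects the cached GPU or CPU pointer according to the new PP_PME_RECVFTOGPU / sendForcesDirectToPpGpu flag. The sketch below is a minimal model of that control flow in plain C++ (no CUDA, no MPI); PpSide, PmeSide, cacheRemotePointers and sendForces are hypothetical stand-ins for the GROMACS classes above, and std::memcpy stands in for the cudaMemcpyAsync push that thread-MPI's shared address space permits.

// sketch.cpp -- illustrative only; models one-time address exchange + per-step destination selection
#include <cstring>
#include <iostream>
#include <vector>

struct float3
{
    float x, y, z;
};

// Stand-in for the PP side (PmePpCommGpu): owns a "GPU" staging buffer and a CPU force buffer.
struct PpSide
{
    std::vector<float3> gpuForces; // stands in for d_pmeForces_ (device staging buffer)
    std::vector<float3> cpuForces; // stands in for cr->dd->pmeForceReceiveBuffer
};

// Stand-in for the PME side (PmeForceSenderGpu): caches the remote destination addresses once,
// instead of receiving a destination pointer from the PP rank on every step.
struct PmeSide
{
    float3* remoteGpuPtr = nullptr;
    float3* remoteCpuPtr = nullptr;

    // One-time exchange at (re)initialization, analogous to the MPI_Send/MPI_Recv pair
    // added in reinit() / setForceSendBuffer() above.
    void cacheRemotePointers(PpSide& pp)
    {
        remoteGpuPtr = pp.gpuForces.data();
        remoteCpuPtr = pp.cpuForces.data();
    }

    // Per-step send: only a branch on the cached pointers, no pointer traffic.
    void sendForces(const std::vector<float3>& pmeForces, bool sendForcesDirectToPpGpu)
    {
        float3* dst = sendForcesDirectToPpGpu ? remoteGpuPtr : remoteCpuPtr;
        std::memcpy(dst, pmeForces.data(), pmeForces.size() * sizeof(float3)); // stands in for cudaMemcpyAsync
    }
};

int main()
{
    const int numAtoms = 4;
    PpSide    pp{ std::vector<float3>(numAtoms), std::vector<float3>(numAtoms) };
    PmeSide   pme;
    pme.cacheRemotePointers(pp); // once per (re)initialization

    std::vector<float3> pmeForces(numAtoms, float3{ 1.0F, 2.0F, 3.0F });
    pme.sendForces(pmeForces, /*sendForcesDirectToPpGpu=*/true);  // step with GPU force reduction on the PP rank
    pme.sendForces(pmeForces, /*sendForcesDirectToPpGpu=*/false); // step reducing forces on the CPU

    std::cout << "gpu[0].x = " << pp.gpuForces[0].x << ", cpu[0].x = " << pp.cpuForces[0].x << '\n';
    return 0;
}

As in the commit, the steady-state path contains no communication calls for the pointer itself; the choice of destination is a cached lookup driven by the per-step flag.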