/*! \brief
* Send force to PP rank (used with Thread-MPI)
- * \param[in] ppRank PP rank to receive data
- * \param[in] numAtoms number of atoms to send
+ * \param[in] ppRank PP rank to receive data
+ * \param[in] numAtoms number of atoms to send
+ * \param[in] sendForcesDirectToPpGpu whether forces are transferred directly to remote GPU memory
*/
- void sendFToPpCudaDirect(int ppRank, int numAtoms);
+ void sendFToPpCudaDirect(int ppRank, int numAtoms, bool sendForcesDirectToPpGpu);
/*! \brief
* Send force to PP rank (used with Lib-MPI)
"correct implementation.");
}
-void PmeForceSenderGpu::sendFToPpCudaDirect(int /* ppRank */, int /* numAtoms */)
+void PmeForceSenderGpu::sendFToPpCudaDirect(int /* ppRank */,
+ int /* numAtoms */,
+ bool /* sendForcesDirectToPpGpu */)
{
GMX_ASSERT(!impl_,
"A CPU stub for PME-PP GPU communication was called instead of the correct "
MPI_Comm comm,
const DeviceContext& deviceContext,
gmx::ArrayRef<PpRanks> ppRanks) :
- pmeForcesReady_(pmeForcesReady), comm_(comm), ppRanks_(ppRanks), deviceContext_(deviceContext)
+ pmeForcesReady_(pmeForcesReady),
+ comm_(comm),
+ ppRanks_(ppRanks),
+ deviceContext_(deviceContext),
+ ppCommStream_(ppRanks.size()),
+ ppCommEvent_(ppRanks.size()),
+ pmeRemoteGpuForcePtr_(ppRanks.size()),
+ pmeRemoteCpuForcePtr_(ppRanks.size())
{
// Create streams and events to manage pushing of force buffers to remote PP ranks
std::unique_ptr<DeviceStream> stream;
std::unique_ptr<GpuEventSynchronizer> event;
size_t i = 0;
- ppCommStream_.resize(ppRanks_.size());
- ppCommEvent_.resize(ppRanks_.size());
for (i = 0; i < ppRanks_.size(); i++)
{
stream = std::make_unique<DeviceStream>(deviceContext_, DeviceStreamPriority::High, false);
ind_start = ind_end;
ind_end = ind_start + receiver.numAtoms;
- localForcePtr_[i++] = &d_f[ind_start];
+ localForcePtr_[i] = &d_f[ind_start];
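+ // Receive the GPU and CPU force buffer addresses published by this PP rank in PmePpCommGpu::Impl::reinit()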
+ // NOLINTNEXTLINE(bugprone-sizeof-expression)
+ MPI_Recv(&pmeRemoteGpuForcePtr_[i], sizeof(float3*), MPI_BYTE, receiver.rankId, 0, comm_, MPI_STATUS_IGNORE);
+ // NOLINTNEXTLINE(bugprone-sizeof-expression)
+ MPI_Recv(&pmeRemoteCpuForcePtr_[i], sizeof(float3*), MPI_BYTE, receiver.rankId, 0, comm_, MPI_STATUS_IGNORE);
+ i++;
}
+
#else
GMX_UNUSED_VALUE(d_f);
#endif
/*! \brief Send PME force data directly to the PP rank using CUDA memory copy */
-void PmeForceSenderGpu::Impl::sendFToPpCudaDirect(int ppRank, int numAtoms)
+void PmeForceSenderGpu::Impl::sendFToPpCudaDirect(int ppRank, int numAtoms, bool sendForcesDirectToPpGpu)
{
GMX_ASSERT(GMX_THREAD_MPI, "sendFToPpCudaDirect is expected to be called only for Thread-MPI");
#if GMX_MPI
- void* pmeRemoteForcePtr;
- // NOLINTNEXTLINE(bugprone-sizeof-expression)
- MPI_Recv(&pmeRemoteForcePtr, sizeof(void*), MPI_BYTE, ppRank, 0, comm_, MPI_STATUS_IGNORE);
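+ // Choose the remote destination recorded at reinit: the PP rank's GPU staging buffer or its CPU force buffer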
+ float3* pmeRemoteForcePtr =
+ sendForcesDirectToPpGpu ? pmeRemoteGpuForcePtr_[ppRank] : pmeRemoteCpuForcePtr_[ppRank];
pmeForcesReady_->enqueueWaitEvent(*ppCommStream_[ppRank]);
impl_->sendFToPpCudaMpi(sendbuf, offset, numBytes, ppRank, request);
}
-void PmeForceSenderGpu::sendFToPpCudaDirect(int ppRank, int numAtoms)
+void PmeForceSenderGpu::sendFToPpCudaDirect(int ppRank, int numAtoms, bool sendForcesDirectToPpGpu)
{
- impl_->sendFToPpCudaDirect(ppRank, numAtoms);
+ impl_->sendFToPpCudaDirect(ppRank, numAtoms, sendForcesDirectToPpGpu);
}
/*! \brief
* Send force to PP rank (used with Thread-MPI)
- * \param[in] ppRank PP rank to receive data
- * \param[in] numAtoms number of atoms to send
+ * \param[in] ppRank PP rank to receive data
+ * \param[in] numAtoms number of atoms to send
+ * \param[in] sendForcesDirectToPpGpu whether forces are transferred directly to remote GPU memory
*/
- void sendFToPpCudaDirect(int ppRank, int numAtoms);
+ void sendFToPpCudaDirect(int ppRank, int numAtoms, bool sendForcesDirectToPpGpu);
/*! \brief
* Send force to PP rank (used with Lib-MPI)
std::vector<DeviceBuffer<RVec>> localForcePtr_;
//! GPU context handle (not used in CUDA)
const DeviceContext& deviceContext_;
+ //! Vector of CPU force buffer pointers for multiple remote PP tasks
+ std::vector<float3*> pmeRemoteCpuForcePtr_;
+ //! Vector of GPU force buffer pointers for multiple remote PP tasks
+ std::vector<float3*> pmeRemoteGpuForcePtr_;
};
} // namespace gmx
/*! \brief whether GPU direct communications are active for PME-PP transfers */
bool useGpuDirectComm = false;
+ /*! \brief whether GPU direct communications should send forces directly to remote GPU memory */
+ bool sendForcesDirectToPpGpu = false;
};
/*! \brief Initialize the PME-only side of the PME <-> PP communication */
GMX_ASSERT(!pme_pp->useGpuDirectComm || (pme_pp->pmeForceSenderGpu != nullptr),
"The use of GPU direct communication for PME-PP is enabled, "
"but the PME GPU force reciever object does not exist");
+ pme_pp->sendForcesDirectToPpGpu = ((cnb.flags & PP_PME_RECVFTOGPU) != 0);
if (cnb.flags & PP_PME_FINISH)
{
if (GMX_THREAD_MPI)
{
- pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(receiver.rankId, receiver.numAtoms);
+ pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(
+ receiver.rankId, receiver.numAtoms, pme_pp->sendForcesDirectToPpGpu);
}
else
{
bool useGpuPmePpComms,
bool reinitGpuPmePpComms,
bool sendCoordinatesFromGpu,
+ bool receiveForcesToGpu,
GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
gmx_domdec_t* dd;
if (useGpuPmePpComms)
{
flags |= PP_PME_GPUCOMMS;
+ if (receiveForcesToGpu)
+ {
+ flags |= PP_PME_RECVFTOGPU;
+ }
}
if (c_useDelayedWait)
{
if (reinitGpuPmePpComms)
{
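+ // Size the CPU receive buffer before reinit(), which publishes its address to the PME rank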
+ std::vector<gmx::RVec>& buffer = cr->dd->pmeForceReceiveBuffer;
+ buffer.resize(n);
fr->pmePpCommGpu->reinit(n);
}
false,
false,
false,
+ false,
nullptr);
}
bool useGpuPmePpComms,
bool receiveCoordinateAddressFromPme,
bool sendCoordinatesFromGpu,
+ bool receiveForcesToGpu,
GpuEventSynchronizer* coordinatesReadyOnDeviceEvent,
gmx_wallcycle* wcycle)
{
useGpuPmePpComms,
receiveCoordinateAddressFromPme,
sendCoordinatesFromGpu,
+ receiveForcesToGpu,
coordinatesReadyOnDeviceEvent);
wallcycle_stop(wcycle, WallCycleCounter::PpPmeSendX);
unsigned int flags = PP_PME_FINISH;
gmx_pme_send_coeffs_coords(
- nullptr, cr, flags, {}, {}, {}, {}, {}, {}, nullptr, gmx::ArrayRef<gmx::RVec>(), 0, 0, 0, 0, -1, false, false, false, nullptr);
+ nullptr, cr, flags, {}, {}, {}, {}, {}, {}, nullptr, gmx::ArrayRef<gmx::RVec>(), 0, 0, 0, 0, -1, false, false, false, false, nullptr);
}
void gmx_pme_send_switchgrid(const t_commrec* cr, ivec grid_size, real ewaldcoeff_q, real ewaldcoeff_lj)
bool useGpuPmePpComms,
bool reinitGpuPmePpComms,
bool sendCoordinatesFromGpu,
+ bool receiveForcesToGpu,
GpuEventSynchronizer* coordinatesReadyOnDeviceEvent,
gmx_wallcycle* wcycle);
#define GMX_PME_PP_COMM_GPU_H
#include <memory>
+#include <vector>
#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/math/vectypes.h"
public:
/*! \brief Creates PME-PP GPU communication object
- * \param[in] comm Communicator used for simulation
- * \param[in] pmeRank Rank of PME task
- * \param[in] deviceContext GPU context.
- * \param[in] deviceStream GPU stream.
+ * \param[in] comm Communicator used for simulation
+ * \param[in] pmeRank Rank of PME task
+ * \param[in] pmeCpuForceBuffer Buffer for PME force in CPU memory
+ * \param[in] deviceContext GPU context.
+ * \param[in] deviceStream GPU stream.
*/
- PmePpCommGpu(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext, const DeviceStream& deviceStream);
+ PmePpCommGpu(MPI_Comm comm,
+ int pmeRank,
+ std::vector<gmx::RVec>& pmeCpuForceBuffer,
+ const DeviceContext& deviceContext,
+ const DeviceStream& deviceStream);
~PmePpCommGpu();
/*! \brief Perform steps required when buffer size changes
/*!\brief Constructor stub. */
PmePpCommGpu::PmePpCommGpu(MPI_Comm /* comm */,
int /* pmeRank */,
+ std::vector<gmx::RVec>& /* pmeCpuForceBuffer */,
const DeviceContext& /* deviceContext */,
const DeviceStream& /* deviceStream */) :
impl_(nullptr)
namespace gmx
{
-PmePpCommGpu::Impl::Impl(MPI_Comm comm,
- int pmeRank,
- const DeviceContext& deviceContext,
- const DeviceStream& deviceStream) :
+PmePpCommGpu::Impl::Impl(MPI_Comm comm,
+ int pmeRank,
+ std::vector<gmx::RVec>& pmeCpuForceBuffer,
+ const DeviceContext& deviceContext,
+ const DeviceStream& deviceStream) :
deviceContext_(deviceContext),
pmePpCommStream_(deviceStream),
comm_(comm),
pmeRank_(pmeRank),
+ pmeCpuForceBuffer_(pmeCpuForceBuffer),
d_pmeForces_(nullptr)
{
}
void PmePpCommGpu::Impl::reinit(int size)
{
+ // Reallocate device buffer used for staging PME force
+ reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, deviceContext_);
+
// This rank will access PME rank memory directly, so needs to receive the remote PME buffer addresses.
#if GMX_MPI
if (GMX_THREAD_MPI)
{
- // receive device buffer address from PME rank
+ // receive device coordinate buffer address from PME rank
MPI_Recv(&remotePmeXBuffer_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
+ // send host and device force buffer addresses to PME rank
+ MPI_Send(&d_pmeForces_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_);
+ RVec* pmeCpuForceBufferData = pmeCpuForceBuffer_.data();
+ MPI_Send(&pmeCpuForceBufferData, sizeof(RVec*), MPI_BYTE, pmeRank_, 0, comm_);
}
#endif
-
- // Reallocate buffer used for staging PME force on GPU
- reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, deviceContext_);
}
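The raw host and device pointers exchanged in the two reinit() hunks above are only meaningful because, with thread-MPI, all ranks are threads of a single process and therefore share one address space: a device pointer received from another rank can be passed straight to cudaMemcpyAsync. A minimal, self-contained sketch of that pattern (hypothetical names such as pushToPeerBuffer and d_peerBuf, not GROMACS API), assuming two thread-MPI ranks using the same GPU:

#include <mpi.h>
#include <cuda_runtime.h>

// Hypothetical illustration: publish our device buffer address to a peer rank and
// push data directly into the peer's buffer. Valid only under thread-MPI, where
// both ranks dereference pointers within the same process/address space.
void pushToPeerBuffer(MPI_Comm comm, int myRank, float* d_localBuf, size_t numFloats, cudaStream_t stream)
{
    const int peer     = 1 - myRank; // assumes exactly two ranks, 0 and 1
    float*    d_peerBuf = nullptr;
    // One-time address handshake: send our pointer, receive the peer's.
    MPI_Sendrecv(&d_localBuf, sizeof(float*), MPI_BYTE, peer, 0,
                 &d_peerBuf, sizeof(float*), MPI_BYTE, peer, 0,
                 comm, MPI_STATUS_IGNORE);
    // Device-to-device copy straight into the peer's buffer (synchronization of
    // the consumer is omitted in this sketch).
    cudaMemcpyAsync(d_peerBuf, d_localBuf, numFloats * sizeof(float),
                    cudaMemcpyDeviceToDevice, stream);
}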
// TODO make this asynchronous by splitting this into
// launchRecvForceFromPmeCudaDirect() and syncRecvForceFromPmeCudaDirect()
-void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(float3* recvPtr, bool receivePmeForceToGpu)
+void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(bool receivePmeForceToGpu)
{
#if GMX_MPI
// Remote PME task pushes GPU data directly to this PP task.
- void* localForcePtr = receivePmeForceToGpu ? static_cast<void*>(d_pmeForces_) : recvPtr;
-
- // Send destination pointer to PME task. Do this every step since
- // PME task is agostic as to whether destination is PP CPU or
- // GPU.
- // NOLINTNEXTLINE(bugprone-sizeof-expression)
- MPI_Send(&localForcePtr, sizeof(void*), MPI_BYTE, pmeRank_, 0, comm_);
-
// Receive event from PME task after PME->PP force data push has
// been scheduled and enqueue this to PP stream.
GpuEventSynchronizer* eventptr;
float3* pmeForcePtr = receivePmeForceToGpu ? asFloat3(d_pmeForces_) : recvPtr;
if (GMX_THREAD_MPI)
{
- receiveForceFromPmeCudaDirect(pmeForcePtr, receivePmeForceToGpu);
+ receiveForceFromPmeCudaDirect(receivePmeForceToGpu);
}
else
{
}
}
-PmePpCommGpu::PmePpCommGpu(MPI_Comm comm,
- int pmeRank,
- const DeviceContext& deviceContext,
- const DeviceStream& deviceStream) :
- impl_(new Impl(comm, pmeRank, deviceContext, deviceStream))
+PmePpCommGpu::PmePpCommGpu(MPI_Comm comm,
+ int pmeRank,
+ std::vector<gmx::RVec>& pmeCpuForceBuffer,
+ const DeviceContext& deviceContext,
+ const DeviceStream& deviceStream) :
+ impl_(new Impl(comm, pmeRank, pmeCpuForceBuffer, deviceContext, deviceStream))
{
}
public:
/*! \brief Creates PME-PP GPU communication object.
*
- * \param[in] comm Communicator used for simulation
- * \param[in] pmeRank Rank of PME task
- * \param[in] deviceContext GPU context.
- * \param[in] deviceStream GPU stream.
+ * \param[in] comm Communicator used for simulation
+ * \param[in] pmeRank Rank of PME task
+ * \param[in] pmeCpuForceBuffer Buffer for PME force in CPU memory
+ * \param[in] deviceContext GPU context.
+ * \param[in] deviceStream GPU stream.
*/
- Impl(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext, const DeviceStream& deviceStream);
+ Impl(MPI_Comm comm,
+ int pmeRank,
+ std::vector<gmx::RVec>& pmeCpuForceBuffer,
+ const DeviceContext& deviceContext,
+ const DeviceStream& deviceStream);
~Impl();
/*! \brief Perform steps required when buffer size changes
/*! \brief Pull force buffer directly from GPU memory on PME
* rank to either GPU or CPU memory on PP task using CUDA
* Memory copy. This method is used with Thread-MPI.
- * \param[out] recvPtr CPU buffer to receive PME force data
* \param[in] receivePmeForceToGpu Whether receive is to GPU, otherwise CPU
*/
- void receiveForceFromPmeCudaDirect(float3* recvPtr, bool receivePmeForceToGpu);
+ void receiveForceFromPmeCudaDirect(bool receivePmeForceToGpu);
/*! \brief Pull force buffer directly from GPU memory on PME
* rank to either GPU or CPU memory on PP task using CUDA-aware
MPI_Comm comm_;
//! Rank of PME task
int pmeRank_ = -1;
+ //! Buffer for PME force on CPU
+ std::vector<gmx::RVec>& pmeCpuForceBuffer_;
//! Buffer for staging PME force on GPU
DeviceBuffer<gmx::RVec> d_pmeForces_;
//! number of atoms in PME force staging array
#define PP_PME_SWITCHGRID (1 << 11)
#define PP_PME_RESETCOUNTERS (1 << 12)
#define PP_PME_GPUCOMMS (1 << 13)
+// Whether PME forces are transferred directly to remote PP GPU memory in a specific step
+#define PP_PME_RECVFTOGPU (1 << 14)
//@}
/*! \brief Return values for gmx_pme_recv_q_x */
simulationWork.useGpuPmePpCommunication,
reinitGpuPmePpComms,
pmeSendCoordinatesFromGpu,
+ stepWork.useGpuPmeFReduction,
localXReadyOnDevice,
wcycle);
}
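+ // The CPU force receive buffer is passed so that PmePpCommGpu::reinit() can share its address with the PME rank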
fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(
cr->mpi_comm_mysim,
cr->dd->pme_nodeid,
+ cr->dd->pmeForceReceiveBuffer,
deviceStreamManager->context(),
deviceStreamManager->stream(DeviceStreamType::PmePpTransfer));
}