From: Szilárd Páll
Date: Fri, 5 Mar 2021 19:29:38 +0000 (+0100)
Subject: Use existing PME f ready event in PmeForceSenderGpu
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=39b9e167ce6a9a0eeaca8460bcbce3d3e8202c4b;p=alexxy%2Fgromacs.git

Use existing PME f ready event in PmeForceSenderGpu

Instead of recording a separate event in the PME stream on the
separate PME rank and sending that to the PP rank to sync on, use the
force-ready event that PME already records. This also eliminates the
unnecessary use of multiple events, one per PP rank.

Refs #2891 #2915
---

diff --git a/src/gromacs/ewald/pme_force_sender_gpu.h b/src/gromacs/ewald/pme_force_sender_gpu.h
index bcc3b1e393..081ba454e6 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu.h
+++ b/src/gromacs/ewald/pme_force_sender_gpu.h
@@ -47,7 +47,7 @@
 #include "gromacs/math/vectypes.h"
 #include "gromacs/utility/gmxmpi.h"
 
-class DeviceStream;
+class GpuEventSynchronizer;
 
 /*! \libinternal
  * \brief Contains information about the PP ranks that partner this PME rank. */
@@ -72,11 +72,11 @@ class PmeForceSenderGpu
 
 public:
     /*! \brief Creates PME GPU Force sender object
-     * \param[in] pmeStream CUDA stream used for PME computations
+     * \param[in] pmeForcesReady Event synchronizer marked when PME forces are ready on the GPU
      * \param[in] comm Communicator used for simulation
      * \param[in] ppRanks List of PP ranks
      */
-    PmeForceSenderGpu(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+    PmeForceSenderGpu(GpuEventSynchronizer* pmeForcesReady, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
     ~PmeForceSenderGpu();
 
     /*! \brief
@@ -86,7 +86,7 @@ public:
     void sendForceBufferAddressToPpRanks(rvec* d_f);
 
     /*! \brief
-     * Send PP data to PP rank
+     * Send force synchronizer to PP rank
      * \param[in] ppRank PP rank to receive data
      */
     void sendFToPpCudaDirect(int ppRank);
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
index 915d0953ed..8d8b97f5c5 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
+++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
@@ -62,7 +62,7 @@ class PmeForceSenderGpu::Impl
 };
 
 /*!\brief Constructor stub. */
-PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream& /*pmeStream */,
+PmeForceSenderGpu::PmeForceSenderGpu(GpuEventSynchronizer* /*pmeForcesReady */,
                                      MPI_Comm /* comm */,
                                      gmx::ArrayRef<PpRanks> /* ppRanks */) :
     impl_(nullptr)
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
index 07d37dcd7e..44a2e30de3 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
+++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
@@ -55,8 +55,10 @@ namespace gmx
 {
 
 /*! \brief Create PME-PP GPU communication object */
-PmeForceSenderGpu::Impl::Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks) :
-    pmeStream_(pmeStream),
+PmeForceSenderGpu::Impl::Impl(GpuEventSynchronizer* pmeForcesReady,
+                              MPI_Comm comm,
+                              gmx::ArrayRef<PpRanks> ppRanks) :
+    pmeForcesReady_(pmeForcesReady),
     comm_(comm),
     ppRanks_(ppRanks)
 {
@@ -88,28 +90,24 @@ void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(rvec* d_f)
     }
 }
 
-/*! \brief Send PME data directly using CUDA memory copy */
+/*! \brief Send PME synchronizer directly using CUDA memory copy */
 void PmeForceSenderGpu::Impl::sendFToPpCudaDirect(int ppRank)
 {
     // Data will be pulled directly from PP task
-
-    // Record and send event to ensure PME force calcs are completed before PP task pulls data
-    pmeSync_.markEvent(pmeStream_);
-    GpuEventSynchronizer* pmeSyncPtr = &pmeSync_;
 #if GMX_MPI
     // TODO Using MPI_Isend would be more efficient, particularly when
     // sending to multiple PP ranks
-    MPI_Send(&pmeSyncPtr, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_);
+    MPI_Send(&pmeForcesReady_, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_);
 #else
     GMX_UNUSED_VALUE(pmeSyncPtr);
     GMX_UNUSED_VALUE(ppRank);
 #endif
 }
 
-PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream& pmeStream,
+PmeForceSenderGpu::PmeForceSenderGpu(GpuEventSynchronizer* pmeForcesReady,
                                      MPI_Comm comm,
                                      gmx::ArrayRef<PpRanks> ppRanks) :
-    impl_(new Impl(pmeStream, comm, ppRanks))
+    impl_(new Impl(pmeForcesReady, comm, ppRanks))
 {
 }
 
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.h b/src/gromacs/ewald/pme_force_sender_gpu_impl.h
index 70be40cc7f..ad9718c468 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu_impl.h
+++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.h
@@ -57,11 +57,11 @@ class PmeForceSenderGpu::Impl
 
 public:
     /*! \brief Creates PME GPU Force sender object
-     * \param[in] pmeStream CUDA stream used for PME computations
+     * \param[in] pmeForcesReady Event synchronizer marked when PME forces are ready on the GPU
     * \param[in] comm Communicator used for simulation
     * \param[in] ppRanks List of PP ranks
     */
-    Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+    Impl(GpuEventSynchronizer* pmeForcesReady, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
     ~Impl();
 
     /*! \brief
@@ -71,16 +71,14 @@ public:
     void sendForceBufferAddressToPpRanks(rvec* d_f);
 
     /*! \brief
-     * Send PP data to PP rank
+     * Send force synchronizer to PP rank
      * \param[in] ppRank PP rank to receive data
      */
     void sendFToPpCudaDirect(int ppRank);
 
 private:
-    //! CUDA stream for PME operations
-    const DeviceStream& pmeStream_;
-    //! Event triggered when to allow remote PP stream to syn with pme stream
-    GpuEventSynchronizer pmeSync_;
+    //! Event indicating when PME forces are ready on the GPU in order for PP stream to sync with the PME stream
+    GpuEventSynchronizer* pmeForcesReady_;
     //! communicator for simulation
     MPI_Comm comm_;
     //! list of PP ranks
diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp
index 5ef702e442..e057495712 100644
--- a/src/gromacs/ewald/pme_only.cpp
+++ b/src/gromacs/ewald/pme_only.cpp
@@ -648,9 +648,7 @@ int gmx_pmeonly(struct gmx_pme_t* pme,
                     pme_pp->mpi_comm_mysim,
                     pme_pp->ppRanks);
             pme_pp->pmeForceSenderGpu = std::make_unique<gmx::PmeForceSenderGpu>(
-                    deviceStreamManager->stream(gmx::DeviceStreamType::Pme),
-                    pme_pp->mpi_comm_mysim,
-                    pme_pp->ppRanks);
+                    pme_gpu_get_f_ready_synchronizer(pme), pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
         }
         // TODO: Special PME-only constructor is used here. There is no mechanism to prevent from using the other constructor here.
        // This should be made safer.
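
Note on the synchronization pattern: with this change the PME rank forwards the address of the single force-ready GpuEventSynchronizer (recorded once in the PME stream) to every PP rank, and each PP rank syncs its stream on that one event, instead of PmeForceSenderGpu recording and sending its own event. The stand-alone CUDA sketch below only illustrates this record-once, wait-per-consumer event pattern with plain cudaEvent_t and two streams in a single process; the kernel names, buffer sizes, and launch parameters are illustrative assumptions, not GROMACS code, and the cross-rank pointer exchange shown in the diff is elided.

// Minimal sketch of record-once/wait-per-consumer event synchronization (not GROMACS code).
#include <cstdio>
#include <cuda_runtime.h>

// Stand-in for the producer-side force computation ("PME").
__global__ void computeForces(float* f, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        f[i] = 0.1f * i;
    }
}

// Stand-in for the consumer-side use of the forces ("PP").
__global__ void consumeForces(const float* f, float* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        out[i] = 2.0f * f[i];
    }
}

int main()
{
    const int n = 1024;
    float *d_f, *d_out;
    cudaMalloc(&d_f, n * sizeof(float));
    cudaMalloc(&d_out, n * sizeof(float));

    cudaStream_t pmeStream, ppStream;
    cudaStreamCreate(&pmeStream);
    cudaStreamCreate(&ppStream);

    // One "forces ready" event, recorded once in the producer stream.
    cudaEvent_t pmeForcesReady;
    cudaEventCreateWithFlags(&pmeForcesReady, cudaEventDisableTiming);

    computeForces<<<(n + 255) / 256, 256, 0, pmeStream>>>(d_f, n);
    cudaEventRecord(pmeForcesReady, pmeStream);

    // The consumer stream waits on the existing event; no extra per-consumer
    // event needs to be recorded before the forces are used.
    cudaStreamWaitEvent(ppStream, pmeForcesReady, 0);
    consumeForces<<<(n + 255) / 256, 256, 0, ppStream>>>(d_f, d_out, n);

    cudaStreamSynchronize(ppStream);
    printf("done\n");

    cudaEventDestroy(pmeForcesReady);
    cudaStreamDestroy(pmeStream);
    cudaStreamDestroy(ppStream);
    cudaFree(d_f);
    cudaFree(d_out);
    return 0;
}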