Use existing PME f ready event in PmeForceSenderGpu
authorSzilárd Páll <pall.szilard@gmail.com>
Fri, 5 Mar 2021 19:29:38 +0000 (20:29 +0100)
committerAndrey Alekseenko <al42and@gmail.com>
Sun, 14 Mar 2021 15:33:30 +0000 (15:33 +0000)
Instead of recording an event internally in the PME stream and sending it
from the separate PME rank to the PP rank to sync on, use the already
existing event recorded in PME.
This also eliminates the unnecessary use of multiple events, one for each
PP rank.

Refs #2891 #2915
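
For reference, the sending-side change boils down to the following (condensed
from the pme_force_sender_gpu_impl.cu hunk below; the non-MPI branch and error
handling are omitted):

    // Before: a private per-object event was marked in the PME stream on every
    // call and its address was shipped to the PP rank.
    pmeSync_.markEvent(pmeStream_);
    GpuEventSynchronizer* pmeSyncPtr = &pmeSync_;
    MPI_Send(&pmeSyncPtr, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_);

    // After: the event that PME already marks when the forces are ready
    // (obtained at construction via pme_gpu_get_f_ready_synchronizer()) is
    // stored as pmeForcesReady_, and only its address is forwarded.
    MPI_Send(&pmeForcesReady_, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_);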

src/gromacs/ewald/pme_force_sender_gpu.h
src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
src/gromacs/ewald/pme_force_sender_gpu_impl.cu
src/gromacs/ewald/pme_force_sender_gpu_impl.h
src/gromacs/ewald/pme_only.cpp

index bcc3b1e39330cd530bdcfef9b76000c6d0b34639..081ba454e63ae688f05f470d7f7f4781e05970ba 100644 (file)
@@ -47,7 +47,7 @@
 #include "gromacs/math/vectypes.h"
 #include "gromacs/utility/gmxmpi.h"
 
-class DeviceStream;
+class GpuEventSynchronizer;
 
 /*! \libinternal
  * \brief Contains information about the PP ranks that partner this PME rank. */
@@ -72,11 +72,11 @@ class PmeForceSenderGpu
 
 public:
     /*! \brief Creates PME GPU Force sender object
-     * \param[in] pmeStream       CUDA stream used for PME computations
+     * \param[in] pmeForcesReady  Event synchronizer marked when PME forces are ready on the GPU
      * \param[in] comm            Communicator used for simulation
      * \param[in] ppRanks         List of PP ranks
      */
-    PmeForceSenderGpu(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+    PmeForceSenderGpu(GpuEventSynchronizer* pmeForcesReady, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
     ~PmeForceSenderGpu();
 
     /*! \brief
@@ -86,7 +86,7 @@ public:
     void sendForceBufferAddressToPpRanks(rvec* d_f);
 
     /*! \brief
-     * Send PP data to PP rank
+     * Send force synchronizer to PP rank
      * \param[in] ppRank           PP rank to receive data
      */
     void sendFToPpCudaDirect(int ppRank);
index 915d0953ed7e292d048fb7a9d0cc51aab18de5a7..8d8b97f5c5712330f90a6688c697b91bc260b133 100644 (file)
@@ -62,7 +62,7 @@ class PmeForceSenderGpu::Impl
 };
 
 /*!\brief Constructor stub. */
-PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream& /*pmeStream */,
+PmeForceSenderGpu::PmeForceSenderGpu(GpuEventSynchronizer* /*pmeForcesReady */,
                                      MPI_Comm /* comm     */,
                                      gmx::ArrayRef<PpRanks> /* ppRanks */) :
     impl_(nullptr)
index 07d37dcd7e31dc12ff0c40eeb96d4088a1a9cbdf..44a2e30de33111941ea49abaa02b5d6d7dcc0078 100644 (file)
@@ -55,8 +55,10 @@ namespace gmx
 {
 
 /*! \brief Create PME-PP GPU communication object */
-PmeForceSenderGpu::Impl::Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks) :
-    pmeStream_(pmeStream),
+PmeForceSenderGpu::Impl::Impl(GpuEventSynchronizer*  pmeForcesReady,
+                              MPI_Comm               comm,
+                              gmx::ArrayRef<PpRanks> ppRanks) :
+    pmeForcesReady_(pmeForcesReady),
     comm_(comm),
     ppRanks_(ppRanks)
 {
@@ -88,28 +90,23 @@ void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(rvec* d_f)
     }
 }
 
-/*! \brief Send PME data directly using CUDA memory copy */
+/*! \brief Send PME force-ready synchronizer address to the PP rank */
 void PmeForceSenderGpu::Impl::sendFToPpCudaDirect(int ppRank)
 {
     // Data will be pulled directly from PP task
-
-    // Record and send event to ensure PME force calcs are completed before PP task pulls data
-    pmeSync_.markEvent(pmeStream_);
-    GpuEventSynchronizer* pmeSyncPtr = &pmeSync_;
 #if GMX_MPI
     // TODO Using MPI_Isend would be more efficient, particularly when
     // sending to multiple PP ranks
-    MPI_Send(&pmeSyncPtr, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_);
+    MPI_Send(&pmeForcesReady_, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_);
 #else
-    GMX_UNUSED_VALUE(pmeSyncPtr);
     GMX_UNUSED_VALUE(ppRank);
 #endif
 }
 
-PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream&    pmeStream,
+PmeForceSenderGpu::PmeForceSenderGpu(GpuEventSynchronizer*  pmeForcesReady,
                                      MPI_Comm               comm,
                                      gmx::ArrayRef<PpRanks> ppRanks) :
-    impl_(new Impl(pmeStream, comm, ppRanks))
+    impl_(new Impl(pmeForcesReady, comm, ppRanks))
 {
 }
 
index 70be40cc7f927b9f971d95d3a928f58a227403b3..ad9718c4685b55d18cdf47edd78ff7993de562c7 100644 (file)
@@ -57,11 +57,11 @@ class PmeForceSenderGpu::Impl
 
 public:
     /*! \brief Creates PME GPU Force sender object
-     * \param[in] pmeStream       CUDA stream used for PME computations
+     * \param[in] pmeForcesReady  Event synchronizer marked when PME forces are ready on the GPU
      * \param[in] comm            Communicator used for simulation
      * \param[in] ppRanks         List of PP ranks
      */
-    Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+    Impl(GpuEventSynchronizer* pmeForcesReady, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
     ~Impl();
 
     /*! \brief
@@ -71,16 +71,14 @@ public:
     void sendForceBufferAddressToPpRanks(rvec* d_f);
 
     /*! \brief
-     * Send PP data to PP rank
+     * Send force synchronizer to PP rank
      * \param[in] ppRank           PP rank to receive data
      */
     void sendFToPpCudaDirect(int ppRank);
 
 private:
-    //! CUDA stream for PME operations
-    const DeviceStream& pmeStream_;
-    //! Event triggered when to allow remote PP stream to syn with pme stream
-    GpuEventSynchronizer pmeSync_;
+    //! Event indicating when PME forces are ready on the GPU, used by the PP stream to sync with the PME stream
+    GpuEventSynchronizer* pmeForcesReady_;
     //! communicator for simulation
     MPI_Comm comm_;
     //! list of PP ranks
index 5ef702e44295c1e81263b567949c26807e9fc554..e0574957121c552cb1c0f49dc3e2c202195b6a4c 100644 (file)
@@ -648,9 +648,7 @@ int gmx_pmeonly(struct gmx_pme_t*               pme,
                     pme_pp->mpi_comm_mysim,
                     pme_pp->ppRanks);
             pme_pp->pmeForceSenderGpu = std::make_unique<gmx::PmeForceSenderGpu>(
-                    deviceStreamManager->stream(gmx::DeviceStreamType::Pme),
-                    pme_pp->mpi_comm_mysim,
-                    pme_pp->ppRanks);
+                    pme_gpu_get_f_ready_synchronizer(pme), pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
         }
         // TODO: Special PME-only constructor is used here. There is no mechanism to prevent from using the other constructor here.
         //       This should be made safer.
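
On the receiving side (not touched by this patch), the PP rank picks up the
pointer and enqueues a wait on its own stream before consuming the forces it
pulls from the PME rank. A rough, illustrative sketch, assuming thread-MPI (so
that the address received from the PME rank stays valid in this process) and
the GpuEventSynchronizer interface of the time; the names pmeRank, comm and
ppCommStream are placeholders, not actual GROMACS identifiers:

    GpuEventSynchronizer* pmeForcesReady = nullptr;
    // Matching receive for the MPI_Send above; the payload is just the address
    // of the event object owned by the PME rank (same process with thread-MPI).
    MPI_Recv(&pmeForcesReady, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank, 0, comm,
             MPI_STATUS_IGNORE);
    // Make the PP-side stream wait until the PME force gather has completed
    // before any copy or kernel that reads d_f is executed in that stream.
    pmeForcesReady->enqueueWaitEvent(ppCommStream);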