From: Szilárd Páll
Date: Fri, 5 Mar 2021 19:29:38 +0000 (+0100)
Subject: Use existing PME f ready event in PmeForceSenderGpu
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=39b9e167ce6a9a0eeaca8460bcbce3d3e8202c4b;p=alexxy%2Fgromacs.git

Use existing PME f ready event in PmeForceSenderGpu

Instead of recording a separate event in the PME stream on the
separate PME rank and sending that to the PP rank to sync on, use the
force-ready event that PME already records. This also eliminates the
unnecessary use of multiple events, one per PP rank.

Refs #2891 #2915
---

diff --git a/src/gromacs/ewald/pme_force_sender_gpu.h b/src/gromacs/ewald/pme_force_sender_gpu.h
index bcc3b1e393..081ba454e6 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu.h
+++ b/src/gromacs/ewald/pme_force_sender_gpu.h
@@ -47,7 +47,7 @@
 #include "gromacs/math/vectypes.h"
 #include "gromacs/utility/gmxmpi.h"
 
-class DeviceStream;
+class GpuEventSynchronizer;
 
 /*! \libinternal
  * \brief Contains information about the PP ranks that partner this PME rank. */
@@ -72,11 +72,11 @@ class PmeForceSenderGpu
 
 public:
     /*! \brief Creates PME GPU Force sender object
-     * \param[in] pmeStream CUDA stream used for PME computations
+     * \param[in] pmeForcesReady Event synchronizer marked when PME forces are ready on the GPU
      * \param[in] comm Communicator used for simulation
      * \param[in] ppRanks List of PP ranks
      */
-    PmeForceSenderGpu(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+    PmeForceSenderGpu(GpuEventSynchronizer* pmeForcesReady, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
     ~PmeForceSenderGpu();
 
     /*! \brief
@@ -86,7 +86,7 @@ public:
     void sendForceBufferAddressToPpRanks(rvec* d_f);
 
     /*! \brief
-     * Send PP data to PP rank
+     * Send force synchronizer to PP rank
      * \param[in] ppRank PP rank to receive data
      */
     void sendFToPpCudaDirect(int ppRank);
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
index 915d0953ed..8d8b97f5c5 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
+++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
@@ -62,7 +62,7 @@ class PmeForceSenderGpu::Impl
 };
 
 /*!\brief Constructor stub. */
-PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream& /*pmeStream */,
+PmeForceSenderGpu::PmeForceSenderGpu(GpuEventSynchronizer* /*pmeForcesReady */,
                                      MPI_Comm /* comm */,
                                      gmx::ArrayRef<PpRanks> /* ppRanks */) :
     impl_(nullptr)
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
index 07d37dcd7e..44a2e30de3 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
+++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
@@ -55,8 +55,10 @@ namespace gmx
 {
 
 /*! \brief Create PME-PP GPU communication object */
-PmeForceSenderGpu::Impl::Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks) :
-    pmeStream_(pmeStream),
+PmeForceSenderGpu::Impl::Impl(GpuEventSynchronizer* pmeForcesReady,
+                              MPI_Comm comm,
+                              gmx::ArrayRef<PpRanks> ppRanks) :
+    pmeForcesReady_(pmeForcesReady),
     comm_(comm),
     ppRanks_(ppRanks)
 {
@@ -88,28 +90,24 @@ void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(rvec* d_f)
     }
 }
 
-/*! \brief Send PME data directly using CUDA memory copy */
+/*! \brief Send PME synchronizer directly using CUDA memory copy */
 void PmeForceSenderGpu::Impl::sendFToPpCudaDirect(int ppRank)
 {
     // Data will be pulled directly from PP task
-
-    // Record and send event to ensure PME force calcs are completed before PP task pulls data
-    pmeSync_.markEvent(pmeStream_);
-    GpuEventSynchronizer* pmeSyncPtr = &pmeSync_;
 #if GMX_MPI
     // TODO Using MPI_Isend would be more efficient, particularly when
     // sending to multiple PP ranks
-    MPI_Send(&pmeSyncPtr, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_);
+    MPI_Send(&pmeForcesReady_, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_);
 #else
     GMX_UNUSED_VALUE(pmeSyncPtr);
     GMX_UNUSED_VALUE(ppRank);
 #endif
 }
 
-PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream& pmeStream,
+PmeForceSenderGpu::PmeForceSenderGpu(GpuEventSynchronizer* pmeForcesReady,
                                      MPI_Comm comm,
                                      gmx::ArrayRef<PpRanks> ppRanks) :
-    impl_(new Impl(pmeStream, comm, ppRanks))
+    impl_(new Impl(pmeForcesReady, comm, ppRanks))
 {
 }
 
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.h b/src/gromacs/ewald/pme_force_sender_gpu_impl.h
index 70be40cc7f..ad9718c468 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu_impl.h
+++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.h
@@ -57,11 +57,11 @@ class PmeForceSenderGpu::Impl
 
 public:
     /*! \brief Creates PME GPU Force sender object
-     * \param[in] pmeStream CUDA stream used for PME computations
+     * \param[in] pmeForcesReady Event synchronizer marked when PME forces are ready on the GPU
     * \param[in] comm Communicator used for simulation
     * \param[in] ppRanks List of PP ranks
     */
-    Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+    Impl(GpuEventSynchronizer* pmeForcesReady, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
     ~Impl();
 
     /*! \brief
@@ -71,16 +71,14 @@ public:
     void sendForceBufferAddressToPpRanks(rvec* d_f);
 
     /*! \brief
-     * Send PP data to PP rank
+     * Send force synchronizer to PP rank
      * \param[in] ppRank PP rank to receive data
      */
     void sendFToPpCudaDirect(int ppRank);
 
 private:
-    //! CUDA stream for PME operations
-    const DeviceStream& pmeStream_;
-    //! Event triggered when to allow remote PP stream to syn with pme stream
-    GpuEventSynchronizer pmeSync_;
+    //! Event indicating when PME forces are ready on the GPU in order for PP stream to sync with the PME stream
+    GpuEventSynchronizer* pmeForcesReady_;
     //! communicator for simulation
     MPI_Comm comm_;
     //! list of PP ranks
diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp
index 5ef702e442..e057495712 100644
--- a/src/gromacs/ewald/pme_only.cpp
+++ b/src/gromacs/ewald/pme_only.cpp
@@ -648,9 +648,7 @@ int gmx_pmeonly(struct gmx_pme_t* pme,
                     pme_pp->mpi_comm_mysim,
                     pme_pp->ppRanks);
             pme_pp->pmeForceSenderGpu = std::make_unique<gmx::PmeForceSenderGpu>(
-                    deviceStreamManager->stream(gmx::DeviceStreamType::Pme),
-                    pme_pp->mpi_comm_mysim,
-                    pme_pp->ppRanks);
+                    pme_gpu_get_f_ready_synchronizer(pme), pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
         }
         // TODO: Special PME-only constructor is used here. There is no mechanism to prevent from using the other constructor here.
        // This should be made safer.
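
Note on the synchronization pattern: with this change the PME rank forwards the address of the single force-ready GpuEventSynchronizer (recorded once in the PME stream) to every PP rank, and each PP rank syncs its stream on that one event, instead of PmeForceSenderGpu recording and sending its own event. The stand-alone CUDA sketch below only illustrates this record-once, wait-per-consumer event pattern with plain cudaEvent_t and two streams in a single process; the kernel names, buffer sizes, and launch parameters are illustrative assumptions, not GROMACS code, and the cross-rank pointer exchange shown in the diff is elided.

// Minimal sketch of record-once/wait-per-consumer event synchronization (not GROMACS code).
#include <cstdio>
#include <cuda_runtime.h>

// Stand-in for the producer-side force computation ("PME").
__global__ void computeForces(float* f, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        f[i] = 0.1f * i;
    }
}

// Stand-in for the consumer-side use of the forces ("PP").
__global__ void consumeForces(const float* f, float* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        out[i] = 2.0f * f[i];
    }
}

int main()
{
    const int n = 1024;
    float *d_f, *d_out;
    cudaMalloc(&d_f, n * sizeof(float));
    cudaMalloc(&d_out, n * sizeof(float));

    cudaStream_t pmeStream, ppStream;
    cudaStreamCreate(&pmeStream);
    cudaStreamCreate(&ppStream);

    // One "forces ready" event, recorded once in the producer stream.
    cudaEvent_t pmeForcesReady;
    cudaEventCreateWithFlags(&pmeForcesReady, cudaEventDisableTiming);

    computeForces<<<(n + 255) / 256, 256, 0, pmeStream>>>(d_f, n);
    cudaEventRecord(pmeForcesReady, pmeStream);

    // The consumer stream waits on the existing event; no extra per-consumer
    // event needs to be recorded before the forces are used.
    cudaStreamWaitEvent(ppStream, pmeForcesReady, 0);
    consumeForces<<<(n + 255) / 256, 256, 0, ppStream>>>(d_f, d_out, n);

    cudaStreamSynchronize(ppStream);
    printf("done\n");

    cudaEventDestroy(pmeForcesReady);
    cudaStreamDestroy(pmeStream);
    cudaStreamDestroy(ppStream);
    cudaFree(d_f);
    cudaFree(d_out);
    return 0;
}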