From: Alan Gray
Date: Sun, 1 Aug 2021 08:24:19 +0000 (+0000)
Subject: Remove MPI comm from GPU PME-PP force transfer initiation
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=4a1ed1360c1e5bb28a9f00c06ce35e88e6ea3cf9;p=alexxy%2Fgromacs.git

Remove MPI comm from GPU PME-PP force transfer initiation
---

diff --git a/src/gromacs/ewald/pme_force_sender_gpu.h b/src/gromacs/ewald/pme_force_sender_gpu.h
index 64c981ccb6..44933df9cc 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu.h
+++ b/src/gromacs/ewald/pme_force_sender_gpu.h
@@ -94,10 +94,11 @@ public:
     /*! \brief
      * Send force to PP rank (used with Thread-MPI)
-     * \param[in] ppRank PP rank to receive data
-     * \param[in] numAtoms number of atoms to send
+     * \param[in] ppRank                  PP rank to receive data
+     * \param[in] numAtoms                number of atoms to send
+     * \param[in] sendForcesDirectToPpGpu whether forces are transferred direct to remote GPU memory
      */
-    void sendFToPpCudaDirect(int ppRank, int numAtoms);
+    void sendFToPpCudaDirect(int ppRank, int numAtoms, bool sendForcesDirectToPpGpu);

     /*! \brief
      * Send force to PP rank (used with Lib-MPI)
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
index 5475c665da..1516d72d79 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
+++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
@@ -84,7 +84,9 @@ void PmeForceSenderGpu::setForceSendBuffer(DeviceBuffer<RVec> /* d_f */)
                "correct implementation.");
 }

-void PmeForceSenderGpu::sendFToPpCudaDirect(int /* ppRank */, int /* numAtoms */)
+void PmeForceSenderGpu::sendFToPpCudaDirect(int /* ppRank */,
+                                            int /* numAtoms */,
+                                            bool /* sendForcesDirectToPpGpu */)
 {
     GMX_ASSERT(!impl_,
                "A CPU stub for PME-PP GPU communication was called instead of the correct "
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
index 5356683083..64b3440d2e 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
+++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
@@ -59,14 +59,19 @@ PmeForceSenderGpu::Impl::Impl(GpuEventSynchronizer* pmeForcesReady,
                               MPI_Comm comm,
                               const DeviceContext& deviceContext,
                               gmx::ArrayRef<PpRanks> ppRanks) :
-    pmeForcesReady_(pmeForcesReady), comm_(comm), ppRanks_(ppRanks), deviceContext_(deviceContext)
+    pmeForcesReady_(pmeForcesReady),
+    comm_(comm),
+    ppRanks_(ppRanks),
+    deviceContext_(deviceContext),
+    ppCommStream_(ppRanks.size()),
+    ppCommEvent_(ppRanks.size()),
+    pmeRemoteGpuForcePtr_(ppRanks.size()),
+    pmeRemoteCpuForcePtr_(ppRanks.size())
 {
     // Create streams and events to manage pushing of force buffers to remote PP ranks
     std::unique_ptr<DeviceStream> stream;
     std::unique_ptr<GpuEventSynchronizer> event;
     size_t i = 0;
-    ppCommStream_.resize(ppRanks_.size());
-    ppCommEvent_.resize(ppRanks_.size());
     for (i = 0; i < ppRanks_.size(); i++)
     {
         stream = std::make_unique<DeviceStream>(deviceContext_, DeviceStreamPriority::High, false);
@@ -103,8 +108,14 @@ void PmeForceSenderGpu::Impl::setForceSendBuffer(DeviceBuffer<RVec> d_f)
         ind_start = ind_end;
         ind_end = ind_start + receiver.numAtoms;
-        localForcePtr_[i++] = &d_f[ind_start];
+        localForcePtr_[i] = &d_f[ind_start];
+        // NOLINTNEXTLINE(bugprone-sizeof-expression)
+        MPI_Recv(&pmeRemoteGpuForcePtr_[i], sizeof(float3*), MPI_BYTE, receiver.rankId, 0, comm_, MPI_STATUS_IGNORE);
+        // NOLINTNEXTLINE(bugprone-sizeof-expression)
+        MPI_Recv(&pmeRemoteCpuForcePtr_[i], sizeof(float3*), MPI_BYTE, receiver.rankId, 0, comm_, MPI_STATUS_IGNORE);
+        i++;
     }
+
 #else
     GMX_UNUSED_VALUE(d_f);
 #endif
@@ -112,16 +123,15 @@ void PmeForceSenderGpu::Impl::setForceSendBuffer(DeviceBuffer<RVec> d_f)

 /*! \brief Send PME synchronizer directly using CUDA memory copy */
-void PmeForceSenderGpu::Impl::sendFToPpCudaDirect(int ppRank, int numAtoms)
+void PmeForceSenderGpu::Impl::sendFToPpCudaDirect(int ppRank, int numAtoms, bool sendForcesDirectToPpGpu)
 {
     GMX_ASSERT(GMX_THREAD_MPI,
                "sendFToPpCudaDirect is expected to be called only for Thread-MPI");

 #if GMX_MPI
-    void* pmeRemoteForcePtr;
-    // NOLINTNEXTLINE(bugprone-sizeof-expression)
-    MPI_Recv(&pmeRemoteForcePtr, sizeof(void*), MPI_BYTE, ppRank, 0, comm_, MPI_STATUS_IGNORE);
+    float3* pmeRemoteForcePtr =
+            sendForcesDirectToPpGpu ? pmeRemoteGpuForcePtr_[ppRank] : pmeRemoteCpuForcePtr_[ppRank];

     pmeForcesReady_->enqueueWaitEvent(*ppCommStream_[ppRank]);
@@ -190,9 +200,9 @@ void PmeForceSenderGpu::sendFToPpCudaMpi(DeviceBuffer<RVec> sendbuf,
     impl_->sendFToPpCudaMpi(sendbuf, offset, numBytes, ppRank, request);
 }

-void PmeForceSenderGpu::sendFToPpCudaDirect(int ppRank, int numAtoms)
+void PmeForceSenderGpu::sendFToPpCudaDirect(int ppRank, int numAtoms, bool sendForcesDirectToPpGpu)
 {
-    impl_->sendFToPpCudaDirect(ppRank, numAtoms);
+    impl_->sendFToPpCudaDirect(ppRank, numAtoms, sendForcesDirectToPpGpu);
 }
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.h b/src/gromacs/ewald/pme_force_sender_gpu_impl.h
index 74954bdf1f..5575f03b07 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu_impl.h
+++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.h
@@ -80,10 +80,11 @@ public:
     /*! \brief
      * Send force to PP rank (used with Thread-MPI)
-     * \param[in] ppRank PP rank to receive data
-     * \param[in] numAtoms number of atoms to send
+     * \param[in] ppRank                  PP rank to receive data
+     * \param[in] numAtoms                number of atoms to send
+     * \param[in] sendForcesDirectToPpGpu whether forces are transferred direct to remote GPU memory
      */
-    void sendFToPpCudaDirect(int ppRank, int numAtoms);
+    void sendFToPpCudaDirect(int ppRank, int numAtoms, bool sendForcesDirectToPpGpu);

     /*! \brief
      * Send force to PP rank (used with Lib-MPI)
@@ -110,6 +111,10 @@ private:
     std::vector<DeviceBuffer<RVec>> localForcePtr_;
     //! GPU context handle (not used in CUDA)
     const DeviceContext& deviceContext_;
+    //! Vector of CPU force buffer pointers for multiple remote PP tasks
+    std::vector<float3*> pmeRemoteCpuForcePtr_;
+    //! Vector of GPU force buffer pointers for multiple remote PP tasks
+    std::vector<float3*> pmeRemoteGpuForcePtr_;
 };

 } // namespace gmx
diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp
index 50f5b48878..0e67991867 100644
--- a/src/gromacs/ewald/pme_only.cpp
+++ b/src/gromacs/ewald/pme_only.cpp
@@ -134,6 +134,8 @@ struct gmx_pme_pp
     /*! \brief whether GPU direct communications are active for PME-PP transfers */
     bool useGpuDirectComm = false;
+    /*! \brief whether GPU direct communications should send forces directly to remote GPU memory */
+    bool sendForcesDirectToPpGpu = false;
 };
 /*! \brief Initialize the PME-only side of the PME <-> PP communication */
@@ -288,6 +290,7 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t* pme,
     GMX_ASSERT(!pme_pp->useGpuDirectComm || (pme_pp->pmeForceSenderGpu != nullptr),
                "The use of GPU direct communication for PME-PP is enabled, "
                "but the PME GPU force reciever object does not exist");
+    pme_pp->sendForcesDirectToPpGpu = ((cnb.flags & PP_PME_RECVFTOGPU) != 0);

     if (cnb.flags & PP_PME_FINISH)
     {
@@ -558,7 +561,8 @@ static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
         if (GMX_THREAD_MPI)
         {
-            pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(receiver.rankId, receiver.numAtoms);
+            pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(
+                    receiver.rankId, receiver.numAtoms, pme_pp->sendForcesDirectToPpGpu);
         }
         else
         {
diff --git a/src/gromacs/ewald/pme_pp.cpp b/src/gromacs/ewald/pme_pp.cpp
index e8433399a2..3061b0afce 100644
--- a/src/gromacs/ewald/pme_pp.cpp
+++ b/src/gromacs/ewald/pme_pp.cpp
@@ -113,6 +113,7 @@ static void gmx_pme_send_coeffs_coords(t_forcerec* fr,
                                        bool useGpuPmePpComms,
                                        bool reinitGpuPmePpComms,
                                        bool sendCoordinatesFromGpu,
+                                       bool receiveForcesToGpu,
                                        GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
 {
     gmx_domdec_t* dd;
@@ -138,6 +139,10 @@ static void gmx_pme_send_coeffs_coords(t_forcerec* fr,
     if (useGpuPmePpComms)
     {
         flags |= PP_PME_GPUCOMMS;
+        if (receiveForcesToGpu)
+        {
+            flags |= PP_PME_RECVFTOGPU;
+        }
     }

     if (c_useDelayedWait)
@@ -257,6 +262,8 @@ static void gmx_pme_send_coeffs_coords(t_forcerec* fr,
     {
         if (reinitGpuPmePpComms)
         {
+            std::vector<gmx::RVec>& buffer = cr->dd->pmeForceReceiveBuffer;
+            buffer.resize(n);
             fr->pmePpCommGpu->reinit(n);
         }
@@ -357,6 +364,7 @@ void gmx_pme_send_parameters(const t_commrec* cr,
                                false,
                                false,
                                false,
+                               false,
                                nullptr);
 }
@@ -371,6 +379,7 @@ void gmx_pme_send_coordinates(t_forcerec* fr,
                               bool useGpuPmePpComms,
                               bool receiveCoordinateAddressFromPme,
                               bool sendCoordinatesFromGpu,
+                              bool receiveForcesToGpu,
                               GpuEventSynchronizer* coordinatesReadyOnDeviceEvent,
                               gmx_wallcycle* wcycle)
 {
@@ -400,6 +409,7 @@ void gmx_pme_send_coordinates(t_forcerec* fr,
                               useGpuPmePpComms,
                               receiveCoordinateAddressFromPme,
                               sendCoordinatesFromGpu,
+                              receiveForcesToGpu,
                               coordinatesReadyOnDeviceEvent);
     wallcycle_stop(wcycle, WallCycleCounter::PpPmeSendX);
@@ -410,7 +420,7 @@ void gmx_pme_send_finish(const t_commrec* cr)
     unsigned int flags = PP_PME_FINISH;
     gmx_pme_send_coeffs_coords(
-            nullptr, cr, flags, {}, {}, {}, {}, {}, {}, nullptr, gmx::ArrayRef<gmx::RVec>(), 0, 0, 0, 0, -1, false, false, false, nullptr);
+            nullptr, cr, flags, {}, {}, {}, {}, {}, {}, nullptr, gmx::ArrayRef<gmx::RVec>(), 0, 0, 0, 0, -1, false, false, false, false, nullptr);
 }

 void gmx_pme_send_switchgrid(const t_commrec* cr, ivec grid_size, real ewaldcoeff_q, real ewaldcoeff_lj)
diff --git a/src/gromacs/ewald/pme_pp.h b/src/gromacs/ewald/pme_pp.h
index 683dfc379d..6c8402311f 100644
--- a/src/gromacs/ewald/pme_pp.h
+++ b/src/gromacs/ewald/pme_pp.h
@@ -88,6 +88,7 @@ void gmx_pme_send_coordinates(t_forcerec* fr,
                               bool useGpuPmePpComms,
                               bool reinitGpuPmePpComms,
                               bool sendCoordinatesFromGpu,
+                              bool receiveForcesToGpu,
                               GpuEventSynchronizer* coordinatesReadyOnDeviceEvent,
                               gmx_wallcycle* wcycle);
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu.h b/src/gromacs/ewald/pme_pp_comm_gpu.h
index 454da6d659..a11f185a12 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu.h
+++ b/src/gromacs/ewald/pme_pp_comm_gpu.h
@@ -43,6 +43,7 @@ #define GMX_PME_PP_COMM_GPU_H
 #include <memory>
+#include <vector>

 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/math/vectypes.h"
@@ -66,12 +67,17 @@ class PmePpCommGpu
 public:
     /*! \brief Creates PME-PP GPU communication object
-     * \param[in] comm Communicator used for simulation
-     * \param[in] pmeRank Rank of PME task
-     * \param[in] deviceContext GPU context.
-     * \param[in] deviceStream GPU stream.
+     * \param[in] comm              Communicator used for simulation
+     * \param[in] pmeRank           Rank of PME task
+     * \param[in] pmeCpuForceBuffer Buffer for PME force in CPU memory
+     * \param[in] deviceContext     GPU context.
+     * \param[in] deviceStream      GPU stream.
      */
-    PmePpCommGpu(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext, const DeviceStream& deviceStream);
+    PmePpCommGpu(MPI_Comm comm,
+                 int pmeRank,
+                 std::vector<gmx::RVec>& pmeCpuForceBuffer,
+                 const DeviceContext& deviceContext,
+                 const DeviceStream& deviceStream);
     ~PmePpCommGpu();

     /*! \brief Perform steps required when buffer size changes
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
index e7339f9c7e..0a5c60a4dd 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
@@ -64,6 +64,7 @@ class PmePpCommGpu::Impl
 /*!\brief Constructor stub. */
 PmePpCommGpu::PmePpCommGpu(MPI_Comm /* comm */,
                            int /* pmeRank */,
+                           std::vector<gmx::RVec>& /* pmeCpuForceBuffer */,
                            const DeviceContext& /* deviceContext */,
                            const DeviceStream& /* deviceStream */) :
     impl_(nullptr)
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
index 8acb6aa671..50e9a4189f 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
@@ -59,14 +59,16 @@ namespace gmx
 {

-PmePpCommGpu::Impl::Impl(MPI_Comm comm,
-                         int pmeRank,
-                         const DeviceContext& deviceContext,
-                         const DeviceStream& deviceStream) :
+PmePpCommGpu::Impl::Impl(MPI_Comm comm,
+                         int pmeRank,
+                         std::vector<gmx::RVec>& pmeCpuForceBuffer,
+                         const DeviceContext& deviceContext,
+                         const DeviceStream& deviceStream) :
     deviceContext_(deviceContext),
     pmePpCommStream_(deviceStream),
     comm_(comm),
     pmeRank_(pmeRank),
+    pmeCpuForceBuffer_(pmeCpuForceBuffer),
     d_pmeForces_(nullptr)
 {
 }
@@ -75,36 +77,32 @@ PmePpCommGpu::Impl::~Impl() = default;

 void PmePpCommGpu::Impl::reinit(int size)
 {
+    // Reallocate device buffer used for staging PME force
+    reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, deviceContext_);
+    // This rank will access PME rank memory directly, so needs to receive the remote PME buffer addresses.
 #if GMX_MPI
     if (GMX_THREAD_MPI)
     {
-        // receive device buffer address from PME rank
+        // receive device coordinate buffer address from PME rank
         MPI_Recv(&remotePmeXBuffer_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
+        // send host and device force buffer addresses to PME rank
+        MPI_Send(&d_pmeForces_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_);
+        RVec* pmeCpuForceBufferData = pmeCpuForceBuffer_.data();
+        MPI_Send(&pmeCpuForceBufferData, sizeof(RVec*), MPI_BYTE, pmeRank_, 0, comm_);
     }
 #endif
-
-    // Reallocate buffer used for staging PME force on GPU
-    reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, deviceContext_);
 }

 // TODO make this asynchronous by splitting into this into
 // launchRecvForceFromPmeCudaDirect() and sycnRecvForceFromPmeCudaDirect()
-void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(float3* recvPtr, bool receivePmeForceToGpu)
+void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(bool receivePmeForceToGpu)
 {
 #if GMX_MPI
     // Remote PME task pushes GPU data directly data to this PP task.
-    void* localForcePtr = receivePmeForceToGpu ? static_cast<void*>(d_pmeForces_) : recvPtr;
-
-    // Send destination pointer to PME task. Do this every step since
-    // PME task is agostic as to whether destination is PP CPU or
-    // GPU.
-    // NOLINTNEXTLINE(bugprone-sizeof-expression)
-    MPI_Send(&localForcePtr, sizeof(void*), MPI_BYTE, pmeRank_, 0, comm_);
-
     // Recieve event from PME task after PME->PP force data push has
     // been scheduled and enqueue this to PP stream.
     GpuEventSynchronizer* eventptr;
@@ -143,7 +141,7 @@ void PmePpCommGpu::Impl::receiveForceFromPme(float3* recvPtr, int recvSize, bool
     float3* pmeForcePtr = receivePmeForceToGpu ? asFloat3(d_pmeForces_) : recvPtr;
     if (GMX_THREAD_MPI)
     {
-        receiveForceFromPmeCudaDirect(pmeForcePtr, receivePmeForceToGpu);
+        receiveForceFromPmeCudaDirect(receivePmeForceToGpu);
     }
     else
     {
@@ -221,11 +219,12 @@ GpuEventSynchronizer* PmePpCommGpu::Impl::getForcesReadySynchronizer()
     }
 }

-PmePpCommGpu::PmePpCommGpu(MPI_Comm comm,
-                           int pmeRank,
-                           const DeviceContext& deviceContext,
-                           const DeviceStream& deviceStream) :
-    impl_(new Impl(comm, pmeRank, deviceContext, deviceStream))
+PmePpCommGpu::PmePpCommGpu(MPI_Comm comm,
+                           int pmeRank,
+                           std::vector<gmx::RVec>& pmeCpuForceBuffer,
+                           const DeviceContext& deviceContext,
+                           const DeviceStream& deviceStream) :
+    impl_(new Impl(comm, pmeRank, pmeCpuForceBuffer, deviceContext, deviceStream))
 {
 }
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
index 3d3039db6d..f62faea933 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
@@ -58,12 +58,17 @@ class PmePpCommGpu::Impl
 public:
     /*! \brief Creates PME-PP GPU communication object.
      *
-     * \param[in] comm Communicator used for simulation
-     * \param[in] pmeRank Rank of PME task
-     * \param[in] deviceContext GPU context.
-     * \param[in] deviceStream GPU stream.
+     * \param[in] comm              Communicator used for simulation
+     * \param[in] pmeRank           Rank of PME task
+     * \param[in] pmeCpuForceBuffer Buffer for PME force in CPU memory
+     * \param[in] deviceContext     GPU context.
+     * \param[in] deviceStream      GPU stream.
      */
-    Impl(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext, const DeviceStream& deviceStream);
+    Impl(MPI_Comm comm,
+         int pmeRank,
+         std::vector<gmx::RVec>& pmeCpuForceBuffer,
+         const DeviceContext& deviceContext,
+         const DeviceStream& deviceStream);
     ~Impl();

     /*! \brief Perform steps required when buffer size changes
@@ -115,10 +120,9 @@ private:
     /*! \brief Pull force buffer directly from GPU memory on PME
      * rank to either GPU or CPU memory on PP task using CUDA
      * Memory copy. This method is used with Thread-MPI.
-     * \param[out] recvPtr CPU buffer to receive PME force data
      * \param[in] receivePmeForceToGpu Whether receive is to GPU, otherwise CPU
      */
-    void receiveForceFromPmeCudaDirect(float3* recvPtr, bool receivePmeForceToGpu);
+    void receiveForceFromPmeCudaDirect(bool receivePmeForceToGpu);

     /*! \brief Pull force buffer directly from GPU memory on PME
      * rank to either GPU or CPU memory on PP task using CUDA-aware
@@ -160,6 +164,8 @@ private:
     MPI_Comm comm_;
     //! Rank of PME task
     int pmeRank_ = -1;
+    //! Buffer for PME force on CPU
+    std::vector<gmx::RVec>& pmeCpuForceBuffer_;
     //! Buffer for staging PME force on GPU
     DeviceBuffer<RVec> d_pmeForces_;
     //! number of atoms in PME force staging array
diff --git a/src/gromacs/ewald/pme_pp_communication.h b/src/gromacs/ewald/pme_pp_communication.h
index d5fbd960d9..184ae94865 100644
--- a/src/gromacs/ewald/pme_pp_communication.h
+++ b/src/gromacs/ewald/pme_pp_communication.h
@@ -85,6 +85,8 @@ enum
 #define PP_PME_SWITCHGRID (1 << 11)
 #define PP_PME_RESETCOUNTERS (1 << 12)
 #define PP_PME_GPUCOMMS (1 << 13)
+// Whether PME forces are transferred directly to remote PP GPU memory in a specific step
+#define PP_PME_RECVFTOGPU (1 << 14)
 //@}

 /*! \brief Return values for gmx_pme_recv_q_x */
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 4210f696cd..563a36239e 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1374,6 +1374,7 @@ void do_force(FILE* fplog,
                                  simulationWork.useGpuPmePpCommunication,
                                  reinitGpuPmePpComms,
                                  pmeSendCoordinatesFromGpu,
+                                 stepWork.useGpuPmeFReduction,
                                  localXReadyOnDevice,
                                  wcycle);
     }
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index 9da0886029..42bab51cb8 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -1649,6 +1649,7 @@ int Mdrunner::mdrunner()
         fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(
                 cr->mpi_comm_mysim,
                 cr->dd->pme_nodeid,
+                cr->dd->pmeForceReceiveBuffer,
                 deviceStreamManager->context(),
                 deviceStreamManager->stream(DeviceStreamType::PmePpTransfer));
     }
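---
Illustrative note (not part of the patch): the change replaces a per-step MPI pointer exchange with a one-time exchange of buffer addresses. Previously the PP rank MPI_Send'ed its destination force pointer to the PME rank every step; now the PP rank sends its host and device force-buffer addresses once, in PmePpCommGpu::Impl::reinit(), the PME rank caches them per PP rank in setForceSendBuffer(), and sendFToPpCudaDirect() only selects the cached GPU or CPU pointer according to the new PP_PME_RECVFTOGPU / sendForcesDirectToPpGpu flag. The sketch below is a minimal model of that control flow in plain C++ (no CUDA, no MPI); PpSide, PmeSide, cacheRemotePointers and sendForces are hypothetical stand-ins for the GROMACS classes above, and std::memcpy stands in for the cudaMemcpyAsync push that thread-MPI's shared address space permits.

// sketch.cpp -- illustrative only; models one-time address exchange + per-step destination selection
#include <cstring>
#include <iostream>
#include <vector>

struct float3
{
    float x, y, z;
};

// Stand-in for the PP side (PmePpCommGpu): owns a "GPU" staging buffer and a CPU force buffer.
struct PpSide
{
    std::vector<float3> gpuForces; // stands in for d_pmeForces_ (device staging buffer)
    std::vector<float3> cpuForces; // stands in for cr->dd->pmeForceReceiveBuffer
};

// Stand-in for the PME side (PmeForceSenderGpu): caches the remote destination addresses once,
// instead of receiving a destination pointer from the PP rank on every step.
struct PmeSide
{
    float3* remoteGpuPtr = nullptr;
    float3* remoteCpuPtr = nullptr;

    // One-time exchange at (re)initialization, analogous to the MPI_Send/MPI_Recv pair
    // added in reinit() / setForceSendBuffer() above.
    void cacheRemotePointers(PpSide& pp)
    {
        remoteGpuPtr = pp.gpuForces.data();
        remoteCpuPtr = pp.cpuForces.data();
    }

    // Per-step send: only a branch on the cached pointers, no pointer traffic.
    void sendForces(const std::vector<float3>& pmeForces, bool sendForcesDirectToPpGpu)
    {
        float3* dst = sendForcesDirectToPpGpu ? remoteGpuPtr : remoteCpuPtr;
        std::memcpy(dst, pmeForces.data(), pmeForces.size() * sizeof(float3)); // stands in for cudaMemcpyAsync
    }
};

int main()
{
    const int numAtoms = 4;
    PpSide    pp{ std::vector<float3>(numAtoms), std::vector<float3>(numAtoms) };
    PmeSide   pme;
    pme.cacheRemotePointers(pp); // once per (re)initialization

    std::vector<float3> pmeForces(numAtoms, float3{ 1.0F, 2.0F, 3.0F });
    pme.sendForces(pmeForces, /*sendForcesDirectToPpGpu=*/true);  // step with GPU force reduction on the PP rank
    pme.sendForces(pmeForces, /*sendForcesDirectToPpGpu=*/false); // step reducing forces on the CPU

    std::cout << "gpu[0].x = " << pp.gpuForces[0].x << ", cpu[0].x = " << pp.cpuForces[0].x << '\n';
    return 0;
}

As in the commit, the steady-state path contains no communication calls for the pointer itself; the choice of destination is a cached lookup driven by the per-step flag.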