From 7f2aeec0cb3aee77b7be2c14be7ec63315b87045 Mon Sep 17 00:00:00 2001
From: Gaurav Garg
Date: Wed, 14 Apr 2021 11:20:07 +0530
Subject: [PATCH] Remove thread-MPI limitation for GPU direct PME-PP communication

Allows use of direct-GPU communication for PP-PME communication when
running with "real" MPI, including on multiple compute nodes, through
new CUDA-aware MPI communication code paths.

Implements part of #2891
Refs: #2915 #3960
---
 docs/install-guide/index.rst                  |  26 ++++
 .../ewald/pme_coordinate_receiver_gpu.h       |  14 ++-
 .../pme_coordinate_receiver_gpu_impl.cpp      |  12 +-
 .../ewald/pme_coordinate_receiver_gpu_impl.cu |  78 +++++++++---
 .../ewald/pme_coordinate_receiver_gpu_impl.h  |  14 ++-
 src/gromacs/ewald/pme_force_sender_gpu.h      |  12 +-
 .../ewald/pme_force_sender_gpu_impl.cpp       |  11 ++
 .../ewald/pme_force_sender_gpu_impl.cu        |  57 +++++++--
 src/gromacs/ewald/pme_force_sender_gpu_impl.h |  12 +-
 src/gromacs/ewald/pme_only.cpp                |  49 +++++---
 src/gromacs/ewald/pme_only.h                  |   3 +-
 src/gromacs/ewald/pme_pp.cpp                  |  19 +--
 src/gromacs/ewald/pme_pp_comm_gpu.h           |  20 ++-
 src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp    |  20 +--
 src/gromacs/ewald/pme_pp_comm_gpu_impl.cu     | 119 +++++++++++++-----
 src/gromacs/ewald/pme_pp_comm_gpu_impl.h      |  65 +++++++---
 src/gromacs/ewald/pme_pp_communication.h      |   3 +-
 src/gromacs/mdrun/runner.cpp                  |  21 +++-
 18 files changed, 431 insertions(+), 124 deletions(-)

diff --git a/docs/install-guide/index.rst b/docs/install-guide/index.rst
index c9666bf246..27859f51f7 100644
--- a/docs/install-guide/index.rst
+++ b/docs/install-guide/index.rst
@@ -223,6 +223,32 @@ and add ``-DGMX_MPI=on`` to the cmake options. It is possible to set the
 compiler to the MPI compiler wrapper but it is neither necessary
 nor recommended.
 
+CUDA-Aware MPI support
+~~~~~~~~~~~~~~~~~~~~~~
+
+In simulations using multiple NVIDIA GPUs, an MPI implementation with CUDA support
+(also called "CUDA-aware") allows communication to be performed directly between the
+distinct GPU memory spaces without staging through CPU memory, often
+resulting in higher bandwidth and lower latency communication. For
+more details, see `Introduction to CUDA-aware MPI
+`_.
+
+To use CUDA-aware MPI for direct GPU communication we recommend
+using the latest OpenMPI version (>=4.1.0) with the latest UCX version
+(>=1.10), since most GROMACS internal testing on CUDA-aware support has
+been performed using these versions. OpenMPI with CUDA-aware support can
+be built following the procedure in `these OpenMPI build instructions
+`_.
+
+With ``GMX_MPI=ON``, GROMACS attempts to automatically detect CUDA support
+in the underlying MPI library at compile time, and enables direct GPU
+communication when this is detected. However, there are some cases when
+GROMACS may fail to detect existing CUDA-aware support, in which case
+it can be manually enabled by setting environment variable ``GMX_FORCE_CUDA_AWARE_MPI=1``
+at runtime (although such cases still lack substantial
+testing, so we urge the user to carefully check correctness of results
+against those using default build options, and report any issues).
+
 CMake
 ^^^^^
 
diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h
index 94aefe8501..81f640df40 100644
--- a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h
+++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h
@@ -83,9 +83,19 @@ public:
     void receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank);
 
     /*!
\brief - * enqueue wait for coordinate data from PP ranks + * Used for lib MPI, receives co-ordinates from PP ranks + * \param[in] recvbuf coordinates buffer in GPU memory + * \param[in] numAtoms starting element in buffer + * \param[in] numBytes number of bytes to transfer + * \param[in] ppRank PP rank to send data */ - void enqueueWaitReceiveCoordinatesFromPpCudaDirect(); + void launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer recvbuf, int numAtoms, int numBytes, int ppRank); + + /*! \brief + * For lib MPI, wait for coordinates from PP ranks + * For thread MPI, enqueue PP co-ordinate transfer event into PME stream + */ + void synchronizeOnCoordinatesFromPpRanks(); private: class Impl; diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp index ae68eedca2..4e997d319b 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp @@ -89,7 +89,17 @@ void PmeCoordinateReceiverGpu::receiveCoordinatesSynchronizerFromPpCudaDirect(in "implementation."); } -void PmeCoordinateReceiverGpu::enqueueWaitReceiveCoordinatesFromPpCudaDirect() +void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer /* recvbuf */, + int /* numAtoms */, + int /* numBytes */, + int /* ppRank */) +{ + GMX_ASSERT(!impl_, + "A CPU stub for PME-PP GPU communication was called instead of the correct " + "implementation."); +} + +void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRanks() { GMX_ASSERT(!impl_, "A CPU stub for PME-PP GPU communication was called instead of the correct " diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu index 10f48f1d97..7fa2122dfb 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu @@ -43,6 +43,7 @@ */ #include "gmxpre.h" +#include "gromacs/ewald/pme_pp_communication.h" #include "pme_coordinate_receiver_gpu_impl.h" #include "config.h" @@ -62,9 +63,6 @@ PmeCoordinateReceiverGpu::Impl::Impl(const DeviceStream& pmeStream, comm_(comm), ppRanks_(ppRanks) { - GMX_RELEASE_ASSERT( - GMX_THREAD_MPI, - "PME-PP GPU Communication is currently only supported with thread-MPI enabled"); request_.resize(ppRanks.size()); ppSync_.resize(ppRanks.size()); } @@ -73,28 +71,34 @@ PmeCoordinateReceiverGpu::Impl::~Impl() = default; void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(DeviceBuffer d_x) { - - int ind_start = 0; - int ind_end = 0; - for (const auto& receiver : ppRanks_) + // Need to send address to PP rank only for thread-MPI as PP rank pushes data using cudamemcpy + if (GMX_THREAD_MPI) { - ind_start = ind_end; - ind_end = ind_start + receiver.numAtoms; - - // Data will be transferred directly from GPU. - void* sendBuf = reinterpret_cast(&d_x[ind_start]); + int ind_start = 0; + int ind_end = 0; + for (const auto& receiver : ppRanks_) + { + ind_start = ind_end; + ind_end = ind_start + receiver.numAtoms; + // Data will be transferred directly from GPU. + void* sendBuf = reinterpret_cast(&d_x[ind_start]); #if GMX_MPI - MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_); + MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_); #else - GMX_UNUSED_VALUE(sendBuf); + GMX_UNUSED_VALUE(sendBuf); #endif + } } } /*! \brief Receive coordinate synchronizer pointer from the PP ranks. 
*/ void PmeCoordinateReceiverGpu::Impl::receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank) { + GMX_ASSERT(GMX_THREAD_MPI, + "receiveCoordinatesSynchronizerFromPpCudaDirect is expected to be called only for " + "Thread-MPI"); + // Data will be pushed directly from PP task #if GMX_MPI @@ -106,18 +110,44 @@ void PmeCoordinateReceiverGpu::Impl::receiveCoordinatesSynchronizerFromPpCudaDir #endif } -void PmeCoordinateReceiverGpu::Impl::enqueueWaitReceiveCoordinatesFromPpCudaDirect() +/*! \brief Receive coordinate data using CUDA-aware MPI */ +void PmeCoordinateReceiverGpu::Impl::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer recvbuf, + int numAtoms, + int numBytes, + int ppRank) +{ + GMX_ASSERT(GMX_LIB_MPI, + "launchReceiveCoordinatesFromPpCudaMpi is expected to be called only for Lib-MPI"); + +#if GMX_MPI + MPI_Irecv(&recvbuf[numAtoms], numBytes, MPI_BYTE, ppRank, eCommType_COORD_GPU, comm_, &request_[recvCount_++]); +#else + GMX_UNUSED_VALUE(recvbuf); + GMX_UNUSED_VALUE(numAtoms); + GMX_UNUSED_VALUE(numBytes); + GMX_UNUSED_VALUE(ppRank); +#endif +} + +void PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromPpRanks() { if (recvCount_ > 0) { - // ensure PME calculation doesn't commence until coordinate data has been transferred + // ensure PME calculation doesn't commence until coordinate data/remote events + // has been transferred #if GMX_MPI MPI_Waitall(recvCount_, request_.data(), MPI_STATUS_IGNORE); #endif - for (int i = 0; i < recvCount_; i++) + + // Make PME stream wait on PP to PME data trasnfer events + if (GMX_THREAD_MPI) { - ppSync_[i]->enqueueWaitEvent(pmeStream_); + for (int i = 0; i < recvCount_; i++) + { + ppSync_[i]->enqueueWaitEvent(pmeStream_); + } } + // reset receive counter recvCount_ = 0; } @@ -142,9 +172,17 @@ void PmeCoordinateReceiverGpu::receiveCoordinatesSynchronizerFromPpCudaDirect(in impl_->receiveCoordinatesSynchronizerFromPpCudaDirect(ppRank); } -void PmeCoordinateReceiverGpu::enqueueWaitReceiveCoordinatesFromPpCudaDirect() +void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer recvbuf, + int numAtoms, + int numBytes, + int ppRank) +{ + impl_->launchReceiveCoordinatesFromPpCudaMpi(recvbuf, numAtoms, numBytes, ppRank); +} + +void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRanks() { - impl_->enqueueWaitReceiveCoordinatesFromPpCudaDirect(); + impl_->synchronizeOnCoordinatesFromPpRanks(); } } // namespace gmx diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h index 0f4ca21fa3..604079c0b0 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h @@ -79,9 +79,19 @@ public: void receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank); /*! \brief - * enqueue wait for coordinate data from PP ranks + * Used for lib MPI, receives co-ordinates from PP ranks + * \param[in] recvbuf coordinates buffer in GPU memory + * \param[in] numAtoms starting element in buffer + * \param[in] numBytes number of bytes to transfer + * \param[in] ppRank PP rank to send data */ - void enqueueWaitReceiveCoordinatesFromPpCudaDirect(); + void launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer recvbuf, int numAtoms, int numBytes, int ppRank); + + /*! \brief + * For lib MPI, wait for coordinates from PP ranks + * For thread MPI, enqueue PP co-ordinate transfer event into PME stream + */ + void synchronizeOnCoordinatesFromPpRanks(); private: //! 
CUDA stream for PME operations diff --git a/src/gromacs/ewald/pme_force_sender_gpu.h b/src/gromacs/ewald/pme_force_sender_gpu.h index edced61504..e06f582ae8 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu.h +++ b/src/gromacs/ewald/pme_force_sender_gpu.h @@ -87,11 +87,21 @@ public: void sendForceBufferAddressToPpRanks(DeviceBuffer d_f); /*! \brief - * Send force synchronizer to PP rank + * Send force synchronizer to PP rank (used with Thread-MPI) * \param[in] ppRank PP rank to receive data */ void sendFSynchronizerToPpCudaDirect(int ppRank); + /*! \brief + * Send force to PP rank (used with Lib-MPI) + * \param[in] sendbuf force buffer in GPU memory + * \param[in] offset starting element in buffer + * \param[in] numBytes number of bytes to transfer + * \param[in] ppRank PP rank to receive data + * \param[in] request MPI request to track asynchronous MPI call status + */ + void sendFToPpCudaMpi(DeviceBuffer sendbuf, int offset, int numBytes, int ppRank, MPI_Request* request); + private: class Impl; std::unique_ptr impl_; diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp index 0365a5563c..f7cd9c5cc4 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp +++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp @@ -90,6 +90,17 @@ void PmeForceSenderGpu::sendFSynchronizerToPpCudaDirect(int /* ppRank */) "implementation."); } +void PmeForceSenderGpu::sendFToPpCudaMpi(DeviceBuffer /* sendbuf */, + int /* offset */, + int /* numBytes */, + int /* ppRank */, + MPI_Request* /* request */) +{ + GMX_ASSERT(!impl_, + "A CPU stub for PME-PP GPU communication was called instead of the correct " + "implementation."); +} + } // namespace gmx #endif // !GMX_GPU_CUDA diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu index 753ac483f5..b124c03136 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu +++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu @@ -62,9 +62,6 @@ PmeForceSenderGpu::Impl::Impl(GpuEventSynchronizer* pmeForcesReady, comm_(comm), ppRanks_(ppRanks) { - GMX_RELEASE_ASSERT( - GMX_THREAD_MPI, - "PME-PP GPU Communication is currently only supported with thread-MPI enabled"); } PmeForceSenderGpu::Impl::~Impl() = default; @@ -72,6 +69,13 @@ PmeForceSenderGpu::Impl::~Impl() = default; /*! \brief sends force buffer address to PP ranks */ void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(DeviceBuffer d_f) { + // Need to send address to PP rank only for thread-MPI as PP rank pulls + // data using cudamemcpy + if (!GMX_THREAD_MPI) + { + return; + } +#if GMX_MPI int ind_start = 0; int ind_end = 0; for (const auto& receiver : ppRanks_) @@ -80,30 +84,56 @@ void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(DeviceBuffer(&d_f[ind_start]); + Float3* sendBuf = &d_f[ind_start]; -#if GMX_MPI - MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_); + MPI_Send(&sendBuf, sizeof(Float3*), MPI_BYTE, receiver.rankId, 0, comm_); + } #else - GMX_UNUSED_VALUE(sendBuf); + GMX_UNUSED_VALUE(d_f); #endif - } } /*! 
\brief Send PME synchronizer directly using CUDA memory copy */ void PmeForceSenderGpu::Impl::sendFSynchronizerToPpCudaDirect(int ppRank) { + GMX_ASSERT(GMX_THREAD_MPI, + "sendFSynchronizerToPpCudaDirect is expected to be called only for Thread-MPI"); + // Data will be pulled directly from PP task #if GMX_MPI // TODO Using MPI_Isend would be more efficient, particularly when // sending to multiple PP ranks MPI_Send(&pmeForcesReady_, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_); #else - GMX_UNUSED_VALUE(pmeSyncPtr); GMX_UNUSED_VALUE(ppRank); #endif } +/*! \brief Send PME data directly using CUDA-aware MPI */ +void PmeForceSenderGpu::Impl::sendFToPpCudaMpi(DeviceBuffer sendbuf, + int offset, + int numBytes, + int ppRank, + MPI_Request* request) +{ + GMX_ASSERT(GMX_LIB_MPI, "sendFToPpCudaMpi is expected to be called only for Lib-MPI"); + +#if GMX_MPI + // if using GPU direct comm with CUDA-aware MPI, make sure forces are ready on device + // before sending it to PP ranks + pmeForcesReady_->waitForEvent(); + + MPI_Isend(sendbuf[offset], numBytes, MPI_BYTE, ppRank, 0, comm_, request); + +#else + GMX_UNUSED_VALUE(sendbuf); + GMX_UNUSED_VALUE(offset); + GMX_UNUSED_VALUE(numBytes); + GMX_UNUSED_VALUE(ppRank); + GMX_UNUSED_VALUE(request); +#endif +} + PmeForceSenderGpu::PmeForceSenderGpu(GpuEventSynchronizer* pmeForcesReady, MPI_Comm comm, gmx::ArrayRef ppRanks) : @@ -123,5 +153,14 @@ void PmeForceSenderGpu::sendFSynchronizerToPpCudaDirect(int ppRank) impl_->sendFSynchronizerToPpCudaDirect(ppRank); } +void PmeForceSenderGpu::sendFToPpCudaMpi(DeviceBuffer sendbuf, + int offset, + int numBytes, + int ppRank, + MPI_Request* request) +{ + impl_->sendFToPpCudaMpi(sendbuf, offset, numBytes, ppRank, request); +} + } // namespace gmx diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.h b/src/gromacs/ewald/pme_force_sender_gpu_impl.h index 9ff0a15a9a..0e0ad8122c 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu_impl.h +++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.h @@ -74,11 +74,21 @@ public: void sendForceBufferAddressToPpRanks(DeviceBuffer d_f); /*! \brief - * Send force synchronizer to PP rank + * Send force synchronizer to PP rank (used with Thread-MPI) * \param[in] ppRank PP rank to receive data */ void sendFSynchronizerToPpCudaDirect(int ppRank); + /*! \brief + * Send force to PP rank (used with Lib-MPI) + * \param[in] sendbuf force buffer in GPU memory + * \param[in] offset starting element in buffer + * \param[in] numBytes number of bytes to transfer + * \param[in] ppRank PP rank to receive data + * \param[in] request MPI request to track asynchronous MPI call status + */ + void sendFToPpCudaMpi(DeviceBuffer sendbuf, int offset, int numBytes, int ppRank, MPI_Request* request); + private: //! Event indicating when PME forces are ready on the GPU in order for PP stream to sync with the PME stream GpuEventSynchronizer* pmeForcesReady_; diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp index 5130034a9f..64f685ab44 100644 --- a/src/gromacs/ewald/pme_only.cpp +++ b/src/gromacs/ewald/pme_only.cpp @@ -104,10 +104,6 @@ #include "pme_output.h" #include "pme_pp_communication.h" -/*! \brief environment variable to enable GPU P2P communication */ -static const bool c_enableGpuPmePpComms = - GMX_GPU_CUDA && GMX_THREAD_MPI && (getenv("GMX_GPU_PME_PP_COMMS") != nullptr); - /*! 
\brief Master PP-PME communication data structure */ struct gmx_pme_pp { @@ -466,8 +462,16 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t* pme, { if (pme_pp->useGpuDirectComm) { - pme_pp->pmeCoordinateReceiverGpu->receiveCoordinatesSynchronizerFromPpCudaDirect( - sender.rankId); + if (GMX_THREAD_MPI) + { + pme_pp->pmeCoordinateReceiverGpu->receiveCoordinatesSynchronizerFromPpCudaDirect( + sender.rankId); + } + else + { + pme_pp->pmeCoordinateReceiverGpu->launchReceiveCoordinatesFromPpCudaMpi( + stateGpu->getCoordinates(), nat, sender.numAtoms * sizeof(rvec), sender.rankId); + } } else { @@ -493,7 +497,7 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t* pme, if (pme_pp->useGpuDirectComm) { - pme_pp->pmeCoordinateReceiverGpu->enqueueWaitReceiveCoordinatesFromPpCudaDirect(); + pme_pp->pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromPpRanks(); } status = pmerecvqxX; @@ -531,7 +535,8 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t* pme, } /*! \brief Send the PME mesh force, virial and energy to the PP-only ranks. */ -static void gmx_pme_send_force_vir_ener(gmx_pme_pp* pme_pp, +static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme, + gmx_pme_pp* pme_pp, const PmeOutput& output, real dvdlambda_q, real dvdlambda_lj, @@ -547,18 +552,32 @@ static void gmx_pme_send_force_vir_ener(gmx_pme_pp* pme_pp, ind_end = 0; for (const auto& receiver : pme_pp->ppRanks) { - ind_start = ind_end; - ind_end = ind_start + receiver.numAtoms; - void* sendbuf = const_cast(static_cast(output.forces_[ind_start])); + ind_start = ind_end; + ind_end = ind_start + receiver.numAtoms; if (pme_pp->useGpuDirectComm) { GMX_ASSERT((pme_pp->pmeForceSenderGpu != nullptr), "The use of GPU direct communication for PME-PP is enabled, " "but the PME GPU force reciever object does not exist"); - pme_pp->pmeForceSenderGpu->sendFSynchronizerToPpCudaDirect(receiver.rankId); + + if (GMX_THREAD_MPI) + { + pme_pp->pmeForceSenderGpu->sendFSynchronizerToPpCudaDirect(receiver.rankId); + } + else + { + pme_pp->pmeForceSenderGpu->sendFToPpCudaMpi(pme_gpu_get_device_f(&pme), + ind_start, + receiver.numAtoms * sizeof(rvec), + receiver.rankId, + &pme_pp->req[messages]); + + messages++; + } } else { + void* sendbuf = const_cast(static_cast(output.forces_[ind_start])); // Send using MPI MPI_Isend(sendbuf, receiver.numAtoms * sizeof(rvec), @@ -593,6 +612,7 @@ static void gmx_pme_send_force_vir_ener(gmx_pme_pp* pme_pp, MPI_Waitall(messages, pme_pp->req.data(), pme_pp->stat.data()); #else GMX_RELEASE_ASSERT(false, "Invalid call to gmx_pme_send_force_vir_ener"); + GMX_UNUSED_VALUE(pme); GMX_UNUSED_VALUE(pme_pp); GMX_UNUSED_VALUE(output); GMX_UNUSED_VALUE(dvdlambda_q); @@ -608,6 +628,7 @@ int gmx_pmeonly(struct gmx_pme_t* pme, gmx_walltime_accounting_t walltime_accounting, t_inputrec* ir, PmeRunMode runMode, + bool useGpuPmePpCommunication, const gmx::DeviceStreamManager* deviceStreamManager) { int ret; @@ -640,7 +661,7 @@ int gmx_pmeonly(struct gmx_pme_t* pme, "Device stream can not be nullptr when using GPU in PME-only rank"); changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy()); changePinningPolicy(&pme_pp->x, pme_get_pinning_policy()); - if (c_enableGpuPmePpComms) + if (useGpuPmePpCommunication) { pme_pp->pmeCoordinateReceiverGpu = std::make_unique( deviceStreamManager->stream(gmx::DeviceStreamType::Pme), @@ -780,7 +801,7 @@ int gmx_pmeonly(struct gmx_pme_t* pme, } cycles = wallcycle_stop(wcycle, WallCycleCounter::PmeMesh); - gmx_pme_send_force_vir_ener(pme_pp.get(), output, dvdlambda_q, dvdlambda_lj, 
cycles); + gmx_pme_send_force_vir_ener(*pme, pme_pp.get(), output, dvdlambda_q, dvdlambda_lj, cycles); count++; } /***** end of quasi-loop, we stop with the break above */ diff --git a/src/gromacs/ewald/pme_only.h b/src/gromacs/ewald/pme_only.h index 1a71ea195c..150a5dad55 100644 --- a/src/gromacs/ewald/pme_only.h +++ b/src/gromacs/ewald/pme_only.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2020, by the GROMACS development team, led by + * Copyright (c) 2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -69,6 +69,7 @@ int gmx_pmeonly(gmx_pme_t* pme, gmx_walltime_accounting_t walltime_accounting, t_inputrec* ir, PmeRunMode runMode, + bool useGpuPmePpCommunication, const gmx::DeviceStreamManager* deviceStreamManager); #endif diff --git a/src/gromacs/ewald/pme_pp.cpp b/src/gromacs/ewald/pme_pp.cpp index 63693ed6a6..11ca5faf72 100644 --- a/src/gromacs/ewald/pme_pp.cpp +++ b/src/gromacs/ewald/pme_pp.cpp @@ -265,11 +265,16 @@ static void gmx_pme_send_coeffs_coords(t_forcerec* fr, real* xRealPtr = const_cast(x[0]); if (useGpuPmePpComms && (fr != nullptr)) { - void* sendPtr = sendCoordinatesFromGpu - ? static_cast(fr->stateGpu->getCoordinates()) - : static_cast(xRealPtr); - fr->pmePpCommGpu->sendCoordinatesToPmeCudaDirect( - sendPtr, n, sendCoordinatesFromGpu, coordinatesReadyOnDeviceEvent); + if (sendCoordinatesFromGpu) + { + fr->pmePpCommGpu->sendCoordinatesToPmeFromGpu( + fr->stateGpu->getCoordinates(), n, coordinatesReadyOnDeviceEvent); + } + else + { + fr->pmePpCommGpu->sendCoordinatesToPmeFromCpu( + reinterpret_cast(xRealPtr), n, coordinatesReadyOnDeviceEvent); + } } else { @@ -509,8 +514,8 @@ static void recvFFromPme(gmx::PmePpCommGpu* pmePpCommGpu, if (useGpuPmePpComms) { GMX_ASSERT(pmePpCommGpu != nullptr, "Need valid pmePpCommGpu"); - // Receive directly using CUDA memory copy - pmePpCommGpu->receiveForceFromPmeCudaDirect(recvptr, n, receivePmeForceToGpu); + // Receive forces from PME rank + pmePpCommGpu->receiveForceFromPme(static_cast(recvptr), n, receivePmeForceToGpu); } else { diff --git a/src/gromacs/ewald/pme_pp_comm_gpu.h b/src/gromacs/ewald/pme_pp_comm_gpu.h index 886e0c221b..1e4e614cd9 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu.h +++ b/src/gromacs/ewald/pme_pp_comm_gpu.h @@ -45,6 +45,7 @@ #include #include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gromacs/math/vectypes.h" #include "gromacs/utility/gmxmpi.h" class DeviceContext; @@ -84,18 +85,25 @@ public: * \param[in] recvSize Number of elements to receive * \param[in] recvPmeForceToGpu Whether receive is to GPU, otherwise CPU */ - void receiveForceFromPmeCudaDirect(void* recvPtr, int recvSize, bool recvPmeForceToGpu); + void receiveForceFromPme(RVec* recvPtr, int recvSize, bool recvPmeForceToGpu); /*! 
\brief Push coordinates buffer directly to GPU memory on PME task * \param[in] sendPtr Buffer with coordinate data * \param[in] sendSize Number of elements to send - * \param[in] sendPmeCoordinatesFromGpu Whether send is from GPU, otherwise CPU * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device */ - void sendCoordinatesToPmeCudaDirect(void* sendPtr, - int sendSize, - bool sendPmeCoordinatesFromGpu, - GpuEventSynchronizer* coordinatesReadyOnDeviceEvent); + void sendCoordinatesToPmeFromGpu(DeviceBuffer sendPtr, + int sendSize, + GpuEventSynchronizer* coordinatesReadyOnDeviceEvent); + + /*! \brief Push coordinates buffer from host memory directly to GPU memory on PME task + * \param[in] sendPtr Buffer with coordinate data + * \param[in] sendSize Number of elements to send + * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device + */ + void sendCoordinatesToPmeFromCpu(RVec* sendPtr, + int sendSize, + GpuEventSynchronizer* coordinatesReadyOnDeviceEvent); /*! \brief * Return pointer to buffer used for staging PME force on GPU diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp index d31b976c10..e7339f9c7e 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp @@ -83,19 +83,25 @@ void PmePpCommGpu::reinit(int /* size */) "correct implementation."); } -void PmePpCommGpu::receiveForceFromPmeCudaDirect(void* /* recvPtr */, - int /* recvSize */, - bool /* receivePmeForceToGpu */) +void PmePpCommGpu::receiveForceFromPme(RVec* /* recvPtr */, int /* recvSize */, bool /* receivePmeForceToGpu */) { GMX_ASSERT(!impl_, "A CPU stub for PME-PP GPU communication was called instead of the correct " "implementation."); } -void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void* /* sendPtr */, - int /* sendSize */, - bool /* sendPmeCoordinatesFromGpu */, - GpuEventSynchronizer* /* coordinatesOnDeviceEvent */) +void PmePpCommGpu::sendCoordinatesToPmeFromGpu(DeviceBuffer /* sendPtr */, + int /* sendSize */, + GpuEventSynchronizer* /* coordinatesOnDeviceEvent */) +{ + GMX_ASSERT(!impl_, + "A CPU stub for PME-PP GPU communication was called instead of the correct " + "implementation."); +} + +void PmePpCommGpu::sendCoordinatesToPmeFromCpu(RVec* /* sendPtr */, + int /* sendSize */, + GpuEventSynchronizer* /* coordinatesOnDeviceEvent */) { GMX_ASSERT(!impl_, "A CPU stub for PME-PP GPU communication was called instead of the correct " diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu index cb9e787c44..2e242a074f 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu @@ -43,6 +43,7 @@ */ #include "gmxpre.h" +#include "gromacs/ewald/pme_pp_communication.h" #include "pme_pp_comm_gpu_impl.h" #include "config.h" @@ -52,6 +53,7 @@ #include "gromacs/gpu_utils/device_stream.h" #include "gromacs/gpu_utils/devicebuffer.h" #include "gromacs/gpu_utils/gpueventsynchronizer.cuh" +#include "gromacs/gpu_utils/typecasts.cuh" #include "gromacs/utility/gmxmpi.h" namespace gmx @@ -67,9 +69,6 @@ PmePpCommGpu::Impl::Impl(MPI_Comm comm, pmeRank_(pmeRank), d_pmeForces_(nullptr) { - GMX_RELEASE_ASSERT( - GMX_THREAD_MPI, - "PME-PP GPU Communication is currently only supported with thread-MPI enabled"); } PmePpCommGpu::Impl::~Impl() = default; @@ -78,20 +77,22 @@ void PmePpCommGpu::Impl::reinit(int size) { // This rank will access PME rank memory directly, so needs 
to receive the remote PME buffer addresses. #if GMX_MPI - MPI_Recv(&remotePmeXBuffer_, sizeof(void**), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE); - MPI_Recv(&remotePmeFBuffer_, sizeof(void**), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE); + + if (GMX_THREAD_MPI) + { + // receive device buffer address from PME rank + MPI_Recv(&remotePmeXBuffer_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE); + MPI_Recv(&remotePmeFBuffer_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE); + } + +#endif // Reallocate buffer used for staging PME force on GPU reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, deviceContext_); -#else - GMX_UNUSED_VALUE(size); -#endif return; } -// TODO make this asynchronous by splitting into this into -// launchRecvForceFromPmeCudaDirect() and sycnRecvForceFromPmeCudaDirect() -void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSize, bool receivePmeForceToGpu) +void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(float3* pmeForcePtr, int recvSize, bool receivePmeForceToGpu) { #if GMX_MPI // Receive event from PME task and add to stream, to ensure pull of data doesn't @@ -99,10 +100,10 @@ void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSi GpuEventSynchronizer* pmeSync; MPI_Recv(&pmeSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE); pmeSync->enqueueWaitEvent(pmePpCommStream_); +#endif // Pull force data from remote GPU - void* pmeForcePtr = receivePmeForceToGpu ? static_cast(d_pmeForces_) : recvPtr; - cudaError_t stat = cudaMemcpyAsync(pmeForcePtr, + cudaError_t stat = cudaMemcpyAsync(pmeForcePtr, remotePmeFBuffer_, recvSize * DIM * sizeof(float), cudaMemcpyDefault, @@ -120,21 +121,37 @@ void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSi { // Ensure CPU waits for PME forces to be copied before reducing // them with other forces on the CPU - cudaStreamSynchronize(pmePpCommStream_.stream()); + pmePpCommStream_.synchronize(); } +} + +void PmePpCommGpu::Impl::receiveForceFromPmeCudaMpi(float3* pmeForcePtr, int recvSize) +{ +#if GMX_MPI + MPI_Recv(pmeForcePtr, recvSize * DIM, MPI_FLOAT, pmeRank_, 0, comm_, MPI_STATUS_IGNORE); #else - GMX_UNUSED_VALUE(recvPtr); + GMX_UNUSED_VALUE(pmeForcePtr); GMX_UNUSED_VALUE(recvSize); - GMX_UNUSED_VALUE(receivePmeForceToGpu); #endif } -void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr, - int sendSize, - bool gmx_unused sendPmeCoordinatesFromGpu, +void PmePpCommGpu::Impl::receiveForceFromPme(float3* recvPtr, int recvSize, bool receivePmeForceToGpu) +{ + float3* pmeForcePtr = receivePmeForceToGpu ? 
asFloat3(d_pmeForces_) : recvPtr; + if (GMX_THREAD_MPI) + { + receiveForceFromPmeCudaDirect(pmeForcePtr, recvSize, receivePmeForceToGpu); + } + else + { + receiveForceFromPmeCudaMpi(pmeForcePtr, recvSize); + } +} + +void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(float3* sendPtr, + int sendSize, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent) { -#if GMX_MPI // ensure stream waits until coordinate data is available on device coordinatesReadyOnDeviceEvent->enqueueWaitEvent(pmePpCommStream_); @@ -145,18 +162,44 @@ void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr, pmePpCommStream_.stream()); CU_RET_ERR(stat, "cudaMemcpyAsync on Send to PME CUDA direct data transfer failed"); +#if GMX_MPI // Record and send event to allow PME task to sync to above transfer before commencing force calculations pmeCoordinatesSynchronizer_.markEvent(pmePpCommStream_); GpuEventSynchronizer* pmeSync = &pmeCoordinatesSynchronizer_; MPI_Send(&pmeSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_); +#endif +} + +void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaMpi(float3* sendPtr, + int sendSize, + GpuEventSynchronizer* coordinatesReadyOnDeviceEvent) +{ + // ensure coordinate data is available on device before we start transfer + coordinatesReadyOnDeviceEvent->waitForEvent(); + +#if GMX_MPI + float3* sendptr_x = sendPtr; + + MPI_Send(sendptr_x, sendSize * DIM, MPI_FLOAT, pmeRank_, eCommType_COORD_GPU, comm_); #else GMX_UNUSED_VALUE(sendPtr); GMX_UNUSED_VALUE(sendSize); - GMX_UNUSED_VALUE(sendPmeCoordinatesFromGpu); - GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent); #endif } +void PmePpCommGpu::Impl::sendCoordinatesToPme(float3* sendPtr, + int sendSize, + GpuEventSynchronizer* coordinatesReadyOnDeviceEvent) +{ + if (GMX_THREAD_MPI) + { + sendCoordinatesToPmeCudaDirect(sendPtr, sendSize, coordinatesReadyOnDeviceEvent); + } + else + { + sendCoordinatesToPmeCudaMpi(sendPtr, sendSize, coordinatesReadyOnDeviceEvent); + } +} DeviceBuffer PmePpCommGpu::Impl::getGpuForceStagingPtr() { return d_pmeForces_; @@ -164,7 +207,14 @@ DeviceBuffer PmePpCommGpu::Impl::getGpuForceStagingPtr() GpuEventSynchronizer* PmePpCommGpu::Impl::getForcesReadySynchronizer() { - return &forcesReadySynchronizer_; + if (GMX_THREAD_MPI) + { + return &forcesReadySynchronizer_; + } + else + { + return nullptr; + } } PmePpCommGpu::PmePpCommGpu(MPI_Comm comm, @@ -182,21 +232,26 @@ void PmePpCommGpu::reinit(int size) impl_->reinit(size); } -void PmePpCommGpu::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSize, bool receivePmeForceToGpu) +void PmePpCommGpu::receiveForceFromPme(RVec* recvPtr, int recvSize, bool receivePmeForceToGpu) +{ + impl_->receiveForceFromPme(asFloat3(recvPtr), recvSize, receivePmeForceToGpu); +} + +void PmePpCommGpu::sendCoordinatesToPmeFromGpu(DeviceBuffer sendPtr, + int sendSize, + GpuEventSynchronizer* coordinatesReadyOnDeviceEvent) { - impl_->receiveForceFromPmeCudaDirect(recvPtr, recvSize, receivePmeForceToGpu); + impl_->sendCoordinatesToPme(asFloat3(sendPtr), sendSize, coordinatesReadyOnDeviceEvent); } -void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void* sendPtr, - int sendSize, - bool sendPmeCoordinatesFromGpu, - GpuEventSynchronizer* coordinatesReadyOnDeviceEvent) +void PmePpCommGpu::sendCoordinatesToPmeFromCpu(RVec* sendPtr, + int sendSize, + GpuEventSynchronizer* coordinatesReadyOnDeviceEvent) { - impl_->sendCoordinatesToPmeCudaDirect( - sendPtr, sendSize, sendPmeCoordinatesFromGpu, coordinatesReadyOnDeviceEvent); + impl_->sendCoordinatesToPme(asFloat3(sendPtr), 
sendSize, coordinatesReadyOnDeviceEvent); } -DeviceBuffer PmePpCommGpu::getGpuForceStagingPtr() +DeviceBuffer PmePpCommGpu::getGpuForceStagingPtr() { return impl_->getGpuForceStagingPtr(); } diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h index 70ef8f937c..d4ee85872e 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h @@ -44,9 +44,7 @@ #define GMX_PME_PP_COMM_GPU_IMPL_H #include "gromacs/ewald/pme_pp_comm_gpu.h" -#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/gpueventsynchronizer.cuh" -#include "gromacs/gpu_utils/gputraits.h" #include "gromacs/math/vectypes.h" #include "gromacs/utility/gmxmpi.h" @@ -75,7 +73,7 @@ public: /*! \brief Pull force buffer directly from GPU memory on PME * rank to either GPU or CPU memory on PP task using CUDA - * Memory copy. + * Memory copy or CUDA-aware MPI. * * recvPtr should be in GPU or CPU memory if recvPmeForceToGpu * is true or false, respectively. If receiving to GPU, this @@ -89,25 +87,20 @@ public: * \param[in] recvSize Number of elements to receive * \param[in] receivePmeForceToGpu Whether receive is to GPU, otherwise CPU */ - void receiveForceFromPmeCudaDirect(void* recvPtr, int recvSize, bool receivePmeForceToGpu); + void receiveForceFromPme(float3* recvPtr, int recvSize, bool receivePmeForceToGpu); /*! \brief Push coordinates buffer directly to GPU memory on PME * task, from either GPU or CPU memory on PP task using CUDA - * Memory copy. sendPtr should be in GPU or CPU memory if - * sendPmeCoordinatesFromGpu is true or false respectively. If - * sending from GPU, this method should be called after the - * local GPU coordinate buffer operations. The remote PME task will - * automatically wait for data to be copied before commencing PME force calculations. + * Memory copy or CUDA-aware MPI. If sending from GPU, this method should + * be called after the local GPU coordinate buffer operations. + * The remote PME task will automatically wait for data to be copied + * before commencing PME force calculations. * \param[in] sendPtr Buffer with coordinate data * \param[in] sendSize Number of elements to send - * \param[in] sendPmeCoordinatesFromGpu Whether send is from GPU, otherwise CPU * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device */ - void sendCoordinatesToPmeCudaDirect(void* sendPtr, - int sendSize, - bool sendPmeCoordinatesFromGpu, - GpuEventSynchronizer* coordinatesReadyOnDeviceEvent); + void sendCoordinatesToPme(float3* sendPtr, int sendSize, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent); /*! \brief * Return pointer to buffer used for staging PME force on GPU @@ -119,15 +112,55 @@ public: */ GpuEventSynchronizer* getForcesReadySynchronizer(); +private: + /*! \brief Pull force buffer directly from GPU memory on PME + * rank to either GPU or CPU memory on PP task using CUDA + * Memory copy. This method is used with Thread-MPI. + * \param[out] recvPtr CPU buffer to receive PME force data + * \param[in] recvSize Number of elements to receive + * \param[in] receivePmeForceToGpu Whether receive is to GPU, otherwise CPU + */ + void receiveForceFromPmeCudaDirect(float3* recvPtr, int recvSize, bool receivePmeForceToGpu); + + /*! \brief Pull force buffer directly from GPU memory on PME + * rank to either GPU or CPU memory on PP task using CUDA-aware + * MPI. This method is used with process-MPI. 
+ * \param[out] recvPtr CPU buffer to receive PME force data + * \param[in] recvSize Number of elements to receive + */ + void receiveForceFromPmeCudaMpi(float3* recvPtr, int recvSize); + + /*! \brief Push coordinates buffer directly to GPU memory on PME + * task, from either GPU or CPU memory on PP task using CUDA Memory copy. + * This method is used with Thread-MPI. + * \param[in] sendPtr Buffer with coordinate data + * \param[in] sendSize Number of elements to send + * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device + */ + void sendCoordinatesToPmeCudaDirect(float3* sendPtr, + int sendSize, + GpuEventSynchronizer* coordinatesReadyOnDeviceEvent); + + /*! \brief Push coordinates buffer directly to GPU memory on PME + * task, from either GPU or CPU memory on PP task using CUDA-aware MPI. + * This method is used with process-MPI. + * \param[in] sendPtr Buffer with coordinate data + * \param[in] sendSize Number of elements to send + * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device + */ + void sendCoordinatesToPmeCudaMpi(float3* sendPtr, + int sendSize, + GpuEventSynchronizer* coordinatesReadyOnDeviceEvent); + private: //! GPU context handle (not used in CUDA) const DeviceContext& deviceContext_; //! Handle for CUDA stream used for the communication operations in this class const DeviceStream& pmePpCommStream_; //! Remote location of PME coordinate data buffer - void* remotePmeXBuffer_ = nullptr; + float3* remotePmeXBuffer_ = nullptr; //! Remote location of PME force data buffer - void* remotePmeFBuffer_ = nullptr; + float3* remotePmeFBuffer_ = nullptr; //! communicator for simulation MPI_Comm comm_; //! Rank of PME task diff --git a/src/gromacs/ewald/pme_pp_communication.h b/src/gromacs/ewald/pme_pp_communication.h index 329284d4e4..9b6c5a9ce9 100644 --- a/src/gromacs/ewald/pme_pp_communication.h +++ b/src/gromacs/ewald/pme_pp_communication.h @@ -4,7 +4,7 @@ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2004, The GROMACS development team. * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team. - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -62,6 +62,7 @@ enum eCommType_SigmaB, eCommType_NR, eCommType_COORD, + eCommType_COORD_GPU, eCommType_CNB }; diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index 45fbace426..e53d3c16c5 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -209,14 +209,13 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md GMX_GPU_CUDA && useGpuForNonbonded && (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr); devFlags.enableGpuHaloExchange = GMX_GPU_CUDA && getenv("GMX_GPU_DD_COMMS") != nullptr; devFlags.forceGpuUpdateDefault = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr) || GMX_FAHCORE; - devFlags.enableGpuPmePPComm = - GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_PME_PP_COMMS") != nullptr; + devFlags.enableGpuPmePPComm = GMX_GPU_CUDA && getenv("GMX_GPU_PME_PP_COMMS") != nullptr; #pragma GCC diagnostic pop // Direct GPU comm path is being used with CUDA_AWARE_MPI // make sure underlying MPI implementation is CUDA-aware - if (!GMX_THREAD_MPI && devFlags.enableGpuHaloExchange) + if (!GMX_THREAD_MPI && (devFlags.enableGpuPmePPComm || devFlags.enableGpuHaloExchange)) { const bool haveDetectedCudaAwareMpi = (checkMpiCudaAwareSupport() == CudaAwareMpiStatus::Supported); @@ -241,7 +240,9 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md devFlags.usingCudaAwareMpi = true; GMX_LOG(mdlog.warning) .asParagraph() - .appendTextFormatted("Using CUDA-aware MPI for 'GPU halo exchange' feature."); + .appendTextFormatted( + "Using CUDA-aware MPI for 'GPU halo exchange' or 'GPU PME-PP " + "communications' feature."); } else { @@ -255,6 +256,17 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md "detect CUDA_aware support in underlying MPI implementation."); devFlags.enableGpuHaloExchange = false; } + if (devFlags.enableGpuPmePPComm) + { + GMX_LOG(mdlog.warning) + .asParagraph() + .appendText( + "GMX_GPU_PME_PP_COMMS environment variable detected, but the " + "'GPU PME-PP communications' feature will not be enabled as " + "GROMACS couldn't " + "detect CUDA_aware support in underlying MPI implementation."); + devFlags.enableGpuPmePPComm = false; + } GMX_LOG(mdlog.warning) .asParagraph() @@ -2037,6 +2049,7 @@ int Mdrunner::mdrunner() walltime_accounting, inputrec.get(), pmeRunMode, + runScheduleWork.simulationWork.useGpuPmePpCommunication, deviceStreamManager.get()); } -- 2.22.0
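
For readers who want to experiment with the underlying mechanism outside of GROMACS, below is a minimal standalone sketch (not part of this patch) of the CUDA-aware MPI pattern the new code paths rely on: probing the MPI library for CUDA support at runtime and handing device pointers directly to MPI calls, so data moves between GPU memory spaces without staging through the host. The MPIX_Query_cuda_support()/MPIX_CUDA_AWARE_SUPPORT probe is an OpenMPI extension provided by mpi-ext.h and may be unavailable in other MPI implementations; GROMACS performs an analogous check in checkMpiCudaAwareSupport() as seen in the runner.cpp hunk above. The file name and constants below are illustrative only.

// cuda_aware_sketch.cpp -- minimal illustration of CUDA-aware MPI (hypothetical
// example, independent of GROMACS). Rank 0 sends a device buffer straight to
// rank 1 by passing GPU pointers to MPI, which requires a CUDA-aware MPI build
// (e.g. OpenMPI >= 4.1.0 with UCX >= 1.10, as recommended in the docs above).
#include <cstdio>
#include <vector>

#include <cuda_runtime.h>
#include <mpi.h>
#if defined(OPEN_MPI) && OPEN_MPI
#    include <mpi-ext.h> // OpenMPI extension header; defines MPIX_CUDA_AWARE_SUPPORT
#endif

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int rank = 0, size = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
    const bool cudaAware = (MPIX_Query_cuda_support() == 1); // runtime probe (OpenMPI only)
#else
    const bool cudaAware = false; // cannot verify; treat as unsupported
#endif
    if (rank == 0)
    {
        std::printf("CUDA-aware MPI detected: %s\n", cudaAware ? "yes" : "no");
    }

    if (cudaAware && size >= 2)
    {
        const int numReals = 3 * 1024; // e.g. 1024 rvec-like float triplets
        float*    d_buf    = nullptr;
        cudaMalloc(reinterpret_cast<void**>(&d_buf), numReals * sizeof(float));

        if (rank == 0)
        {
            std::vector<float> h_buf(numReals, 1.0F);
            cudaMemcpy(d_buf, h_buf.data(), numReals * sizeof(float), cudaMemcpyHostToDevice);
            // Device pointer handed directly to MPI: no staging through host memory.
            MPI_Send(d_buf, numReals, MPI_FLOAT, 1, 0, MPI_COMM_WORLD);
        }
        else if (rank == 1)
        {
            MPI_Recv(d_buf, numReals, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            std::vector<float> h_buf(numReals);
            cudaMemcpy(h_buf.data(), d_buf, numReals * sizeof(float), cudaMemcpyDeviceToHost);
            std::printf("rank 1 received %g ... %g via GPU buffers\n", h_buf.front(), h_buf.back());
        }
        cudaFree(d_buf);
    }

    MPI_Finalize();
    return 0;
}

A possible way to build and run it (include and library paths depend on the local CUDA and MPI installation): mpicxx cuda_aware_sketch.cpp -I${CUDA_HOME}/include -L${CUDA_HOME}/lib64 -lcudart -o cuda_aware_sketch, then mpirun -np 2 ./cuda_aware_sketch.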