From 13678441531dc60059df5592e54a2f74900979e6 Mon Sep 17 00:00:00 2001 From: Gaurav Garg Date: Thu, 25 Mar 2021 13:49:37 +0530 Subject: [PATCH] Remove thread-MPI limitation for GPU PP Halo exchange Allows use of direct-GPU communication for PP halo exchange when running with "real" MPI, including on multiple compute nodes, through new CUDA-aware MPI communication code paths. Implements part of #2891 Refs: #2915 #3960 --- src/gromacs/domdec/domdec.cpp | 2 +- src/gromacs/domdec/gpuhaloexchange_impl.cu | 212 ++++++++++++-------- src/gromacs/domdec/gpuhaloexchange_impl.cuh | 43 +++- src/gromacs/mdlib/sim_util.cpp | 7 +- src/gromacs/mdrun/runner.cpp | 65 +++++- src/gromacs/taskassignment/decidegpuusage.h | 2 + 6 files changed, 228 insertions(+), 103 deletions(-) diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp index 8bec7e2409..088189b602 100644 --- a/src/gromacs/domdec/domdec.cpp +++ b/src/gromacs/domdec/domdec.cpp @@ -3191,7 +3191,7 @@ void constructGpuHaloExchange(const gmx::MDLogger& mdlog, cr.dd->gpuHaloExchange[d].push_back(std::make_unique( cr.dd, d, - cr.mpi_comm_mysim, + cr.mpi_comm_mygroup, deviceStreamManager.context(), deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal), deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal), diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu index ebc98e784f..65af08d35d 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cu +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu @@ -206,47 +206,77 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo copyToDeviceBuffer( &d_indexMap_, h_indexMap_.data(), 0, newSize, nonLocalStream_, GpuApiCallBehavior::Async, nullptr); } - // This rank will push data to its neighbor, so needs to know - // the remote receive address and similarly send its receive - // address to other neighbour. We can do this here in reinit fn - // since the pointers will not change until the next NS step. - // Coordinates buffer: - void* recvPtr = static_cast(&d_x_[atomOffset_]); #if GMX_MPI - MPI_Sendrecv(&recvPtr, - sizeof(void*), + // Exchange of remote addresses from neighboring ranks is needed only with CUDA-direct as cudamemcpy needs both src/dst pointer + // MPI calls such as MPI_send doesn't worry about receiving address, that is taken care by MPI_recv call in neighboring rank + if (GMX_THREAD_MPI) + { + // This rank will push data to its neighbor, so needs to know + // the remote receive address and similarly send its receive + // address to other neighbour. We can do this here in reinit fn + // since the pointers will not change until the next NS step. 
+ + // Coordinates buffer: + float3* recvPtr = &d_x_[atomOffset_]; + MPI_Sendrecv(&recvPtr, + sizeof(void*), + MPI_BYTE, + recvRankX_, + 0, + &remoteXPtr_, + sizeof(void*), + MPI_BYTE, + sendRankX_, + 0, + mpi_comm_mysim_, + MPI_STATUS_IGNORE); + + // Force buffer: + recvPtr = d_recvBuf_; + MPI_Sendrecv(&recvPtr, + sizeof(void*), + MPI_BYTE, + recvRankF_, + 0, + &remoteFPtr_, + sizeof(void*), + MPI_BYTE, + sendRankF_, + 0, + mpi_comm_mysim_, + MPI_STATUS_IGNORE); + } +#endif + + wallcycle_sub_stop(wcycle_, ewcsDD_GPU); + wallcycle_stop(wcycle_, ewcDOMDEC); + + return; +} + +void GpuHaloExchange::Impl::enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynchronizer* coordinatesReadyOnDeviceEvent) +{ + GMX_ASSERT(coordinatesReadyOnDeviceEvent != nullptr, + "Co-ordinate Halo exchange requires valid co-ordinate ready event"); + + // Wait for event from receiving task that remote coordinates are ready, and enqueue that event to stream used + // for subsequent data push. This avoids a race condition with the remote data being written in the previous timestep. + // Similarly send event to task that will push data to this task. + GpuEventSynchronizer* remoteCoordinatesReadyOnDeviceEvent; + MPI_Sendrecv(&coordinatesReadyOnDeviceEvent, + sizeof(GpuEventSynchronizer*), MPI_BYTE, recvRankX_, 0, - &remoteXPtr_, - sizeof(void*), + &remoteCoordinatesReadyOnDeviceEvent, + sizeof(GpuEventSynchronizer*), MPI_BYTE, sendRankX_, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE); - - // Force buffer: - recvPtr = static_cast(d_recvBuf_); - MPI_Sendrecv(&recvPtr, - sizeof(void*), - MPI_BYTE, - recvRankF_, - 0, - &remoteFPtr_, - sizeof(void*), - MPI_BYTE, - sendRankF_, - 0, - mpi_comm_mysim_, - MPI_STATUS_IGNORE); -#endif - - wallcycle_sub_stop(wcycle_, ewcsDD_GPU); - wallcycle_stop(wcycle_, ewcDOMDEC); - - return; + remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_); } void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box, @@ -305,7 +335,15 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box // ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync wallcycle_start(wcycle_, ewcMOVEX); - communicateHaloData(d_x_, HaloQuantity::HaloCoordinates, coordinatesReadyOnDeviceEvent); + // wait for remote co-ordinates is implicit with process-MPI as non-local stream is synchronized before MPI calls + // and MPI_Waitall call makes sure both neighboring ranks' non-local stream is synchronized before data transfer is initiated + if (GMX_THREAD_MPI && pulse_ == 0) + { + enqueueWaitRemoteCoordinatesReadyEvent(coordinatesReadyOnDeviceEvent); + } + + float3* recvPtr = GMX_THREAD_MPI ? remoteXPtr_ : &d_x_[atomOffset_]; + communicateHaloData(d_sendBuf_, xSendSize_, sendRankX_, recvPtr, xRecvSize_, recvRankX_); wallcycle_stop(wcycle_, ewcMOVEX); @@ -320,8 +358,10 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) // ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync wallcycle_start(wcycle_, ewcMOVEF); + float3* recvPtr = GMX_THREAD_MPI ? 
remoteFPtr_ : d_recvBuf_; + // Communicate halo data (in non-local stream) - communicateHaloData(d_f_, HaloQuantity::HaloForces, nullptr); + communicateHaloData(&(d_f_[atomOffset_]), fSendSize_, sendRankF_, recvPtr, fRecvSize_, recvRankF_); wallcycle_stop(wcycle_, ewcMOVEF); @@ -386,65 +426,62 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) wallcycle_stop(wcycle_, ewcLAUNCH_GPU); } - -void GpuHaloExchange::Impl::communicateHaloData(float3* d_ptr, - HaloQuantity haloQuantity, - GpuEventSynchronizer* coordinatesReadyOnDeviceEvent) +void GpuHaloExchange::Impl::communicateHaloData(float3* sendPtr, + int sendSize, + int sendRank, + float3* recvPtr, + int recvSize, + int recvRank) { - - void* sendPtr; - int sendSize; - void* remotePtr; - int sendRank; - int recvRank; - - if (haloQuantity == HaloQuantity::HaloCoordinates) + if (GMX_THREAD_MPI) { - sendPtr = static_cast(d_sendBuf_); - sendSize = xSendSize_; - remotePtr = remoteXPtr_; - sendRank = sendRankX_; - recvRank = recvRankX_; - -#if GMX_MPI - // Wait for event from receiving task that remote coordinates are ready, and enqueue that event to stream used - // for subsequent data push. This avoids a race condition with the remote data being written in the previous timestep. - // Similarly send event to task that will push data to this task. - GpuEventSynchronizer* remoteCoordinatesReadyOnDeviceEvent; - MPI_Sendrecv(&coordinatesReadyOnDeviceEvent, - sizeof(GpuEventSynchronizer*), - MPI_BYTE, - recvRank, - 0, - &remoteCoordinatesReadyOnDeviceEvent, - sizeof(GpuEventSynchronizer*), - MPI_BYTE, - sendRank, - 0, - mpi_comm_mysim_, - MPI_STATUS_IGNORE); - remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_); -#else - GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent); -#endif + // no need to explicitly sync with GMX_THREAD_MPI as all operations are + // anyway launched in correct stream + communicateHaloDataWithCudaDirect(sendPtr, sendSize, sendRank, recvPtr, recvRank); } else { - sendPtr = static_cast(&(d_ptr[atomOffset_])); - sendSize = fSendSize_; - remotePtr = remoteFPtr_; - sendRank = sendRankF_; - recvRank = recvRankF_; + communicateHaloDataWithCudaMPI(sendPtr, sendSize, sendRank, recvPtr, recvSize, recvRank); } +} - communicateHaloDataWithCudaDirect(sendPtr, sendSize, sendRank, remotePtr, recvRank); +void GpuHaloExchange::Impl::communicateHaloDataWithCudaMPI(float3* sendPtr, + int sendSize, + int sendRank, + float3* recvPtr, + int recvSize, + int recvRank) +{ + // no need to wait for haloDataReadyOnDevice event if this rank is not sending any data + if (sendSize > 0) + { + // wait for non local stream to complete all outstanding + // activities, to ensure that buffer is up-to-date in GPU memory + // before transferring to remote rank + + // ToDo: Replace stream synchronize with event synchronize + nonLocalStream_.synchronize(); + } + + // perform halo exchange directly in device buffers +#if GMX_MPI + MPI_Request request; + + // recv remote data into halo region + MPI_Irecv(recvPtr, recvSize * DIM, MPI_FLOAT, recvRank, 0, mpi_comm_mysim_, &request); + + // send data to remote halo region + MPI_Send(sendPtr, sendSize * DIM, MPI_FLOAT, sendRank, 0, mpi_comm_mysim_); + + MPI_Wait(&request, MPI_STATUS_IGNORE); +#endif } -void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr, - int sendSize, - int sendRank, - void* remotePtr, - int recvRank) +void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(float3* sendPtr, + int sendSize, + int sendRank, + float3* remotePtr, + int 
recvRank) { cudaError_t stat; @@ -474,6 +511,9 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr, // to its stream. GpuEventSynchronizer* haloDataTransferRemote; + GMX_ASSERT(haloDataTransferLaunched_ != nullptr, + "Halo exchange requires valid event to synchronize data transfer initiated in " + "remote rank"); haloDataTransferLaunched_->markEvent(nonLocalStream_); MPI_Sendrecv(&haloDataTransferLaunched_, @@ -516,7 +556,7 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, sendRankF_(dd->neighbor[dimIndex][0]), recvRankF_(dd->neighbor[dimIndex][1]), usePBC_(dd->ci[dd->dim[dimIndex]] == 0), - haloDataTransferLaunched_(new GpuEventSynchronizer()), + haloDataTransferLaunched_(GMX_THREAD_MPI ? new GpuEventSynchronizer() : nullptr), mpi_comm_mysim_(mpi_comm_mysim), deviceContext_(deviceContext), localStream_(localStream), @@ -525,10 +565,6 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, pulse_(pulse), wcycle_(wcycle) { - - GMX_RELEASE_ASSERT(GMX_THREAD_MPI, - "GPU Halo exchange is currently only supported with thread-MPI enabled"); - if (usePBC_ && dd->unitCellInfo.haveScrewPBC) { gmx_fatal(FARGS, "Error: screw is not yet supported in GPU halo exchange\n"); diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh index 5dd619a343..89ee12a2ea 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019,2020, by the GROMACS development team, led by + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -117,22 +117,43 @@ public: private: /*! \brief Data transfer wrapper for GPU halo exchange - * \param [inout] d_ptr pointer to coordinates or force buffer in GPU memory - * \param [in] haloQuantity switch on whether X or F halo exchange is being performed - * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device + * \param [in] sendPtr send buffer address + * \param [in] sendSize number of elements to send + * \param [in] sendRank rank of destination + * \param [in] recvPtr receive buffer address + * \param [in] recvSize number of elements to receive + * \param [in] recvRank rank of source */ - void communicateHaloData(float3* d_ptr, - HaloQuantity haloQuantity, - GpuEventSynchronizer* coordinatesReadyOnDeviceEvent); + void communicateHaloData(float3* sendPtr, int sendSize, int sendRank, float3* recvPtr, int recvSize, int recvRank); /*! \brief Data transfer for GPU halo exchange using CUDA memcopies * \param [inout] sendPtr address to send data from * \param [in] sendSize number of atoms to be sent * \param [in] sendRank rank to send data to - * \param [inout] remotePtr remote address to recv data + * \param [in] remotePtr remote address to recv data * \param [in] recvRank rank to recv data from */ - void communicateHaloDataWithCudaDirect(void* sendPtr, int sendSize, int sendRank, void* remotePtr, int recvRank); + void communicateHaloDataWithCudaDirect(float3* sendPtr, int sendSize, int sendRank, float3* remotePtr, int recvRank); + + /*! 
\brief Data transfer wrapper for GPU halo exchange using MPI_send and MPI_Recv + * \param [in] sendPtr send buffer address + * \param [in] sendSize number of elements to send + * \param [in] sendRank rank of destination + * \param [in] recvPtr receive buffer address + * \param [in] recvSize number of elements to receive + * \param [in] recvRank rank of source + */ + void communicateHaloDataWithCudaMPI(float3* sendPtr, + int sendSize, + int sendRank, + float3* recvPtr, + int recvSize, + int recvRank); + + /*! \brief Exchange coordinate-ready event with neighbor ranks and enqueue wait in non-local + * stream \param [in] eventSync event recorded when coordinates/forces are ready to device + */ + void enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynchronizer* coordinatesReadyOnDeviceEvent); //! Domain decomposition object gmx_domdec_t* dd_ = nullptr; @@ -177,9 +198,9 @@ private: //! number of home atoms - offset of local halo region int numHomeAtoms_ = 0; //! remote GPU coordinates buffer pointer for pushing data - void* remoteXPtr_ = nullptr; + float3* remoteXPtr_ = nullptr; //! remote GPU force buffer pointer for pushing data - void* remoteFPtr_ = nullptr; + float3* remoteFPtr_ = nullptr; //! Periodic Boundary Conditions for this rank bool usePBC_ = false; //! force shift buffer on device diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 60a046fec2..3da4bc541c 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -1139,7 +1139,12 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork, (thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_f_ready_synchronizer(fr->pmedata) : // PME force buffer on same GPU fr->pmePpCommGpu->getForcesReadySynchronizer()); // buffer received from other GPU - fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(pmeSynchronizer); + + if (GMX_THREAD_MPI) + { + GMX_ASSERT(pmeSynchronizer != nullptr, "PME force ready cuda event should not be NULL"); + fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(pmeSynchronizer); + } } if ((runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr)) diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index 68da5a1b17..33f9145889 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -167,6 +167,7 @@ #include "gromacs/utility/programcontext.h" #include "gromacs/utility/smalloc.h" #include "gromacs/utility/stringutil.h" +#include "gromacs/utility/mpiinfo.h" #include "isimulator.h" #include "membedholder.h" @@ -206,13 +207,66 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md devFlags.enableGpuBufferOps = GMX_GPU_CUDA && useGpuForNonbonded && (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr); - devFlags.enableGpuHaloExchange = GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_DD_COMMS") != nullptr; + devFlags.enableGpuHaloExchange = GMX_GPU_CUDA && getenv("GMX_GPU_DD_COMMS") != nullptr; devFlags.forceGpuUpdateDefault = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr) || GMX_FAHCORE; devFlags.enableGpuPmePPComm = GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_PME_PP_COMMS") != nullptr; #pragma GCC diagnostic pop + // Direct GPU comm path is being used with CUDA_AWARE_MPI + // make sure underlying MPI implementation is CUDA-aware + if (!GMX_THREAD_MPI && devFlags.enableGpuHaloExchange) + { + const bool haveDetectedCudaAwareMpi = + (checkMpiCudaAwareSupport() == CudaAwareMpiStatus::Supported); + const bool forceCudaAwareMpi = 
(getenv("GMX_FORCE_CUDA_AWARE_MPI") != nullptr); + + if (!haveDetectedCudaAwareMpi && forceCudaAwareMpi) + { + // CUDA-aware support not detected in MPI library but, user has forced it's use + GMX_LOG(mdlog.warning) + .asParagraph() + .appendTextFormatted( + "This run has forced use of 'CUDA-aware MPI'. " + "But, GROMACS cannot determine if underlying MPI " + "is CUDA-aware. GROMACS recommends use of latest openMPI version " + "for CUDA-aware support. " + "If you observe failures at runtime, try unsetting " + "GMX_FORCE_CUDA_AWARE_MPI environment variable."); + } + + if (haveDetectedCudaAwareMpi || forceCudaAwareMpi) + { + devFlags.usingCudaAwareMpi = true; + GMX_LOG(mdlog.warning) + .asParagraph() + .appendTextFormatted("Using CUDA-aware MPI for 'GPU halo exchange' feature."); + } + else + { + if (devFlags.enableGpuHaloExchange) + { + GMX_LOG(mdlog.warning) + .asParagraph() + .appendTextFormatted( + "GMX_GPU_DD_COMMS environment variable detected, but the 'GPU " + "halo exchange' feature will not be enabled as GROMACS couldn't " + "detect CUDA_aware support in underlying MPI implementation."); + devFlags.enableGpuHaloExchange = false; + } + + GMX_LOG(mdlog.warning) + .asParagraph() + .appendTextFormatted( + "GROMACS recommends use of latest OpenMPI version for CUDA-aware " + "support. " + "If you are certain about CUDA-aware support in your MPI library, " + "you can force it's use by setting environment variable " + " GMX_FORCE_CUDA_AWARE_MPI."); + } + } + if (devFlags.enableGpuBufferOps) { GMX_LOG(mdlog.warning) @@ -2051,7 +2105,14 @@ int Mdrunner::mdrunner() { physicalNodeComm.barrier(); } - releaseDevice(deviceInfo); + + if (!devFlags.usingCudaAwareMpi) + { + // Don't reset GPU in case of CUDA-AWARE MPI + // UCX creates CUDA buffers which are cleaned-up as part of MPI_Finalize() + // resetting the device before MPI_Finalize() results in crashes inside UCX + releaseDevice(deviceInfo); + } /* Does what it says */ print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime()); diff --git a/src/gromacs/taskassignment/decidegpuusage.h b/src/gromacs/taskassignment/decidegpuusage.h index d660da7f11..98d0251c33 100644 --- a/src/gromacs/taskassignment/decidegpuusage.h +++ b/src/gromacs/taskassignment/decidegpuusage.h @@ -88,6 +88,8 @@ struct DevelopmentFeatureFlags bool enableGpuHaloExchange = false; //! True if the PME PP direct communication GPU development feature is enabled bool enableGpuPmePPComm = false; + //! True if the CUDA-aware MPI is being used for GPU direct communication feature + bool usingCudaAwareMpi = false; }; -- 2.22.0