Remove thread-MPI limitation for GPU PP Halo exchange
authorGaurav Garg <gaugarg@nvidia.com>
Thu, 25 Mar 2021 08:19:37 +0000 (13:49 +0530)
committerMark Abraham <mark.j.abraham@gmail.com>
Tue, 13 Apr 2021 09:58:47 +0000 (09:58 +0000)
Allows use of direct-GPU communication for PP halo exchange when
running with "real" MPI, including on multiple compute nodes,
through new CUDA-aware MPI communication code paths.

Implements part of #2891
Refs: #2915 #3960
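
For reference, the new process-MPI path hands device pointers directly to MPI and
relies on a CUDA-aware MPI implementation to move the data between GPUs. A minimal
sketch of that pattern, mirroring communicateHaloDataWithCudaMPI() below (buffer and
rank names are illustrative, not the GROMACS API):

    // Sketch: halo exchange on device buffers with CUDA-aware MPI.
    // Assumes d_sendBuf and d_recvBuf are device allocations and that the stream
    // which filled d_sendBuf has already been synchronized.
    #include <cuda_runtime.h>
    #include <mpi.h>

    static void exchangeHaloCudaAwareMpi(float* d_sendBuf, int sendCount, int sendRank,
                                         float* d_recvBuf, int recvCount, int recvRank,
                                         MPI_Comm comm)
    {
        MPI_Request request;
        // Post the receive for the remote halo data straight into the device buffer.
        MPI_Irecv(d_recvBuf, recvCount, MPI_FLOAT, recvRank, 0, comm, &request);
        // Send the local halo data from the device buffer.
        MPI_Send(d_sendBuf, sendCount, MPI_FLOAT, sendRank, 0, comm);
        MPI_Wait(&request, MPI_STATUS_IGNORE);
    }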

src/gromacs/domdec/domdec.cpp
src/gromacs/domdec/gpuhaloexchange_impl.cu
src/gromacs/domdec/gpuhaloexchange_impl.cuh
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/runner.cpp
src/gromacs/taskassignment/decidegpuusage.h

index 8bec7e2409ae6ab005b135b80fe086769c868611..088189b6027ef82b8c483c9c1b4754725e2b9a47 100644 (file)
@@ -3191,7 +3191,7 @@ void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
             cr.dd->gpuHaloExchange[d].push_back(std::make_unique<gmx::GpuHaloExchange>(
                     cr.dd,
                     d,
-                    cr.mpi_comm_mysim,
+                    cr.mpi_comm_mygroup,
                     deviceStreamManager.context(),
                     deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
                     deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal),
index ebc98e784feca58b79c89a32edd86b416d27ccd8..65af08d35d73aee7b8a594a0f832962e9bdd44e1 100644 (file)
@@ -206,47 +206,77 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo
         copyToDeviceBuffer(
                 &d_indexMap_, h_indexMap_.data(), 0, newSize, nonLocalStream_, GpuApiCallBehavior::Async, nullptr);
     }
-    // This rank will push data to its neighbor, so needs to know
-    // the remote receive address and similarly send its receive
-    // address to other neighbour. We can do this here in reinit fn
-    // since the pointers will not change until the next NS step.
 
-    // Coordinates buffer:
-    void* recvPtr = static_cast<void*>(&d_x_[atomOffset_]);
 #if GMX_MPI
-    MPI_Sendrecv(&recvPtr,
-                 sizeof(void*),
+    // Exchange of remote addresses from neighboring ranks is needed only with CUDA-direct, since cudaMemcpy needs both the source and destination pointers.
+    // MPI calls such as MPI_Send do not need the receiving address; that is taken care of by the MPI_Recv call on the neighboring rank.
+    if (GMX_THREAD_MPI)
+    {
+        // This rank will push data to its neighbor, so it needs to know
+        // the remote receive address, and it similarly sends its receive
+        // address to the other neighbour. We can do this here in the reinit
+        // function since the pointers will not change until the next NS step.
+
+        // Coordinates buffer:
+        float3* recvPtr = &d_x_[atomOffset_];
+        MPI_Sendrecv(&recvPtr,
+                     sizeof(void*),
+                     MPI_BYTE,
+                     recvRankX_,
+                     0,
+                     &remoteXPtr_,
+                     sizeof(void*),
+                     MPI_BYTE,
+                     sendRankX_,
+                     0,
+                     mpi_comm_mysim_,
+                     MPI_STATUS_IGNORE);
+
+        // Force buffer:
+        recvPtr = d_recvBuf_;
+        MPI_Sendrecv(&recvPtr,
+                     sizeof(void*),
+                     MPI_BYTE,
+                     recvRankF_,
+                     0,
+                     &remoteFPtr_,
+                     sizeof(void*),
+                     MPI_BYTE,
+                     sendRankF_,
+                     0,
+                     mpi_comm_mysim_,
+                     MPI_STATUS_IGNORE);
+    }
+#endif
+
+    wallcycle_sub_stop(wcycle_, ewcsDD_GPU);
+    wallcycle_stop(wcycle_, ewcDOMDEC);
+
+    return;
+}
+
+void GpuHaloExchange::Impl::enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+{
+    GMX_ASSERT(coordinatesReadyOnDeviceEvent != nullptr,
+               "Co-ordinate Halo exchange requires valid co-ordinate ready event");
+
+    // Wait for event from receiving task that remote coordinates are ready, and enqueue that event to stream used
+    // for subsequent data push. This avoids a race condition with the remote data being written in the previous timestep.
+    // Similarly send event to task that will push data to this task.
+    GpuEventSynchronizer* remoteCoordinatesReadyOnDeviceEvent;
+    MPI_Sendrecv(&coordinatesReadyOnDeviceEvent,
+                 sizeof(GpuEventSynchronizer*),
                  MPI_BYTE,
                  recvRankX_,
                  0,
-                 &remoteXPtr_,
-                 sizeof(void*),
+                 &remoteCoordinatesReadyOnDeviceEvent,
+                 sizeof(GpuEventSynchronizer*),
                  MPI_BYTE,
                  sendRankX_,
                  0,
                  mpi_comm_mysim_,
                  MPI_STATUS_IGNORE);
-
-    // Force buffer:
-    recvPtr = static_cast<void*>(d_recvBuf_);
-    MPI_Sendrecv(&recvPtr,
-                 sizeof(void*),
-                 MPI_BYTE,
-                 recvRankF_,
-                 0,
-                 &remoteFPtr_,
-                 sizeof(void*),
-                 MPI_BYTE,
-                 sendRankF_,
-                 0,
-                 mpi_comm_mysim_,
-                 MPI_STATUS_IGNORE);
-#endif
-
-    wallcycle_sub_stop(wcycle_, ewcsDD_GPU);
-    wallcycle_stop(wcycle_, ewcDOMDEC);
-
-    return;
+    remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
 }
 
 void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box,
@@ -305,7 +335,15 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box
     // ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync
     wallcycle_start(wcycle_, ewcMOVEX);
 
-    communicateHaloData(d_x_, HaloQuantity::HaloCoordinates, coordinatesReadyOnDeviceEvent);
+    // With process-MPI the wait for remote coordinates is implicit, as the non-local stream is synchronized before the MPI calls,
+    // and the blocking MPI calls ensure that both neighboring ranks have synchronized their non-local streams before the data transfer takes place.
+    if (GMX_THREAD_MPI && pulse_ == 0)
+    {
+        enqueueWaitRemoteCoordinatesReadyEvent(coordinatesReadyOnDeviceEvent);
+    }
+
+    float3* recvPtr = GMX_THREAD_MPI ? remoteXPtr_ : &d_x_[atomOffset_];
+    communicateHaloData(d_sendBuf_, xSendSize_, sendRankX_, recvPtr, xRecvSize_, recvRankX_);
 
     wallcycle_stop(wcycle_, ewcMOVEX);
 
@@ -320,8 +358,10 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
     // ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync
     wallcycle_start(wcycle_, ewcMOVEF);
 
+    float3* recvPtr = GMX_THREAD_MPI ? remoteFPtr_ : d_recvBuf_;
+
     // Communicate halo data (in non-local stream)
-    communicateHaloData(d_f_, HaloQuantity::HaloForces, nullptr);
+    communicateHaloData(&(d_f_[atomOffset_]), fSendSize_, sendRankF_, recvPtr, fRecvSize_, recvRankF_);
 
     wallcycle_stop(wcycle_, ewcMOVEF);
 
@@ -386,65 +426,62 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
 }
 
-
-void GpuHaloExchange::Impl::communicateHaloData(float3*               d_ptr,
-                                                HaloQuantity          haloQuantity,
-                                                GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+void GpuHaloExchange::Impl::communicateHaloData(float3* sendPtr,
+                                                int     sendSize,
+                                                int     sendRank,
+                                                float3* recvPtr,
+                                                int     recvSize,
+                                                int     recvRank)
 {
-
-    void* sendPtr;
-    int   sendSize;
-    void* remotePtr;
-    int   sendRank;
-    int   recvRank;
-
-    if (haloQuantity == HaloQuantity::HaloCoordinates)
+    if (GMX_THREAD_MPI)
     {
-        sendPtr   = static_cast<void*>(d_sendBuf_);
-        sendSize  = xSendSize_;
-        remotePtr = remoteXPtr_;
-        sendRank  = sendRankX_;
-        recvRank  = recvRankX_;
-
-#if GMX_MPI
-        // Wait for event from receiving task that remote coordinates are ready, and enqueue that event to stream used
-        // for subsequent data push. This avoids a race condition with the remote data being written in the previous timestep.
-        // Similarly send event to task that will push data to this task.
-        GpuEventSynchronizer* remoteCoordinatesReadyOnDeviceEvent;
-        MPI_Sendrecv(&coordinatesReadyOnDeviceEvent,
-                     sizeof(GpuEventSynchronizer*),
-                     MPI_BYTE,
-                     recvRank,
-                     0,
-                     &remoteCoordinatesReadyOnDeviceEvent,
-                     sizeof(GpuEventSynchronizer*),
-                     MPI_BYTE,
-                     sendRank,
-                     0,
-                     mpi_comm_mysim_,
-                     MPI_STATUS_IGNORE);
-        remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
-#else
-        GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
-#endif
+        // no need to synchronize explicitly with GMX_THREAD_MPI, as all operations
+        // are launched in the correct stream anyway
+        communicateHaloDataWithCudaDirect(sendPtr, sendSize, sendRank, recvPtr, recvRank);
     }
     else
     {
-        sendPtr   = static_cast<void*>(&(d_ptr[atomOffset_]));
-        sendSize  = fSendSize_;
-        remotePtr = remoteFPtr_;
-        sendRank  = sendRankF_;
-        recvRank  = recvRankF_;
+        communicateHaloDataWithCudaMPI(sendPtr, sendSize, sendRank, recvPtr, recvSize, recvRank);
     }
+}
 
-    communicateHaloDataWithCudaDirect(sendPtr, sendSize, sendRank, remotePtr, recvRank);
+void GpuHaloExchange::Impl::communicateHaloDataWithCudaMPI(float3* sendPtr,
+                                                           int     sendSize,
+                                                           int     sendRank,
+                                                           float3* recvPtr,
+                                                           int     recvSize,
+                                                           int     recvRank)
+{
+    // no need to wait for haloDataReadyOnDevice event if this rank is not sending any data
+    if (sendSize > 0)
+    {
+        // wait for the non-local stream to complete all outstanding
+        // activities, to ensure that the buffer is up to date in GPU memory
+        // before transferring it to the remote rank
+
+        // ToDo: Replace stream synchronize with event synchronize
+        nonLocalStream_.synchronize();
+    }
+
+    // perform halo exchange directly in device buffers
+#if GMX_MPI
+    MPI_Request request;
+
+    // recv remote data into halo region
+    MPI_Irecv(recvPtr, recvSize * DIM, MPI_FLOAT, recvRank, 0, mpi_comm_mysim_, &request);
+
+    // send data to remote halo region
+    MPI_Send(sendPtr, sendSize * DIM, MPI_FLOAT, sendRank, 0, mpi_comm_mysim_);
+
+    MPI_Wait(&request, MPI_STATUS_IGNORE);
+#endif
 }
 
-void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr,
-                                                              int   sendSize,
-                                                              int   sendRank,
-                                                              void* remotePtr,
-                                                              int   recvRank)
+void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(float3* sendPtr,
+                                                              int     sendSize,
+                                                              int     sendRank,
+                                                              float3* remotePtr,
+                                                              int     recvRank)
 {
 
     cudaError_t stat;
@@ -474,6 +511,9 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr,
     // to its stream.
     GpuEventSynchronizer* haloDataTransferRemote;
 
+    GMX_ASSERT(haloDataTransferLaunched_ != nullptr,
+               "Halo exchange requires valid event to synchronize data transfer initiated in "
+               "remote rank");
     haloDataTransferLaunched_->markEvent(nonLocalStream_);
 
     MPI_Sendrecv(&haloDataTransferLaunched_,
@@ -516,7 +556,7 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t*        dd,
     sendRankF_(dd->neighbor[dimIndex][0]),
     recvRankF_(dd->neighbor[dimIndex][1]),
     usePBC_(dd->ci[dd->dim[dimIndex]] == 0),
-    haloDataTransferLaunched_(new GpuEventSynchronizer()),
+    haloDataTransferLaunched_(GMX_THREAD_MPI ? new GpuEventSynchronizer() : nullptr),
     mpi_comm_mysim_(mpi_comm_mysim),
     deviceContext_(deviceContext),
     localStream_(localStream),
@@ -525,10 +565,6 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t*        dd,
     pulse_(pulse),
     wcycle_(wcycle)
 {
-
-    GMX_RELEASE_ASSERT(GMX_THREAD_MPI,
-                       "GPU Halo exchange is currently only supported with thread-MPI enabled");
-
     if (usePBC_ && dd->unitCellInfo.haveScrewPBC)
     {
         gmx_fatal(FARGS, "Error: screw is not yet supported in GPU halo exchange\n");
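
For contrast, the thread-MPI (CUDA-direct) path retained above pushes data straight
into the neighboring rank's buffer with a device-to-device copy, which is why the
remote receive addresses are exchanged in reinitHalo(). A rough sketch of that push
(illustrative only; the real logic lives in communicateHaloDataWithCudaDirect()):

    // Sketch: one-sided push used with thread-MPI. The remote pointer was obtained
    // earlier via the pointer MPI_Sendrecv in reinitHalo().
    #include <cuda_runtime.h>

    static void pushHaloCudaDirect(const float3* d_sendBuf,
                                   float3*       d_remoteRecvBuf,
                                   int           sendSize,
                                   cudaStream_t  nonLocalStream)
    {
        if (sendSize > 0)
        {
            // Copy directly into the neighbor's device buffer; with thread-MPI all
            // ranks live in one process, so the remote pointer is directly usable.
            cudaMemcpyAsync(d_remoteRecvBuf, d_sendBuf, sendSize * sizeof(float3),
                            cudaMemcpyDeviceToDevice, nonLocalStream);
        }
        // An event is then recorded in nonLocalStream and exchanged with the neighbor
        // (see haloDataTransferLaunched_) so the remote rank can enqueue a wait on it.
    }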
index 5dd619a343fe5827e93c0a9103dd2d47588734f1..89ee12a2ea5edb20efd355a20eb5b2acd1dda800 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -117,22 +117,43 @@ public:
 
 private:
     /*! \brief Data transfer wrapper for GPU halo exchange
-     * \param [inout] d_ptr      pointer to coordinates or force buffer in GPU memory
-     * \param [in] haloQuantity  switch on whether X or F halo exchange is being performed
-     * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device
+     * \param [in] sendPtr      send buffer address
+     * \param [in] sendSize     number of elements to send
+     * \param [in] sendRank     rank of destination
+     * \param [in] recvPtr      receive buffer address
+     * \param [in] recvSize     number of elements to receive
+     * \param [in] recvRank     rank of source
      */
-    void communicateHaloData(float3*               d_ptr,
-                             HaloQuantity          haloQuantity,
-                             GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+    void communicateHaloData(float3* sendPtr, int sendSize, int sendRank, float3* recvPtr, int recvSize, int recvRank);
 
     /*! \brief Data transfer for GPU halo exchange using CUDA memcopies
      * \param [inout] sendPtr    address to send data from
      * \param [in] sendSize      number of atoms to be sent
      * \param [in] sendRank      rank to send data to
-     * \param [inout] remotePtr  remote address to recv data
+     * \param [in] remotePtr     remote address to recv data
      * \param [in] recvRank      rank to recv data from
      */
-    void communicateHaloDataWithCudaDirect(void* sendPtr, int sendSize, int sendRank, void* remotePtr, int recvRank);
+    void communicateHaloDataWithCudaDirect(float3* sendPtr, int sendSize, int sendRank, float3* remotePtr, int recvRank);
+
+    /*! \brief Data transfer for GPU halo exchange using CUDA-aware MPI (MPI_Send and MPI_Irecv)
+     * \param [in] sendPtr      send buffer address
+     * \param [in] sendSize     number of elements to send
+     * \param [in] sendRank     rank of destination
+     * \param [in] recvPtr      receive buffer address
+     * \param [in] recvSize     number of elements to receive
+     * \param [in] recvRank     rank of source
+     */
+    void communicateHaloDataWithCudaMPI(float3* sendPtr,
+                                        int     sendSize,
+                                        int     sendRank,
+                                        float3* recvPtr,
+                                        int     recvSize,
+                                        int     recvRank);
+
+    /*! \brief Exchange coordinate-ready event with neighbor ranks and enqueue wait in non-local stream
+     * \param [in] coordinatesReadyOnDeviceEvent  event recorded when coordinates have been copied to the device
+     */
+    void enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
 
     //! Domain decomposition object
     gmx_domdec_t* dd_ = nullptr;
@@ -177,9 +198,9 @@ private:
     //! number of home atoms - offset of local halo region
     int numHomeAtoms_ = 0;
     //! remote GPU coordinates buffer pointer for pushing data
-    void* remoteXPtr_ = nullptr;
+    float3* remoteXPtr_ = nullptr;
     //! remote GPU force buffer pointer for pushing data
-    void* remoteFPtr_ = nullptr;
+    float3* remoteFPtr_ = nullptr;
     //! Periodic Boundary Conditions for this rank
     bool usePBC_ = false;
     //! force shift buffer on device
index 60a046fec2f9d5d139020a40a6ffd115e6c48b75..3da4bc541c493761f074a58523c5a34676cc65db 100644 (file)
@@ -1139,7 +1139,12 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
                 (thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_f_ready_synchronizer(fr->pmedata)
                                                : // PME force buffer on same GPU
                          fr->pmePpCommGpu->getForcesReadySynchronizer()); // buffer received from other GPU
-        fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(pmeSynchronizer);
+
+        if (GMX_THREAD_MPI)
+        {
+            GMX_ASSERT(pmeSynchronizer != nullptr, "PME force ready CUDA event should not be NULL");
+            fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(pmeSynchronizer);
+        }
     }
 
     if ((runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr))
index 68da5a1b17251bb9f8576011bd5ab93efac54107..33f9145889d2ce1899c38971f230e823691561e3 100644 (file)
 #include "gromacs/utility/programcontext.h"
 #include "gromacs/utility/smalloc.h"
 #include "gromacs/utility/stringutil.h"
+#include "gromacs/utility/mpiinfo.h"
 
 #include "isimulator.h"
 #include "membedholder.h"
@@ -206,13 +207,66 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md
 
     devFlags.enableGpuBufferOps =
             GMX_GPU_CUDA && useGpuForNonbonded && (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
-    devFlags.enableGpuHaloExchange = GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_DD_COMMS") != nullptr;
+    devFlags.enableGpuHaloExchange = GMX_GPU_CUDA && getenv("GMX_GPU_DD_COMMS") != nullptr;
     devFlags.forceGpuUpdateDefault = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr) || GMX_FAHCORE;
     devFlags.enableGpuPmePPComm =
             GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
 
 #pragma GCC diagnostic pop
 
+    // The direct GPU communication path requires CUDA-aware MPI when running with process-MPI;
+    // make sure the underlying MPI implementation is indeed CUDA-aware.
+    if (!GMX_THREAD_MPI && devFlags.enableGpuHaloExchange)
+    {
+        const bool haveDetectedCudaAwareMpi =
+                (checkMpiCudaAwareSupport() == CudaAwareMpiStatus::Supported);
+        const bool forceCudaAwareMpi = (getenv("GMX_FORCE_CUDA_AWARE_MPI") != nullptr);
+
+        if (!haveDetectedCudaAwareMpi && forceCudaAwareMpi)
+        {
+            // CUDA-aware support was not detected in the MPI library, but the user has forced its use
+            GMX_LOG(mdlog.warning)
+                    .asParagraph()
+                    .appendTextFormatted(
+                            "This run has forced use of 'CUDA-aware MPI'. "
+                            "But, GROMACS cannot determine if underlying MPI "
+                            "is CUDA-aware. GROMACS recommends use of latest openMPI version "
+                            "for CUDA-aware support. "
+                            "If you observe failures at runtime, try unsetting "
+                            "GMX_FORCE_CUDA_AWARE_MPI environment variable.");
+        }
+
+        if (haveDetectedCudaAwareMpi || forceCudaAwareMpi)
+        {
+            devFlags.usingCudaAwareMpi = true;
+            GMX_LOG(mdlog.warning)
+                    .asParagraph()
+                    .appendTextFormatted("Using CUDA-aware MPI for 'GPU halo exchange' feature.");
+        }
+        else
+        {
+            if (devFlags.enableGpuHaloExchange)
+            {
+                GMX_LOG(mdlog.warning)
+                        .asParagraph()
+                        .appendTextFormatted(
+                                "GMX_GPU_DD_COMMS environment variable detected, but the 'GPU "
+                                "halo exchange' feature will not be enabled as GROMACS couldn't "
+                                "detect CUDA_aware support in underlying MPI implementation.");
+                devFlags.enableGpuHaloExchange = false;
+            }
+
+            GMX_LOG(mdlog.warning)
+                    .asParagraph()
+                    .appendTextFormatted(
+                            "GROMACS recommends use of latest OpenMPI version for CUDA-aware "
+                            "support. "
+                            "If you are certain about CUDA-aware support in your MPI library, "
+                            "you can force it's use by setting environment variable "
+                            " GMX_FORCE_CUDA_AWARE_MPI.");
+        }
+    }
+
     if (devFlags.enableGpuBufferOps)
     {
         GMX_LOG(mdlog.warning)
@@ -2051,7 +2105,14 @@ int Mdrunner::mdrunner()
     {
         physicalNodeComm.barrier();
     }
-    releaseDevice(deviceInfo);
+
+    if (!devFlags.usingCudaAwareMpi)
+    {
+        // Don't reset the GPU when CUDA-aware MPI is in use:
+        // UCX creates CUDA buffers which are cleaned up as part of MPI_Finalize(),
+        // and resetting the device before MPI_Finalize() results in crashes inside UCX.
+        releaseDevice(deviceInfo);
+    }
 
     /* Does what it says */
     print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
index d660da7f113a9130dcd08bf6a83a0caf3ea7f4b5..98d0251c33c93fb3ab5e36bdb770a42ce248b59e 100644 (file)
@@ -88,6 +88,8 @@ struct DevelopmentFeatureFlags
     bool enableGpuHaloExchange = false;
     //! True if the PME PP direct communication GPU development feature is enabled
     bool enableGpuPmePPComm = false;
+    //! True if CUDA-aware MPI is being used for the GPU direct communication feature
+    bool usingCudaAwareMpi = false;
 };
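
For reference, the runtime decision in runner.cpp above combines detected CUDA-aware
support with the user override; a condensed sketch of that logic (the free-standing
helper and the extra enum value are illustrative, only checkMpiCudaAwareSupport(),
CudaAwareMpiStatus::Supported and the environment variables come from this change):

    #include <cstdlib>

    enum class CudaAwareMpiStatus { Supported, NotSupported }; // NotSupported is a placeholder
    CudaAwareMpiStatus checkMpiCudaAwareSupport();             // from gromacs/utility/mpiinfo.h

    static bool decideCudaAwareMpiUse(bool usingThreadMpi, bool gpuHaloRequested)
    {
        if (usingThreadMpi || !gpuHaloRequested)
        {
            return false; // thread-MPI keeps using the existing CUDA-direct path
        }
        const bool detected = (checkMpiCudaAwareSupport() == CudaAwareMpiStatus::Supported);
        const bool forced   = (std::getenv("GMX_FORCE_CUDA_AWARE_MPI") != nullptr);
        // If neither holds, the GMX_GPU_DD_COMMS request is dropped with a warning.
        return detected || forced;
    }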