/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
*/
#include "gmxpre.h"
+#include "gromacs/ewald/pme_pp_communication.h"
#include "pme_coordinate_receiver_gpu_impl.h"
#include "config.h"
-#include <assert.h>
-#include <stdio.h>
-
-#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme_force_sender_gpu.h"
#include "gromacs/gpu_utils/cudautils.cuh"
-#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/gpu_utils/gpueventsynchronizer.h"
#include "gromacs/utility/gmxmpi.h"
namespace gmx
{
-PmeCoordinateReceiverGpu::Impl::Impl(void *pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks)
- : pmeStream_(*static_cast<cudaStream_t*> (pmeStream)),
- comm_(comm),
- ppRanks_(ppRanks)
+PmeCoordinateReceiverGpu::Impl::Impl(MPI_Comm comm,
+ const DeviceContext& deviceContext,
+ gmx::ArrayRef<const PpRanks> ppRanks) :
+ comm_(comm), requests_(ppRanks.size(), MPI_REQUEST_NULL), deviceContext_(deviceContext)
{
- GMX_RELEASE_ASSERT(GMX_THREAD_MPI, "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
+ // Create streams to manage pipelining
+ ppCommManagers_.reserve(ppRanks.size());
+ for (auto& ppRank : ppRanks)
+ {
+ ppCommManagers_.emplace_back(PpCommManager{
+ ppRank, // details of the PP rank communicated with
+ std::make_unique<DeviceStream>(deviceContext_, DeviceStreamPriority::High, false), // dedicated high-priority stream for this rank
+ nullptr, // event synchronizer, received from the PP rank on the thread-MPI path
+ { 0, 0 } }); // atom range, filled in by reinitCoordinateReceiver()
+ }
}
PmeCoordinateReceiverGpu::Impl::~Impl() = default;
-void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(rvec *d_x)
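+/*! \brief Re-initialize the atom range for each PP rank and, for thread-MPI, send the GPU coordinate buffer address to the PP ranks */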
+void PmeCoordinateReceiverGpu::Impl::reinitCoordinateReceiver(DeviceBuffer<RVec> d_x)
{
-
- int ind_start = 0;
- int ind_end = 0;
- for (const auto &receiver : ppRanks_)
+ int indEnd = 0;
+ for (auto& ppCommManager : ppCommManagers_)
{
- ind_start = ind_end;
- ind_end = ind_start + receiver.numAtoms;
+ int indStart = indEnd;
+ indEnd = indStart + ppCommManager.ppRank.numAtoms;
- //Data will be transferred directly from GPU.
- void* sendBuf = reinterpret_cast<void*> (&d_x[ind_start]);
+ ppCommManager.atomRange = std::make_tuple(indStart, indEnd);
+ // Need to send the address to the PP rank only for thread-MPI, as the PP rank pushes the data directly with cudaMemcpy
+ if (GMX_THREAD_MPI)
+ {
+ // Data will be transferred directly from GPU.
+ void* sendBuf = reinterpret_cast<void*>(&d_x[indStart]);
#if GMX_MPI
- MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId,
- 0, comm_);
+ MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, ppCommManager.ppRank.rankId, 0, comm_);
+#else
+ GMX_UNUSED_VALUE(sendBuf);
#endif
-
+ }
}
}
-/*! \brief Receive coordinate data directly using CUDA memory copy */
-void PmeCoordinateReceiverGpu::Impl::receiveCoordinatesFromPpCudaDirect(int ppRank)
+/*! \brief Receive coordinate synchronizer pointer from the PP ranks. */
+void PmeCoordinateReceiverGpu::Impl::receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank)
{
+ GMX_ASSERT(GMX_THREAD_MPI,
+ "receiveCoordinatesSynchronizerFromPpCudaDirect is expected to be called only for "
+ "Thread-MPI");
+
// Data will be pushed directly from PP task
#if GMX_MPI
- // Receive event from PP task and add to PME stream, to ensure PME calculation doesn't
- // commence until coordinate data has been transferred
- GpuEventSynchronizer *ppSync;
- MPI_Recv(&ppSync, sizeof(GpuEventSynchronizer*),
- MPI_BYTE, ppRank, 0,
- comm_, MPI_STATUS_IGNORE);
- ppSync->enqueueWaitEvent(pmeStream_);
+ // Receive event from PP task
+ // NOLINTNEXTLINE(bugprone-sizeof-expression)
+ MPI_Irecv(&ppCommManagers_[ppRank].sync,
+ sizeof(GpuEventSynchronizer*),
+ MPI_BYTE,
+ ppRank,
+ 0,
+ comm_,
+ &(requests_[ppRank]));
+#else
+ GMX_UNUSED_VALUE(ppRank);
#endif
}
-PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(void *pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks)
- : impl_(new Impl(pmeStream, comm, ppRanks))
+/*! \brief Receive coordinate data using CUDA-aware MPI */
+void PmeCoordinateReceiverGpu::Impl::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf,
+ int numAtoms,
+ int numBytes,
+ int ppRank)
+{
+ GMX_ASSERT(GMX_LIB_MPI,
+ "launchReceiveCoordinatesFromPpCudaMpi is expected to be called only for Lib-MPI");
+
+#if GMX_MPI
+ MPI_Irecv(&recvbuf[numAtoms], numBytes, MPI_BYTE, ppRank, eCommType_COORD_GPU, comm_, &(requests_[ppRank]));
+#else
+ GMX_UNUSED_VALUE(recvbuf);
+ GMX_UNUSED_VALUE(numAtoms);
+ GMX_UNUSED_VALUE(numBytes);
+ GMX_UNUSED_VALUE(ppRank);
+#endif
+}
+
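+/*! \brief Wait for coordinates from a single PP rank and return the index of the sending rank */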
+int PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromPpRank(int pipelineStage,
+ const DeviceStream& deviceStream)
+{
+#if GMX_MPI
+ int senderRank = -1; // Rank of PP task that is associated with this invocation.
+# if (!GMX_THREAD_MPI)
+ // Wait on data from any one of the PP sender GPUs
+ MPI_Waitany(requests_.size(), requests_.data(), &senderRank, MPI_STATUS_IGNORE);
+ GMX_ASSERT(senderRank >= 0, "Rank of sending PP task must be 0 or greater");
+ GMX_UNUSED_VALUE(pipelineStage);
+ GMX_UNUSED_VALUE(deviceStream);
+# else
+ // MPI_Waitany is not available in thread-MPI. However, the
+ // MPI_Wait here is not associated with data but is host-side
+ // scheduling code to receive a CUDA event, and will be executed
+ // in advance of the actual data transfer. Therefore we can
+ // receive in order of pipeline stage, still allowing the
+ // scheduled GPU-direct comms to initiate out-of-order in their
+ // respective streams. For cases with CPU force computations, the
+ // scheduling is less asynchronous (done on a per-step basis), so
+ // host-side improvements should be investigated as tracked in
+ // issue #4047
+ senderRank = pipelineStage;
+ MPI_Wait(&(requests_[senderRank]), MPI_STATUS_IGNORE);
+ ppCommManagers_[senderRank].sync->enqueueWaitEvent(deviceStream);
+# endif
+ return senderRank;
+#else
+ GMX_UNUSED_VALUE(pipelineStage);
+ GMX_UNUSED_VALUE(deviceStream);
+ return -1;
+#endif
+}
+
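+/*! \brief Wait for coordinates from all PP ranks */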
+void PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream)
+{
+ for (int i = 0; i < static_cast<int>(ppCommManagers_.size()); i++)
+ {
+ synchronizeOnCoordinatesFromPpRank(i, deviceStream);
+ }
+}
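+
+/*! \brief Return the stream used for communication with a given sending PP rank */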
+DeviceStream* PmeCoordinateReceiverGpu::Impl::ppCommStream(int senderIndex)
+{
+ return ppCommManagers_[senderIndex].stream.get();
+}
+
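+/*! \brief Return the atom range assigned to a given sending PP rank */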
+std::tuple<int, int> PmeCoordinateReceiverGpu::Impl::ppCommAtomRange(int senderIndex)
+{
+ return ppCommManagers_[senderIndex].atomRange;
+}
+
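+/*! \brief Return the number of PP ranks that send coordinates to this PME rank */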
+int PmeCoordinateReceiverGpu::Impl::ppCommNumSenderRanks()
+{
+ return static_cast<int>(ppCommManagers_.size());
+}
+
+PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(MPI_Comm comm,
+ const DeviceContext& deviceContext,
+ gmx::ArrayRef<PpRanks> ppRanks) :
+ impl_(new Impl(comm, deviceContext, ppRanks))
{
}
PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default;
-void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(rvec* d_x)
+void PmeCoordinateReceiverGpu::reinitCoordinateReceiver(DeviceBuffer<RVec> d_x)
+{
+ impl_->reinitCoordinateReceiver(d_x);
+}
+
+void PmeCoordinateReceiverGpu::receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank)
+{
+ impl_->receiveCoordinatesSynchronizerFromPpCudaDirect(ppRank);
+}
+
+void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf,
+ int numAtoms,
+ int numBytes,
+ int ppRank)
+{
+ impl_->launchReceiveCoordinatesFromPpCudaMpi(recvbuf, numAtoms, numBytes, ppRank);
+}
+
+int PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRank(int senderIndex,
+ const DeviceStream& deviceStream)
+{
+ return impl_->synchronizeOnCoordinatesFromPpRank(senderIndex, deviceStream);
+}
+
+void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream)
+{
+ impl_->synchronizeOnCoordinatesFromAllPpRanks(deviceStream);
+}
+
+DeviceStream* PmeCoordinateReceiverGpu::ppCommStream(int senderIndex)
+{
+ return impl_->ppCommStream(senderIndex);
+}
+
+std::tuple<int, int> PmeCoordinateReceiverGpu::ppCommAtomRange(int senderIndex)
{
- impl_->sendCoordinateBufferAddressToPpRanks(d_x);
+ return impl_->ppCommAtomRange(senderIndex);
}
-void PmeCoordinateReceiverGpu::receiveCoordinatesFromPpCudaDirect(int ppRank)
+int PmeCoordinateReceiverGpu::ppCommNumSenderRanks()
{
- impl_->receiveCoordinatesFromPpCudaDirect(ppRank);
+ return impl_->ppCommNumSenderRanks();
}
-} //namespace gmx
+
+} // namespace gmx