From e0da8cce105ec43120c59a6d64611cc3f4f2c610 Mon Sep 17 00:00:00 2001
From: Alan Gray
Date: Wed, 29 Sep 2021 14:10:58 +0000
Subject: [PATCH] Avoid MPI sync for PME force sender GPU scheduling code and
 thread API calls

Replaces the synchronous PME-PP MPI communication of an event at every
step with an exchange of the event address and an associated flag only
on search steps. The PP rank now ensures that the event has been
recorded before enqueueing it, by spinning on a flag written by the PME
rank in shared CPU memory. This allows not only asynchronous progress
by the PME rank, but also OpenMP parallelization of the cudaMemcpy
launches to the multiple PP ranks, so that the CUDA API overheads
overlap.

Partly addresses #4047
---
 .../ewald/pme_force_sender_gpu_impl.cu        | 15 ++++-
 src/gromacs/ewald/pme_force_sender_gpu_impl.h | 17 +++++
 src/gromacs/ewald/pme_only.cpp                | 67 +++++++++++--------
 src/gromacs/ewald/pme_pp_comm_gpu_impl.cu     | 21 +++---
 src/gromacs/ewald/pme_pp_comm_gpu_impl.h      |  6 ++
 5 files changed, 86 insertions(+), 40 deletions(-)

diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
index e037679cec..066763f8f0 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
+++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
@@ -64,6 +64,7 @@ PmeForceSenderGpu::Impl::Impl(GpuEventSynchronizer* pmeForcesReady,
     ppRanks_(ppRanks),
     ppCommStream_(ppRanks.size()),
     ppCommEvent_(ppRanks.size()),
+    ppCommEventRecorded_(ppRanks.size()),
     deviceContext_(deviceContext),
     pmeRemoteCpuForcePtr_(ppRanks.size()),
     pmeRemoteGpuForcePtr_(ppRanks.size())
@@ -113,6 +114,15 @@ void PmeForceSenderGpu::Impl::setForceSendBuffer(DeviceBuffer<RVec> d_f)
         MPI_Recv(&pmeRemoteGpuForcePtr_[i], sizeof(float3*), MPI_BYTE, receiver.rankId, 0, comm_, MPI_STATUS_IGNORE);
         // NOLINTNEXTLINE(bugprone-sizeof-expression)
         MPI_Recv(&pmeRemoteCpuForcePtr_[i], sizeof(float3*), MPI_BYTE, receiver.rankId, 0, comm_, MPI_STATUS_IGNORE);
+        // Send address of event and associated flag to PP rank, to allow remote enqueueing
+        // NOLINTNEXTLINE(bugprone-sizeof-expression)
+        MPI_Send(&ppCommEvent_[i], sizeof(GpuEventSynchronizer*), MPI_BYTE, receiver.rankId, 0, comm_);
+
+        std::atomic<bool>* tmpPpCommEventRecordedPtr =
+                reinterpret_cast<std::atomic<bool>*>(&(ppCommEventRecorded_[i]));
+        tmpPpCommEventRecordedPtr->store(false, std::memory_order_release);
+        // NOLINTNEXTLINE(bugprone-sizeof-expression)
+        MPI_Send(&tmpPpCommEventRecordedPtr, sizeof(std::atomic<bool>*), MPI_BYTE, receiver.rankId, 0, comm_);
         i++;
     }
 
@@ -142,8 +152,9 @@ void PmeForceSenderGpu::Impl::sendFToPpCudaDirect(int ppRank, int numAtoms, bool
                              ppCommStream_[ppRank]->stream());
     CU_RET_ERR(stat, "cudaMemcpyAsync on Recv from PME CUDA direct data transfer failed");
     ppCommEvent_[ppRank]->markEvent(*ppCommStream_[ppRank]);
-    // NOLINTNEXTLINE(bugprone-sizeof-expression)
-    MPI_Send(&ppCommEvent_[ppRank], sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_);
+    std::atomic<bool>* tmpPpCommEventRecordedPtr =
+            reinterpret_cast<std::atomic<bool>*>(&(ppCommEventRecorded_[ppRank]));
+    tmpPpCommEventRecordedPtr->store(true, std::memory_order_release);
 #else
     GMX_UNUSED_VALUE(ppRank);
     GMX_UNUSED_VALUE(numAtoms);
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.h b/src/gromacs/ewald/pme_force_sender_gpu_impl.h
index 5575f03b07..6517d82bb0 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu_impl.h
+++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.h
@@ -43,11 +43,21 @@
 #ifndef GMX_PMEFORCESENDERGPU_IMPL_H
 #define GMX_PMEFORCESENDERGPU_IMPL_H
 
+#include <atomic>
+#include <new>
+
 #include "gromacs/ewald/pme_force_sender_gpu.h"
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gputraits.h"
 #include "gromacs/utility/arrayref.h"
 
+// Portable definition of cache line size
+#ifdef __cpp_lib_hardware_interference_size
+using std::hardware_destructive_interference_size;
+#else
+constexpr std::size_t hardware_destructive_interference_size = 64;
+#endif
+
 class GpuEventSynchronizer;
 
 namespace gmx
@@ -55,6 +65,11 @@ namespace gmx
 
 /*! \internal \brief Class with interfaces and data for CUDA version of PME Force sending functionality*/
 
+typedef struct CacheLineAlignedFlag
+{
+    alignas(hardware_destructive_interference_size) bool flag;
+} CacheLineAlignedFlag;
+
 class PmeForceSenderGpu::Impl
 {
 
@@ -107,6 +122,8 @@ private:
     std::vector<std::unique_ptr<DeviceStream>> ppCommStream_;
     //! Events used for manging sync with remote PP ranks
     std::vector<std::unique_ptr<GpuEventSynchronizer>> ppCommEvent_;
+    //! Vector of flags to track when PP transfer events have been recorded
+    std::vector<std::atomic<CacheLineAlignedFlag>> ppCommEventRecorded_;
     //! Addresses of local force buffers to send to remote PP ranks
     std::vector<DeviceBuffer<RVec>> localForcePtr_;
     //! GPU context handle (not used in CUDA)
diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp
index 8e33845218..3151ec1ca7 100644
--- a/src/gromacs/ewald/pme_only.cpp
+++ b/src/gromacs/ewald/pme_only.cpp
@@ -101,6 +101,7 @@
 #include "gromacs/utility/smalloc.h"
 
 #include "pme_gpu_internal.h"
+#include "pme_internal.h"
 #include "pme_output.h"
 #include "pme_pp_communication.h"
 
@@ -546,46 +547,54 @@ static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
     int      messages, ind_start, ind_end;
     cve.cycles = cycles;
 
-    /* Now the evaluated forces have to be transferred to the PP nodes */
+    if (pme_pp->useGpuDirectComm)
+    {
+        GMX_ASSERT((pme_pp->pmeForceSenderGpu != nullptr),
+                   "The use of GPU direct communication for PME-PP is enabled, "
+                   "but the PME GPU force receiver object does not exist");
+    }
+
     messages = 0;
     ind_end  = 0;
-    for (const auto& receiver : pme_pp->ppRanks)
+
+    /* Now the evaluated forces have to be transferred to the PP ranks */
+    if (pme_pp->useGpuDirectComm && GMX_THREAD_MPI)
     {
-        ind_start = ind_end;
-        ind_end   = ind_start + receiver.numAtoms;
-        if (pme_pp->useGpuDirectComm)
+        int numPpRanks = static_cast<int>(pme_pp->ppRanks.size());
+# pragma omp parallel for num_threads(std::min(numPpRanks, pme.nthread)) schedule(static)
+        for (int i = 0; i < numPpRanks; i++)
         {
-            GMX_ASSERT((pme_pp->pmeForceSenderGpu != nullptr),
-                       "The use of GPU direct communication for PME-PP is enabled, "
-                       "but the PME GPU force reciever object does not exist");
-
-            if (GMX_THREAD_MPI)
-            {
-                pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(
-                        receiver.rankId, receiver.numAtoms, pme_pp->sendForcesDirectToPpGpu);
-            }
-            else
+            auto& receiver = pme_pp->ppRanks[i];
+            pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(
+                    receiver.rankId, receiver.numAtoms, pme_pp->sendForcesDirectToPpGpu);
+        }
+    }
+    else
+    {
+        for (const auto& receiver : pme_pp->ppRanks)
+        {
+            ind_start = ind_end;
+            ind_end   = ind_start + receiver.numAtoms;
+            if (pme_pp->useGpuDirectComm)
             {
                 pme_pp->pmeForceSenderGpu->sendFToPpCudaMpi(pme_gpu_get_device_f(&pme),
                                                             ind_start,
                                                             receiver.numAtoms * sizeof(rvec),
                                                             receiver.rankId,
                                                             &pme_pp->req[messages]);
-
-                messages++;
             }
-        }
-        else
-        {
-            void* sendbuf = const_cast<void*>(static_cast<const void*>(output.forces_[ind_start]));
-            // Send using MPI
-            MPI_Isend(sendbuf,
-                      receiver.numAtoms * sizeof(rvec),
-                      MPI_BYTE,
-                      receiver.rankId,
-                      0,
-                      pme_pp->mpi_comm_mysim,
-                      &pme_pp->req[messages]);
+            else
+            {
+                void* sendbuf = const_cast<void*>(static_cast<const void*>(output.forces_[ind_start]));
+                // Send using MPI
+                MPI_Isend(sendbuf,
+                          receiver.numAtoms * sizeof(rvec),
+                          MPI_BYTE,
+                          receiver.rankId,
+                          0,
+                          pme_pp->mpi_comm_mysim,
+                          &pme_pp->req[messages]);
+            }
             messages++;
         }
     }
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
index 1ec7104ef1..8c29a1cf1b 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
@@ -91,24 +91,27 @@ void PmePpCommGpu::Impl::reinit(int size)
         MPI_Send(&d_pmeForces_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_);
         RVec* pmeCpuForceBufferData = pmeCpuForceBuffer_->data();
         MPI_Send(&pmeCpuForceBufferData, sizeof(RVec*), MPI_BYTE, pmeRank_, 0, comm_);
+        // Receive address of event and associated flag from PME rank, to allow sync to local stream after force transfer
+        MPI_Recv(&remotePmeForceSendEvent_, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
+        MPI_Recv(&remotePmeForceSendEventRecorded_, sizeof(std::atomic<bool>*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
     }
 #endif
 }
 
-// TODO make this asynchronous by splitting into this into
-// launchRecvForceFromPmeCudaDirect() and sycnRecvForceFromPmeCudaDirect()
 void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(bool receivePmeForceToGpu)
 {
 #if GMX_MPI
-    // Remote PME task pushes GPU data directly data to this PP task.
+    // Wait until remote PME task has pushed data, and then enqueue remote event to local stream.
 
-    // Recieve event from PME task after PME->PP force data push has
-    // been scheduled and enqueue this to PP stream.
-    GpuEventSynchronizer* eventptr;
-    // NOLINTNEXTLINE(bugprone-sizeof-expression)
-    MPI_Recv(&eventptr, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
-    eventptr->enqueueWaitEvent(pmePpCommStream_);
+    // Spin until PME rank sets flag
+    while (!(remotePmeForceSendEventRecorded_->load(std::memory_order_acquire))) {};
+
+    // Enqueue remote event
+    remotePmeForceSendEvent_->enqueueWaitEvent(pmePpCommStream_);
+
+    // Reset the flag
+    remotePmeForceSendEventRecorded_->store(false, std::memory_order_release);
 
     if (receivePmeForceToGpu)
     {
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
index 13b72cc66a..f4ec9214e8 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
@@ -43,6 +43,8 @@
 #ifndef GMX_PME_PP_COMM_GPU_IMPL_H
 #define GMX_PME_PP_COMM_GPU_IMPL_H
 
+#include <atomic>
+
 #include "gromacs/ewald/pme_pp_comm_gpu.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.h"
 #include "gromacs/math/vectypes.h"
@@ -176,6 +178,10 @@ private:
     GpuEventSynchronizer forcesReadySynchronizer_;
    //! Event recorded when coordinates have been transferred to PME task
     GpuEventSynchronizer pmeCoordinatesSynchronizer_;
+    //! Event recorded by remote PME task when forces have been transferred
+    GpuEventSynchronizer* remotePmeForceSendEvent_;
+    //! Flag to track when remote PME event has been recorded, ready for enqueueing
+    volatile std::atomic<bool>* remotePmeForceSendEventRecorded_;
 };
 
 } // namespace gmx
-- 
2.22.0
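
Illustration (not taken from the patch): the flag handshake described in the commit
message can be sketched as a standalone C++17 program, assuming a shared address
space between the PME and PP ranks as with thread-MPI. The GPU calls are shown only
as comments, and the names SharedFlag, pmeSendForces and ppReceiveForces are
illustrative rather than GROMACS identifiers.

// Minimal sketch of the release/acquire flag handshake introduced by this patch.
#include <atomic>
#include <cstdio>
#include <thread>

struct SharedFlag
{
    // One flag per PP rank; cache-line alignment avoids false sharing
    // (the patch uses hardware_destructive_interference_size for this).
    alignas(64) std::atomic<bool> recorded{ false };
};

// PME side: schedule the force push, record the event, then publish the flag.
void pmeSendForces(SharedFlag* flag)
{
    // cudaMemcpyAsync(..., ppCommStream);   // push forces towards the PP rank
    // ppCommEvent->markEvent(ppCommStream); // record the event in that stream
    flag->recorded.store(true, std::memory_order_release); // event is now recorded
}

// PP side: spin until the event has been recorded, enqueue it, then reset the flag.
void ppReceiveForces(SharedFlag* flag)
{
    while (!flag->recorded.load(std::memory_order_acquire)) {} // spin on shared flag
    // remotePmeForceSendEvent->enqueueWaitEvent(pmePpCommStream); // safe to enqueue now
    flag->recorded.store(false, std::memory_order_release); // ready for the next step
    std::printf("PP side observed the recorded event and enqueued the wait\n");
}

int main()
{
    SharedFlag flag;
    std::thread pp(ppReceiveForces, &flag); // models one PP rank
    std::thread pme(pmeSendForces, &flag);  // models the PME rank
    pme.join();
    pp.join();
    return 0;
}

Because the event and flag addresses are exchanged only on search steps, the
per-step work on the PME rank reduces to the release store above, which is what
allows the cudaMemcpy launches to different PP ranks to be issued from an OpenMP
parallel region without blocking on MPI.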