Avoid MPI sync for PME force sender GPU scheduling code and thread API calls
author Alan Gray <alangray3@gmail.com>
Wed, 29 Sep 2021 14:10:58 +0000 (14:10 +0000)
committer Andrey Alekseenko <al42and@gmail.com>
Wed, 29 Sep 2021 14:10:58 +0000 (14:10 +0000)
Replaces the synchronous PME-PP MPI communication of the event at
every step with an exchange of the event address and an associated
flag, performed only on search steps. The PP rank now ensures that the
event has been recorded before enqueueing it by spinning on a flag
written by the PME rank in shared CPU memory. This allows not only
asynchronous progress by the PME rank, but also OpenMP parallelization
of the cudaMemcpyAsync launches to the multiple PP ranks, so that the
CUDA API overheads overlap.

Partly addresses #4047
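
For illustration, a minimal sketch of the new handshake. This is not
GROMACS code: std::thread stands in for the thread-MPI PME and PP
ranks, and the CUDA event record/enqueue calls are reduced to comments.

    #include <atomic>
    #include <cstdio>
    #include <thread>

    // Flag in CPU memory visible to both "ranks" (threads here)
    static std::atomic<bool> eventRecorded{ false };

    static void pmeRank()
    {
        // ... cudaMemcpyAsync of forces to the PP rank, then markEvent() on the stream ...
        std::printf("PME: event recorded\n");
        eventRecorded.store(true, std::memory_order_release); // publish; pairs with acquire below
    }

    static void ppRank()
    {
        while (!eventRecorded.load(std::memory_order_acquire)) {} // spin until PME has recorded
        // ... enqueueWaitEvent() of the remote event into the local PP stream ...
        std::printf("PP: enqueued wait on remote event\n");
        eventRecorded.store(false, std::memory_order_release); // reset for the next step
    }

    int main()
    {
        std::thread pp(ppRank);
        std::thread pme(pmeRank);
        pme.join();
        pp.join();
        return 0;
    }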

src/gromacs/ewald/pme_force_sender_gpu_impl.cu
src/gromacs/ewald/pme_force_sender_gpu_impl.h
src/gromacs/ewald/pme_only.cpp
src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
src/gromacs/ewald/pme_pp_comm_gpu_impl.h

index e037679cec188cd38457229ffd16f39bc194b272..066763f8f007bb144b241787b2786f2024a52142 100644 (file)
@@ -64,6 +64,7 @@ PmeForceSenderGpu::Impl::Impl(GpuEventSynchronizer*  pmeForcesReady,
     ppRanks_(ppRanks),
     ppCommStream_(ppRanks.size()),
     ppCommEvent_(ppRanks.size()),
+    ppCommEventRecorded_(ppRanks.size()),
     deviceContext_(deviceContext),
     pmeRemoteCpuForcePtr_(ppRanks.size()),
     pmeRemoteGpuForcePtr_(ppRanks.size())
@@ -113,6 +114,15 @@ void PmeForceSenderGpu::Impl::setForceSendBuffer(DeviceBuffer<Float3> d_f)
         MPI_Recv(&pmeRemoteGpuForcePtr_[i], sizeof(float3*), MPI_BYTE, receiver.rankId, 0, comm_, MPI_STATUS_IGNORE);
         // NOLINTNEXTLINE(bugprone-sizeof-expression)
         MPI_Recv(&pmeRemoteCpuForcePtr_[i], sizeof(float3*), MPI_BYTE, receiver.rankId, 0, comm_, MPI_STATUS_IGNORE);
+        // Send address of event and associated flag to PP rank, to allow remote enqueueing
+        // NOLINTNEXTLINE(bugprone-sizeof-expression)
+        MPI_Send(&ppCommEvent_[i], sizeof(GpuEventSynchronizer*), MPI_BYTE, receiver.rankId, 0, comm_);
+
+        std::atomic<bool>* tmpPpCommEventRecordedPtr =
+                reinterpret_cast<std::atomic<bool>*>(&(ppCommEventRecorded_[i]));
+        tmpPpCommEventRecordedPtr->store(false, std::memory_order_release);
+        // NOLINTNEXTLINE(bugprone-sizeof-expression)
+        MPI_Send(&tmpPpCommEventRecordedPtr, sizeof(std::atomic<bool>*), MPI_BYTE, receiver.rankId, 0, comm_);
         i++;
     }
 
@@ -142,8 +152,9 @@ void PmeForceSenderGpu::Impl::sendFToPpCudaDirect(int ppRank, int numAtoms, bool
                                        ppCommStream_[ppRank]->stream());
     CU_RET_ERR(stat, "cudaMemcpyAsync on Recv from PME CUDA direct data transfer failed");
     ppCommEvent_[ppRank]->markEvent(*ppCommStream_[ppRank]);
-    // NOLINTNEXTLINE(bugprone-sizeof-expression)
-    MPI_Send(&ppCommEvent_[ppRank], sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_);
+    std::atomic<bool>* tmpPpCommEventRecordedPtr =
+            reinterpret_cast<std::atomic<bool>*>(&(ppCommEventRecorded_[ppRank]));
+    tmpPpCommEventRecordedPtr->store(true, std::memory_order_release);
 #else
     GMX_UNUSED_VALUE(ppRank);
     GMX_UNUSED_VALUE(numAtoms);
index 5575f03b071e5e888ceb441bb3f6bb787987957e..6517d82bb0dde049b1d030a5e1010b4ff89c1fd0 100644 (file)
 #ifndef GMX_PMEFORCESENDERGPU_IMPL_H
 #define GMX_PMEFORCESENDERGPU_IMPL_H
 
+#include <atomic>
+#include <new>
+
 #include "gromacs/ewald/pme_force_sender_gpu.h"
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gputraits.h"
 #include "gromacs/utility/arrayref.h"
 
+// Portable definition of cache line size
+#ifdef __cpp_lib_hardware_interference_size
+using std::hardware_destructive_interference_size;
+#else
+constexpr std::size_t hardware_destructive_interference_size = 64;
+#endif
+
 class GpuEventSynchronizer;
 
 namespace gmx
@@ -55,6 +65,11 @@ namespace gmx
 
 /*! \internal \brief Class with interfaces and data for CUDA version of PME Force sending functionality*/
 
+typedef struct CacheLineAlignedFlag
+{
+    alignas(hardware_destructive_interference_size) bool flag;
+} CacheLineAlignedFlag;
+
 class PmeForceSenderGpu::Impl
 {
 
@@ -107,6 +122,8 @@ private:
     std::vector<std::unique_ptr<DeviceStream>> ppCommStream_;
     //! Events used for managing sync with remote PP ranks
     std::vector<std::unique_ptr<GpuEventSynchronizer>> ppCommEvent_;
+    //! Vector of flags to track when PP transfer events have been recorded
+    std::vector<std::atomic<CacheLineAlignedFlag>> ppCommEventRecorded_;
     //! Addresses of local force buffers to send to remote PP ranks
     std::vector<DeviceBuffer<RVec>> localForcePtr_;
     //! GPU context handle (not used in CUDA)
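
Note on the flag type above: each PP rank spins on its own entry of
ppCommEventRecorded_, so entries are padded to a cache line to avoid
false sharing between ranks. A standalone sketch of the same padding
pattern (the 64-byte fallback is an assumption matching the header's
default, not a guarantee for every platform):

    #include <cstddef>
    #include <new>

    #ifdef __cpp_lib_hardware_interference_size
    constexpr std::size_t cacheLine = std::hardware_destructive_interference_size;
    #else
    constexpr std::size_t cacheLine = 64; // assumed fallback, as in the header above
    #endif

    struct PaddedFlag
    {
        alignas(cacheLine) bool flag;
    };

    // Each array element starts on (and fills) its own cache line, so a thread
    // spinning on element i never shares a line with the writer of element j.
    static_assert(alignof(PaddedFlag) == cacheLine, "flag is cache-line aligned");
    static_assert(sizeof(PaddedFlag) == cacheLine, "flag is padded to a full cache line");

    int main() {}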
index 8e33845218145d6a0b85ff6138c275301cc3bb02..3151ec1ca719e906edeae6acc140d6a03f73b51a 100644 (file)
 #include "gromacs/utility/smalloc.h"
 
 #include "pme_gpu_internal.h"
+#include "pme_internal.h"
 #include "pme_output.h"
 #include "pme_pp_communication.h"
 
@@ -546,46 +547,54 @@ static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
     int                    messages, ind_start, ind_end;
     cve.cycles = cycles;
 
-    /* Now the evaluated forces have to be transferred to the PP nodes */
+    if (pme_pp->useGpuDirectComm)
+    {
+        GMX_ASSERT((pme_pp->pmeForceSenderGpu != nullptr),
+                   "The use of GPU direct communication for PME-PP is enabled, "
+                   "but the PME GPU force receiver object does not exist");
+    }
+
     messages = 0;
     ind_end  = 0;
-    for (const auto& receiver : pme_pp->ppRanks)
+
+    /* Now the evaluated forces have to be transferred to the PP ranks */
+    if (pme_pp->useGpuDirectComm && GMX_THREAD_MPI)
     {
-        ind_start = ind_end;
-        ind_end   = ind_start + receiver.numAtoms;
-        if (pme_pp->useGpuDirectComm)
+        int numPpRanks = static_cast<int>(pme_pp->ppRanks.size());
+#    pragma omp parallel for num_threads(std::min(numPpRanks, pme.nthread)) schedule(static)
+        for (int i = 0; i < numPpRanks; i++)
         {
-            GMX_ASSERT((pme_pp->pmeForceSenderGpu != nullptr),
-                       "The use of GPU direct communication for PME-PP is enabled, "
-                       "but the PME GPU force reciever object does not exist");
-
-            if (GMX_THREAD_MPI)
-            {
-                pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(
-                        receiver.rankId, receiver.numAtoms, pme_pp->sendForcesDirectToPpGpu);
-            }
-            else
+            auto& receiver = pme_pp->ppRanks[i];
+            pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(
+                    receiver.rankId, receiver.numAtoms, pme_pp->sendForcesDirectToPpGpu);
+        }
+    }
+    else
+    {
+        for (const auto& receiver : pme_pp->ppRanks)
+        {
+            ind_start = ind_end;
+            ind_end   = ind_start + receiver.numAtoms;
+            if (pme_pp->useGpuDirectComm)
             {
                 pme_pp->pmeForceSenderGpu->sendFToPpCudaMpi(pme_gpu_get_device_f(&pme),
                                                             ind_start,
                                                             receiver.numAtoms * sizeof(rvec),
                                                             receiver.rankId,
                                                             &pme_pp->req[messages]);
-
-                messages++;
             }
-        }
-        else
-        {
-            void* sendbuf = const_cast<void*>(static_cast<const void*>(output.forces_[ind_start]));
-            // Send using MPI
-            MPI_Isend(sendbuf,
-                      receiver.numAtoms * sizeof(rvec),
-                      MPI_BYTE,
-                      receiver.rankId,
-                      0,
-                      pme_pp->mpi_comm_mysim,
-                      &pme_pp->req[messages]);
+            else
+            {
+                void* sendbuf = const_cast<void*>(static_cast<const void*>(output.forces_[ind_start]));
+                // Send using MPI
+                MPI_Isend(sendbuf,
+                          receiver.numAtoms * sizeof(rvec),
+                          MPI_BYTE,
+                          receiver.rankId,
+                          0,
+                          pme_pp->mpi_comm_mysim,
+                          &pme_pp->req[messages]);
+            }
             messages++;
         }
     }
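
With no blocking MPI_Send left in sendFToPpCudaDirect(), the per-rank
launches in the thread-MPI branch above are independent, which is what
makes the OpenMP parallel for legal; the CUDA API cost of each
cudaMemcpyAsync/event record is then paid on a different thread. A
shape-only sketch of that launch pattern (launchAsyncCopyToRank() is a
placeholder, not a GROMACS function):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Placeholder for the per-rank work of sendFToPpCudaDirect(): enqueue the
    // cudaMemcpyAsync on that rank's stream, record the event, set the flag.
    static void launchAsyncCopyToRank(int rank)
    {
        std::printf("launching async copy for PP rank %d\n", rank);
    }

    int main()
    {
        const std::vector<int> ppRanks       = { 0, 1, 2, 3 }; // stand-in for pme_pp->ppRanks
        const int              numPpRanks    = static_cast<int>(ppRanks.size());
        const int              pmeNumThreads = 8; // stand-in for pme.nthread

        // Each iteration only launches asynchronous work, so iterations are independent
    #pragma omp parallel for num_threads(std::min(numPpRanks, pmeNumThreads)) schedule(static)
        for (int i = 0; i < numPpRanks; i++)
        {
            launchAsyncCopyToRank(ppRanks[i]);
        }
        return 0;
    }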
index 1ec7104ef1db05a2f2881819b2f1947fd87c413c..8c29a1cf1bd2d8542fe482c8239ffad9ba644fdc 100644 (file)
@@ -91,24 +91,27 @@ void PmePpCommGpu::Impl::reinit(int size)
         MPI_Send(&d_pmeForces_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_);
         RVec* pmeCpuForceBufferData = pmeCpuForceBuffer_->data();
         MPI_Send(&pmeCpuForceBufferData, sizeof(RVec*), MPI_BYTE, pmeRank_, 0, comm_);
+        // Receive address of event and associated flag from PME rank, to allow sync to local stream after force transfer
+        MPI_Recv(&remotePmeForceSendEvent_, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
+        MPI_Recv(&remotePmeForceSendEventRecorded_, sizeof(std::atomic<bool>*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
     }
 
 #endif
 }
 
-// TODO make this asynchronous by splitting into this into
-// launchRecvForceFromPmeCudaDirect() and sycnRecvForceFromPmeCudaDirect()
 void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(bool receivePmeForceToGpu)
 {
 #if GMX_MPI
-    // Remote PME task pushes GPU data directly data to this PP task.
+    // Wait until remote PME task has pushed data, and then enqueue remote event to local stream.
 
-    // Recieve event from PME task after PME->PP force data push has
-    // been scheduled and enqueue this to PP stream.
-    GpuEventSynchronizer* eventptr;
-    // NOLINTNEXTLINE(bugprone-sizeof-expression)
-    MPI_Recv(&eventptr, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
-    eventptr->enqueueWaitEvent(pmePpCommStream_);
+    // Spin until PME rank sets flag
+    while (!(remotePmeForceSendEventRecorded_->load(std::memory_order_acquire))) {};
+
+    // Enqueue remote event
+    remotePmeForceSendEvent_->enqueueWaitEvent(pmePpCommStream_);
+
+    // Reset the flag
+    remotePmeForceSendEventRecorded_->store(false, std::memory_order_release);
 
     if (receivePmeForceToGpu)
     {
index 13b72cc66ae49582a4703740d7c923c44dab1df6..f4ec9214e8696398d93ca6763d484f92a3d9c29c 100644 (file)
@@ -43,6 +43,8 @@
 #ifndef GMX_PME_PP_COMM_GPU_IMPL_H
 #define GMX_PME_PP_COMM_GPU_IMPL_H
 
+#include <atomic>
+
 #include "gromacs/ewald/pme_pp_comm_gpu.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.h"
 #include "gromacs/math/vectypes.h"
@@ -176,6 +178,10 @@ private:
     GpuEventSynchronizer forcesReadySynchronizer_;
     //! Event recorded when coordinates have been transferred to PME task
     GpuEventSynchronizer pmeCoordinatesSynchronizer_;
+    //! Event recorded by remote PME task when forces have been transferred
+    GpuEventSynchronizer* remotePmeForceSendEvent_;
+    //! Flag to track when remote PP event has been recorded, ready for enqueueing
+    volatile std::atomic<bool>* remotePmeForceSendEventRecorded_;
 };
 
 } // namespace gmx