Avoid MPI sync for PME force sender GPU scheduling code and thread API calls
author Alan Gray <alangray3@gmail.com>
Wed, 29 Sep 2021 14:10:58 +0000 (14:10 +0000)
committer Andrey Alekseenko <al42and@gmail.com>
Wed, 29 Sep 2021 14:10:58 +0000 (14:10 +0000)
Replaces the synchronous PME-PP MPI communication of the event at
every step with an exchange of the event address and an associated
flag, performed only on search steps. The PP rank now ensures that the
event has been recorded before enqueueing it by spinning on a flag
written by the PME rank in shared CPU memory. This allows not only
asynchronous progress by the PME rank, but also OpenMP parallelization
of the cudaMemcpyAsync launches to the multiple PP ranks, so that the
CUDA API overheads overlap.

Partly addresses #4047
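
For illustration, a minimal sketch of the new handshake. This is not
GROMACS code: std::thread stands in for the thread-MPI PME and PP
ranks, and the CUDA event record/enqueue calls are reduced to comments.

    #include <atomic>
    #include <cstdio>
    #include <thread>

    // Flag in CPU memory visible to both "ranks" (threads here)
    static std::atomic<bool> eventRecorded{ false };

    static void pmeRank()
    {
        // ... cudaMemcpyAsync of forces to the PP rank, then markEvent() on the stream ...
        std::printf("PME: event recorded\n");
        eventRecorded.store(true, std::memory_order_release); // publish; pairs with acquire below
    }

    static void ppRank()
    {
        while (!eventRecorded.load(std::memory_order_acquire)) {} // spin until PME has recorded
        // ... enqueueWaitEvent() of the remote event into the local PP stream ...
        std::printf("PP: enqueued wait on remote event\n");
        eventRecorded.store(false, std::memory_order_release); // reset for the next step
    }

    int main()
    {
        std::thread pp(ppRank);
        std::thread pme(pmeRank);
        pme.join();
        pp.join();
        return 0;
    }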

src/gromacs/ewald/pme_force_sender_gpu_impl.cu
src/gromacs/ewald/pme_force_sender_gpu_impl.h
src/gromacs/ewald/pme_only.cpp
src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
src/gromacs/ewald/pme_pp_comm_gpu_impl.h

index e037679cec188cd38457229ffd16f39bc194b272..066763f8f007bb144b241787b2786f2024a52142 100644 (file)
@@ -64,6 +64,7 @@ PmeForceSenderGpu::Impl::Impl(GpuEventSynchronizer*  pmeForcesReady,
     ppRanks_(ppRanks),
     ppCommStream_(ppRanks.size()),
     ppCommEvent_(ppRanks.size()),
+    ppCommEventRecorded_(ppRanks.size()),
     deviceContext_(deviceContext),
     pmeRemoteCpuForcePtr_(ppRanks.size()),
     pmeRemoteGpuForcePtr_(ppRanks.size())
@@ -113,6 +114,15 @@ void PmeForceSenderGpu::Impl::setForceSendBuffer(DeviceBuffer<Float3> d_f)
         MPI_Recv(&pmeRemoteGpuForcePtr_[i], sizeof(float3*), MPI_BYTE, receiver.rankId, 0, comm_, MPI_STATUS_IGNORE);
         // NOLINTNEXTLINE(bugprone-sizeof-expression)
         MPI_Recv(&pmeRemoteCpuForcePtr_[i], sizeof(float3*), MPI_BYTE, receiver.rankId, 0, comm_, MPI_STATUS_IGNORE);
+        // Send address of event and associated flag to PP rank, to allow remote enqueueing
+        // NOLINTNEXTLINE(bugprone-sizeof-expression)
+        MPI_Send(&ppCommEvent_[i], sizeof(GpuEventSynchronizer*), MPI_BYTE, receiver.rankId, 0, comm_);
+
+        std::atomic<bool>* tmpPpCommEventRecordedPtr =
+                reinterpret_cast<std::atomic<bool>*>(&(ppCommEventRecorded_[i]));
+        tmpPpCommEventRecordedPtr->store(false, std::memory_order_release);
+        // NOLINTNEXTLINE(bugprone-sizeof-expression)
+        MPI_Send(&tmpPpCommEventRecordedPtr, sizeof(std::atomic<bool>*), MPI_BYTE, receiver.rankId, 0, comm_);
         i++;
     }
 
@@ -142,8 +152,9 @@ void PmeForceSenderGpu::Impl::sendFToPpCudaDirect(int ppRank, int numAtoms, bool
                                        ppCommStream_[ppRank]->stream());
     CU_RET_ERR(stat, "cudaMemcpyAsync on Recv from PME CUDA direct data transfer failed");
     ppCommEvent_[ppRank]->markEvent(*ppCommStream_[ppRank]);
-    // NOLINTNEXTLINE(bugprone-sizeof-expression)
-    MPI_Send(&ppCommEvent_[ppRank], sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_);
+    std::atomic<bool>* tmpPpCommEventRecordedPtr =
+            reinterpret_cast<std::atomic<bool>*>(&(ppCommEventRecorded_[ppRank]));
+    tmpPpCommEventRecordedPtr->store(true, std::memory_order_release);
 #else
     GMX_UNUSED_VALUE(ppRank);
     GMX_UNUSED_VALUE(numAtoms);
index 5575f03b071e5e888ceb441bb3f6bb787987957e..6517d82bb0dde049b1d030a5e1010b4ff89c1fd0 100644 (file)
 #ifndef GMX_PMEFORCESENDERGPU_IMPL_H
 #define GMX_PMEFORCESENDERGPU_IMPL_H
 
+#include <atomic>
+#include <new>
+
 #include "gromacs/ewald/pme_force_sender_gpu.h"
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gputraits.h"
 #include "gromacs/utility/arrayref.h"
 
+// Portable definition of cache line size
+#ifdef __cpp_lib_hardware_interference_size
+using std::hardware_destructive_interference_size;
+#else
+constexpr std::size_t hardware_destructive_interference_size = 64;
+#endif
+
 class GpuEventSynchronizer;
 
 namespace gmx
@@ -55,6 +65,11 @@ namespace gmx
 
 /*! \internal \brief Class with interfaces and data for CUDA version of PME Force sending functionality*/
 
+typedef struct CacheLineAlignedFlag
+{
+    alignas(hardware_destructive_interference_size) bool flag;
+} CacheLineAlignedFlag;
+
 class PmeForceSenderGpu::Impl
 {
 
@@ -107,6 +122,8 @@ private:
     std::vector<std::unique_ptr<DeviceStream>> ppCommStream_;
     //! Events used for managing sync with remote PP ranks
     std::vector<std::unique_ptr<GpuEventSynchronizer>> ppCommEvent_;
+    //! Vector of flags to track when PP transfer events have been recorded
+    std::vector<std::atomic<CacheLineAlignedFlag>> ppCommEventRecorded_;
     //! Addresses of local force buffers to send to remote PP ranks
     std::vector<DeviceBuffer<RVec>> localForcePtr_;
     //! GPU context handle (not used in CUDA)
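
Note on the flag type above: each PP rank spins on its own entry of
ppCommEventRecorded_, so entries are padded to a cache line to avoid
false sharing between ranks. A standalone sketch of the same padding
pattern (the 64-byte fallback is an assumption matching the header's
default, not a guarantee for every platform):

    #include <cstddef>
    #include <new>

    #ifdef __cpp_lib_hardware_interference_size
    constexpr std::size_t cacheLine = std::hardware_destructive_interference_size;
    #else
    constexpr std::size_t cacheLine = 64; // assumed fallback, as in the header above
    #endif

    struct PaddedFlag
    {
        alignas(cacheLine) bool flag;
    };

    // Each array element starts on (and fills) its own cache line, so a thread
    // spinning on element i never shares a line with the writer of element j.
    static_assert(alignof(PaddedFlag) == cacheLine, "flag is cache-line aligned");
    static_assert(sizeof(PaddedFlag) == cacheLine, "flag is padded to a full cache line");

    int main() {}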
index 8e33845218145d6a0b85ff6138c275301cc3bb02..3151ec1ca719e906edeae6acc140d6a03f73b51a 100644 (file)
 #include "gromacs/utility/smalloc.h"
 
 #include "pme_gpu_internal.h"
+#include "pme_internal.h"
 #include "pme_output.h"
 #include "pme_pp_communication.h"
 
@@ -546,46 +547,54 @@ static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
     int                    messages, ind_start, ind_end;
     cve.cycles = cycles;
 
-    /* Now the evaluated forces have to be transferred to the PP nodes */
+    if (pme_pp->useGpuDirectComm)
+    {
+        GMX_ASSERT((pme_pp->pmeForceSenderGpu != nullptr),
+                   "The use of GPU direct communication for PME-PP is enabled, "
+                   "but the PME GPU force receiver object does not exist");
+    }
+
     messages = 0;
     ind_end  = 0;
-    for (const auto& receiver : pme_pp->ppRanks)
+
+    /* Now the evaluated forces have to be transferred to the PP ranks */
+    if (pme_pp->useGpuDirectComm && GMX_THREAD_MPI)
     {
-        ind_start = ind_end;
-        ind_end   = ind_start + receiver.numAtoms;
-        if (pme_pp->useGpuDirectComm)
+        int numPpRanks = static_cast<int>(pme_pp->ppRanks.size());
+#    pragma omp parallel for num_threads(std::min(numPpRanks, pme.nthread)) schedule(static)
+        for (int i = 0; i < numPpRanks; i++)
         {
-            GMX_ASSERT((pme_pp->pmeForceSenderGpu != nullptr),
-                       "The use of GPU direct communication for PME-PP is enabled, "
-                       "but the PME GPU force reciever object does not exist");
-
-            if (GMX_THREAD_MPI)
-            {
-                pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(
-                        receiver.rankId, receiver.numAtoms, pme_pp->sendForcesDirectToPpGpu);
-            }
-            else
+            auto& receiver = pme_pp->ppRanks[i];
+            pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(
+                    receiver.rankId, receiver.numAtoms, pme_pp->sendForcesDirectToPpGpu);
+        }
+    }
+    else
+    {
+        for (const auto& receiver : pme_pp->ppRanks)
+        {
+            ind_start = ind_end;
+            ind_end   = ind_start + receiver.numAtoms;
+            if (pme_pp->useGpuDirectComm)
             {
                 pme_pp->pmeForceSenderGpu->sendFToPpCudaMpi(pme_gpu_get_device_f(&pme),
                                                             ind_start,
                                                             receiver.numAtoms * sizeof(rvec),
                                                             receiver.rankId,
                                                             &pme_pp->req[messages]);
-
-                messages++;
             }
-        }
-        else
-        {
-            void* sendbuf = const_cast<void*>(static_cast<const void*>(output.forces_[ind_start]));
-            // Send using MPI
-            MPI_Isend(sendbuf,
-                      receiver.numAtoms * sizeof(rvec),
-                      MPI_BYTE,
-                      receiver.rankId,
-                      0,
-                      pme_pp->mpi_comm_mysim,
-                      &pme_pp->req[messages]);
+            else
+            {
+                void* sendbuf = const_cast<void*>(static_cast<const void*>(output.forces_[ind_start]));
+                // Send using MPI
+                MPI_Isend(sendbuf,
+                          receiver.numAtoms * sizeof(rvec),
+                          MPI_BYTE,
+                          receiver.rankId,
+                          0,
+                          pme_pp->mpi_comm_mysim,
+                          &pme_pp->req[messages]);
+            }
             messages++;
         }
     }
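
With no blocking MPI_Send left in sendFToPpCudaDirect(), the per-rank
launches in the thread-MPI branch above are independent, which is what
makes the OpenMP parallel for legal; the CUDA API cost of each
cudaMemcpyAsync/event record is then paid on a different thread. A
shape-only sketch of that launch pattern (launchAsyncCopyToRank() is a
placeholder, not a GROMACS function):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Placeholder for the per-rank work of sendFToPpCudaDirect(): enqueue the
    // cudaMemcpyAsync on that rank's stream, record the event, set the flag.
    static void launchAsyncCopyToRank(int rank)
    {
        std::printf("launching async copy for PP rank %d\n", rank);
    }

    int main()
    {
        const std::vector<int> ppRanks       = { 0, 1, 2, 3 }; // stand-in for pme_pp->ppRanks
        const int              numPpRanks    = static_cast<int>(ppRanks.size());
        const int              pmeNumThreads = 8; // stand-in for pme.nthread

        // Each iteration only launches asynchronous work, so iterations are independent
    #pragma omp parallel for num_threads(std::min(numPpRanks, pmeNumThreads)) schedule(static)
        for (int i = 0; i < numPpRanks; i++)
        {
            launchAsyncCopyToRank(ppRanks[i]);
        }
        return 0;
    }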
index 1ec7104ef1db05a2f2881819b2f1947fd87c413c..8c29a1cf1bd2d8542fe482c8239ffad9ba644fdc 100644 (file)
@@ -91,24 +91,27 @@ void PmePpCommGpu::Impl::reinit(int size)
         MPI_Send(&d_pmeForces_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_);
         RVec* pmeCpuForceBufferData = pmeCpuForceBuffer_->data();
         MPI_Send(&pmeCpuForceBufferData, sizeof(RVec*), MPI_BYTE, pmeRank_, 0, comm_);
+        // Receive address of event and associated flag from PME rank, to allow sync to local stream after force transfer
+        MPI_Recv(&remotePmeForceSendEvent_, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
+        MPI_Recv(&remotePmeForceSendEventRecorded_, sizeof(std::atomic<bool>*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
     }
 
 #endif
 }
 
-// TODO make this asynchronous by splitting into this into
-// launchRecvForceFromPmeCudaDirect() and sycnRecvForceFromPmeCudaDirect()
 void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(bool receivePmeForceToGpu)
 {
 #if GMX_MPI
-    // Remote PME task pushes GPU data directly data to this PP task.
+    // Wait until remote PME task has pushed data, and then enqueue remote event to local stream.
 
-    // Recieve event from PME task after PME->PP force data push has
-    // been scheduled and enqueue this to PP stream.
-    GpuEventSynchronizer* eventptr;
-    // NOLINTNEXTLINE(bugprone-sizeof-expression)
-    MPI_Recv(&eventptr, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
-    eventptr->enqueueWaitEvent(pmePpCommStream_);
+    // Spin until PME rank sets flag
+    while (!(remotePmeForceSendEventRecorded_->load(std::memory_order_acquire))) {};
+
+    // Enqueue remote event
+    remotePmeForceSendEvent_->enqueueWaitEvent(pmePpCommStream_);
+
+    // Reset the flag
+    remotePmeForceSendEventRecorded_->store(false, std::memory_order_release);
 
     if (receivePmeForceToGpu)
     {
index 13b72cc66ae49582a4703740d7c923c44dab1df6..f4ec9214e8696398d93ca6763d484f92a3d9c29c 100644 (file)
@@ -43,6 +43,8 @@
 #ifndef GMX_PME_PP_COMM_GPU_IMPL_H
 #define GMX_PME_PP_COMM_GPU_IMPL_H
 
+#include <atomic>
+
 #include "gromacs/ewald/pme_pp_comm_gpu.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.h"
 #include "gromacs/math/vectypes.h"
@@ -176,6 +178,10 @@ private:
     GpuEventSynchronizer forcesReadySynchronizer_;
     //! Event recorded when coordinates have been transferred to PME task
     GpuEventSynchronizer pmeCoordinatesSynchronizer_;
+    //! Event recorded by remote PME task when forces have been transferred
+    GpuEventSynchronizer* remotePmeForceSendEvent_;
+    //! Flag to track when remote PP event has been recorded, ready for enqueueing
+    volatile std::atomic<bool>* remotePmeForceSendEventRecorded_;
 };
 
 } // namespace gmx