Remove thread-MPI limitation for GPU direct PME-PP communication
author     Gaurav Garg <gaugarg@nvidia.com>
           Wed, 14 Apr 2021 05:50:07 +0000 (11:20 +0530)
committer  Mark Abraham <mark.j.abraham@gmail.com>
           Fri, 16 Apr 2021 13:10:08 +0000 (13:10 +0000)
Allows direct GPU communication for PP-PME data exchange when running
with "real" (library) MPI, including across multiple compute nodes,
through new CUDA-aware MPI code paths.

Implements part of #2891
Refs: #2915 #3960
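
As a rough illustration of the mechanism this change relies on, here is a
hedged sketch (plain CUDA + MPI, not code from this patch; the function name
and data layout are invented): with a CUDA-aware MPI library, device pointers
can be passed directly to MPI calls, so PP->PME coordinate and PME->PP force
transfers need no intermediate host buffer.

    #include <mpi.h>
    #include <cuda_runtime.h> // for float3

    void sendDeviceCoordinates(float3* d_x, int numAtoms, int pmeRank, MPI_Comm comm)
    {
        // A CUDA-aware MPI library recognises that d_x points to GPU memory and
        // performs the transfer (e.g. via GPUDirect RDMA) without staging the
        // data through CPU memory.
        MPI_Send(d_x, numAtoms * 3, MPI_FLOAT, pmeRank, /*tag*/ 0, comm);
    }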

18 files changed:
docs/install-guide/index.rst
src/gromacs/ewald/pme_coordinate_receiver_gpu.h
src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp
src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu
src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h
src/gromacs/ewald/pme_force_sender_gpu.h
src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
src/gromacs/ewald/pme_force_sender_gpu_impl.cu
src/gromacs/ewald/pme_force_sender_gpu_impl.h
src/gromacs/ewald/pme_only.cpp
src/gromacs/ewald/pme_only.h
src/gromacs/ewald/pme_pp.cpp
src/gromacs/ewald/pme_pp_comm_gpu.h
src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
src/gromacs/ewald/pme_pp_comm_gpu_impl.h
src/gromacs/ewald/pme_pp_communication.h
src/gromacs/mdrun/runner.cpp

index c9666bf24665d1384cf6ac5a23d6435120afc2b3..27859f51f7d955ce66740c64a07ec2a7db4acd2d 100644 (file)
@@ -223,6 +223,32 @@ and add ``-DGMX_MPI=on`` to the cmake options. It is possible to set
 the compiler to the MPI compiler wrapper but it is neither necessary
 nor recommended.
 
+CUDA-Aware MPI support
+~~~~~~~~~~~~~~~~~~~~~~
+
+In simulations using multiple NVIDIA GPUs, an MPI implementation with CUDA support
+(also called "CUDA-aware") allows communication to be performed directly between the
+distinct GPU memory spaces without staging through CPU memory, often
+resulting in higher bandwidth and lower latency communication.  For
+more details, see `Introduction to CUDA-aware MPI
+<https://developer.nvidia.com/blog/introduction-cuda-aware-mpi/>`_.
+
+To use CUDA-aware MPI for direct GPU communication, we recommend using a
+recent OpenMPI version (>=4.1.0) with a recent UCX version (>=1.10), since
+most GROMACS internal testing of CUDA-aware support has been performed
+using these versions. OpenMPI with CUDA-aware support can be built
+following the procedure in `these OpenMPI build instructions
+<https://www.open-mpi.org/faq/?category=buildcuda>`_.
+
+With ``GMX_MPI=ON``, GROMACS attempts to automatically detect CUDA support
+in the underlying MPI library at compile time, and enables direct GPU
+communication when this is detected. However, there are cases in which
+GROMACS may fail to detect existing CUDA-aware support, and it can then
+be enabled manually by setting the environment variable
+``GMX_FORCE_CUDA_AWARE_MPI=1`` at runtime. Such cases still lack substantial
+testing, so we urge users to carefully check the correctness of results
+against those obtained with default build options, and to report any issues.
+
 CMake
 ^^^^^
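
The documentation added above mentions that GROMACS tries to detect CUDA-aware
support in the MPI library automatically. For reference, here is a hedged
sketch of the kind of check such detection (see checkMpiCudaAwareSupport() in
the runner.cpp hunk below) can build on; MPIX_Query_cuda_support() is an
Open MPI extension and may be absent in other MPI implementations, which is
one reason the GMX_FORCE_CUDA_AWARE_MPI=1 override exists. The helper name
below is invented for the example.

    #include <mpi.h>
    #if defined(OPEN_MPI) && OPEN_MPI
    #    include <mpi-ext.h> // provides MPIX_CUDA_AWARE_SUPPORT / MPIX_Query_cuda_support()
    #endif

    static bool mpiLooksCudaAware()
    {
    #if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
        // The library was built with CUDA support and reports it enabled at runtime.
        return MPIX_Query_cuda_support() == 1;
    #else
        // Unknown or unsupported; direct GPU communication stays disabled by default.
        return false;
    #endif
    }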
 
index 94aefe8501a1d15d082a04dabf33c023e7767d90..81f640df409a4110f8c71796e598114e06a2dac4 100644 (file)
@@ -83,9 +83,19 @@ public:
     void receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank);
 
     /*! \brief
-     * enqueue wait for coordinate data from PP ranks
+     * Receives coordinates from PP ranks (used with lib-MPI)
+     * \param[in] recvbuf   coordinates buffer in GPU memory
+     * \param[in] numAtoms  starting element in buffer
+     * \param[in] numBytes  number of bytes to transfer
+     * \param[in] ppRank    PP rank from which the data is received
      */
-    void enqueueWaitReceiveCoordinatesFromPpCudaDirect();
+    void launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf, int numAtoms, int numBytes, int ppRank);
+
+    /*! \brief
+     * For lib-MPI, wait for coordinates from PP ranks;
+     * for thread-MPI, enqueue the PP coordinate-transfer events into the PME stream
+     */
+    void synchronizeOnCoordinatesFromPpRanks();
 
 private:
     class Impl;
index ae68eedca2e7ad8aab63a47ab494284cf9a2c196..4e997d319bd42cd32e9d32879178a99f3ca4a5fb 100644 (file)
@@ -89,7 +89,17 @@ void PmeCoordinateReceiverGpu::receiveCoordinatesSynchronizerFromPpCudaDirect(in
                "implementation.");
 }
 
-void PmeCoordinateReceiverGpu::enqueueWaitReceiveCoordinatesFromPpCudaDirect()
+void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> /* recvbuf */,
+                                                                     int /* numAtoms */,
+                                                                     int /* numBytes */,
+                                                                     int /* ppRank */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub for PME-PP GPU communication was called instead of the correct "
+               "implementation.");
+}
+
+void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRanks()
 {
     GMX_ASSERT(!impl_,
                "A CPU stub for PME-PP GPU communication was called instead of the correct "
index 10f48f1d979085e75572f4bf91ed8ed228acb971..7fa2122dfb8a1cc5a65f03084e9356de114fc7a9 100644 (file)
@@ -43,6 +43,7 @@
  */
 #include "gmxpre.h"
 
+#include "gromacs/ewald/pme_pp_communication.h"
 #include "pme_coordinate_receiver_gpu_impl.h"
 
 #include "config.h"
@@ -62,9 +63,6 @@ PmeCoordinateReceiverGpu::Impl::Impl(const DeviceStream&    pmeStream,
     comm_(comm),
     ppRanks_(ppRanks)
 {
-    GMX_RELEASE_ASSERT(
-            GMX_THREAD_MPI,
-            "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
     request_.resize(ppRanks.size());
     ppSync_.resize(ppRanks.size());
 }
@@ -73,28 +71,34 @@ PmeCoordinateReceiverGpu::Impl::~Impl() = default;
 
 void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x)
 {
-
-    int ind_start = 0;
-    int ind_end   = 0;
-    for (const auto& receiver : ppRanks_)
+    // Need to send the buffer address to PP ranks only for thread-MPI, where
+    // the PP rank pushes the data using cudaMemcpy
+    if (GMX_THREAD_MPI)
     {
-        ind_start = ind_end;
-        ind_end   = ind_start + receiver.numAtoms;
-
-        // Data will be transferred directly from GPU.
-        void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start]);
+        int ind_start = 0;
+        int ind_end   = 0;
+        for (const auto& receiver : ppRanks_)
+        {
+            ind_start = ind_end;
+            ind_end   = ind_start + receiver.numAtoms;
 
+            // Data will be transferred directly from GPU.
+            void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start]);
 #if GMX_MPI
-        MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_);
+            MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_);
 #else
-        GMX_UNUSED_VALUE(sendBuf);
+            GMX_UNUSED_VALUE(sendBuf);
 #endif
+        }
     }
 }
 
 /*! \brief Receive coordinate synchronizer pointer from the PP ranks. */
 void PmeCoordinateReceiverGpu::Impl::receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank)
 {
+    GMX_ASSERT(GMX_THREAD_MPI,
+               "receiveCoordinatesSynchronizerFromPpCudaDirect is expected to be called only for "
+               "Thread-MPI");
+
     // Data will be pushed directly from PP task
 
 #if GMX_MPI
@@ -106,18 +110,44 @@ void PmeCoordinateReceiverGpu::Impl::receiveCoordinatesSynchronizerFromPpCudaDir
 #endif
 }
 
-void PmeCoordinateReceiverGpu::Impl::enqueueWaitReceiveCoordinatesFromPpCudaDirect()
+/*! \brief Receive coordinate data using CUDA-aware MPI */
+void PmeCoordinateReceiverGpu::Impl::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf,
+                                                                           int numAtoms,
+                                                                           int numBytes,
+                                                                           int ppRank)
+{
+    GMX_ASSERT(GMX_LIB_MPI,
+               "launchReceiveCoordinatesFromPpCudaMpi is expected to be called only for Lib-MPI");
+
+#if GMX_MPI
+    MPI_Irecv(&recvbuf[numAtoms], numBytes, MPI_BYTE, ppRank, eCommType_COORD_GPU, comm_, &request_[recvCount_++]);
+#else
+    GMX_UNUSED_VALUE(recvbuf);
+    GMX_UNUSED_VALUE(numAtoms);
+    GMX_UNUSED_VALUE(numBytes);
+    GMX_UNUSED_VALUE(ppRank);
+#endif
+}
+
+void PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromPpRanks()
 {
     if (recvCount_ > 0)
     {
-        // ensure PME calculation doesn't commence until coordinate data has been transferred
+        // ensure PME calculation doesn't commence until coordinate data/remote events
+        // have been transferred
 #if GMX_MPI
         MPI_Waitall(recvCount_, request_.data(), MPI_STATUS_IGNORE);
 #endif
-        for (int i = 0; i < recvCount_; i++)
+
+        // Make PME stream wait on PP-to-PME data transfer events
+        if (GMX_THREAD_MPI)
         {
-            ppSync_[i]->enqueueWaitEvent(pmeStream_);
+            for (int i = 0; i < recvCount_; i++)
+            {
+                ppSync_[i]->enqueueWaitEvent(pmeStream_);
+            }
         }
+
         // reset receive counter
         recvCount_ = 0;
     }
@@ -142,9 +172,17 @@ void PmeCoordinateReceiverGpu::receiveCoordinatesSynchronizerFromPpCudaDirect(in
     impl_->receiveCoordinatesSynchronizerFromPpCudaDirect(ppRank);
 }
 
-void PmeCoordinateReceiverGpu::enqueueWaitReceiveCoordinatesFromPpCudaDirect()
+void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf,
+                                                                     int                numAtoms,
+                                                                     int                numBytes,
+                                                                     int                ppRank)
+{
+    impl_->launchReceiveCoordinatesFromPpCudaMpi(recvbuf, numAtoms, numBytes, ppRank);
+}
+
+void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRanks()
 {
-    impl_->enqueueWaitReceiveCoordinatesFromPpCudaDirect();
+    impl_->synchronizeOnCoordinatesFromPpRanks();
 }
 
 } // namespace gmx
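
To make the new lib-MPI receive path above concrete, here is a self-contained
sketch (plain CUDA + MPI, not GROMACS code; all names are illustrative) of the
pattern that launchReceiveCoordinatesFromPpCudaMpi() and
synchronizeOnCoordinatesFromPpRanks() implement: one non-blocking receive per
PP rank into that rank's slice of a single device coordinate buffer, followed
by a single wait before PME work starts.

    #include <mpi.h>
    #include <vector>
    #include <cuda_runtime.h> // for float3

    void receiveAllCoordinates(float3*                 d_x,      // device coordinate buffer
                               const std::vector<int>& ppRanks,  // sending PP ranks
                               const std::vector<int>& numAtoms, // atoms per PP rank
                               MPI_Comm                comm)
    {
        std::vector<MPI_Request> requests(ppRanks.size());
        int                      offset = 0;
        for (int i = 0; i < static_cast<int>(ppRanks.size()); ++i)
        {
            // CUDA-aware MPI accepts the device pointer directly
            MPI_Irecv(d_x + offset, numAtoms[i] * 3, MPI_FLOAT, ppRanks[i], 0, comm, &requests[i]);
            offset += numAtoms[i];
        }
        // Block until all coordinate messages have arrived in GPU memory
        MPI_Waitall(static_cast<int>(requests.size()), requests.data(), MPI_STATUSES_IGNORE);
    }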
index 0f4ca21fa35aed98fcb4b3f6049ae79df0425e95..604079c0b0fbcead4bebf03ae2a99eea132c5962 100644 (file)
@@ -79,9 +79,19 @@ public:
     void receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank);
 
     /*! \brief
-     * enqueue wait for coordinate data from PP ranks
+     * Receives coordinates from PP ranks (used with lib-MPI)
+     * \param[in] recvbuf   coordinates buffer in GPU memory
+     * \param[in] numAtoms  starting element in buffer
+     * \param[in] numBytes  number of bytes to transfer
+     * \param[in] ppRank    PP rank from which the data is received
      */
-    void enqueueWaitReceiveCoordinatesFromPpCudaDirect();
+    void launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf, int numAtoms, int numBytes, int ppRank);
+
+    /*! \brief
+     * For lib-MPI, wait for coordinates from PP ranks;
+     * for thread-MPI, enqueue the PP coordinate-transfer events into the PME stream
+     */
+    void synchronizeOnCoordinatesFromPpRanks();
 
 private:
     //! CUDA stream for PME operations
index edced615046b956b804c0b776316a20b84013d60..e06f582ae8c948ab8f22bccae677ddbf5f40d923 100644 (file)
@@ -87,11 +87,21 @@ public:
     void sendForceBufferAddressToPpRanks(DeviceBuffer<RVec> d_f);
 
     /*! \brief
-     * Send force synchronizer to PP rank
+     * Send force synchronizer to PP rank (used with Thread-MPI)
      * \param[in] ppRank           PP rank to receive data
      */
     void sendFSynchronizerToPpCudaDirect(int ppRank);
 
+    /*! \brief
+     * Send force to PP rank (used with Lib-MPI)
+     * \param[in] sendbuf  force buffer in GPU memory
+     * \param[in] offset   starting element in buffer
+     * \param[in] numBytes number of bytes to transfer
+     * \param[in] ppRank   PP rank to receive data
+     * \param[in] request  MPI request to track asynchronous MPI call status
+     */
+    void sendFToPpCudaMpi(DeviceBuffer<RVec> sendbuf, int offset, int numBytes, int ppRank, MPI_Request* request);
+
 private:
     class Impl;
     std::unique_ptr<Impl> impl_;
index 0365a5563c74d777b24d129fa5329d49fc9c9564..f7cd9c5cc47c1e6e1e9f964d34e07723b1f2bab3 100644 (file)
@@ -90,6 +90,17 @@ void PmeForceSenderGpu::sendFSynchronizerToPpCudaDirect(int /* ppRank */)
                "implementation.");
 }
 
+void PmeForceSenderGpu::sendFToPpCudaMpi(DeviceBuffer<RVec> /* sendbuf */,
+                                         int /* offset */,
+                                         int /* numBytes */,
+                                         int /* ppRank */,
+                                         MPI_Request* /* request */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub for PME-PP GPU communication was called instead of the correct "
+               "implementation.");
+}
+
 } // namespace gmx
 
 #endif // !GMX_GPU_CUDA
index 753ac483f595fa2c542ef55f756e5a59c5265101..b124c03136e19cf66ab0f2ca729566ead93546a6 100644 (file)
@@ -62,9 +62,6 @@ PmeForceSenderGpu::Impl::Impl(GpuEventSynchronizer*  pmeForcesReady,
     comm_(comm),
     ppRanks_(ppRanks)
 {
-    GMX_RELEASE_ASSERT(
-            GMX_THREAD_MPI,
-            "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
 }
 
 PmeForceSenderGpu::Impl::~Impl() = default;
@@ -72,6 +69,13 @@ PmeForceSenderGpu::Impl::~Impl() = default;
 /*! \brief  sends force buffer address to PP ranks */
 void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(DeviceBuffer<Float3> d_f)
 {
+    // Need to send the buffer address to PP ranks only for thread-MPI, where
+    // the PP rank pulls the data using cudaMemcpy
+    if (!GMX_THREAD_MPI)
+    {
+        return;
+    }
+#if GMX_MPI
     int ind_start = 0;
     int ind_end   = 0;
     for (const auto& receiver : ppRanks_)
@@ -80,30 +84,56 @@ void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(DeviceBuffer<Float
         ind_end   = ind_start + receiver.numAtoms;
 
         // Data will be transferred directly from GPU.
-        void* sendBuf = reinterpret_cast<void*>(&d_f[ind_start]);
+        Float3* sendBuf = &d_f[ind_start];
 
-#if GMX_MPI
-        MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_);
+        MPI_Send(&sendBuf, sizeof(Float3*), MPI_BYTE, receiver.rankId, 0, comm_);
+    }
 #else
-        GMX_UNUSED_VALUE(sendBuf);
+    GMX_UNUSED_VALUE(d_f);
 #endif
-    }
 }
 
 /*! \brief Send PME synchronizer directly using CUDA memory copy */
 void PmeForceSenderGpu::Impl::sendFSynchronizerToPpCudaDirect(int ppRank)
 {
+    GMX_ASSERT(GMX_THREAD_MPI,
+               "sendFSynchronizerToPpCudaDirect is expected to be called only for Thread-MPI");
+
     // Data will be pulled directly from PP task
 #if GMX_MPI
     // TODO Using MPI_Isend would be more efficient, particularly when
     // sending to multiple PP ranks
     MPI_Send(&pmeForcesReady_, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_);
 #else
-    GMX_UNUSED_VALUE(pmeSyncPtr);
     GMX_UNUSED_VALUE(ppRank);
 #endif
 }
 
+/*! \brief Send PME data directly using CUDA-aware MPI */
+void PmeForceSenderGpu::Impl::sendFToPpCudaMpi(DeviceBuffer<RVec> sendbuf,
+                                               int                offset,
+                                               int                numBytes,
+                                               int                ppRank,
+                                               MPI_Request*       request)
+{
+    GMX_ASSERT(GMX_LIB_MPI, "sendFToPpCudaMpi is expected to be called only for Lib-MPI");
+
+#if GMX_MPI
+    // if using GPU direct comm with CUDA-aware MPI, make sure forces are ready on the
+    // device before sending them to PP ranks
+    pmeForcesReady_->waitForEvent();
+
+    MPI_Isend(sendbuf[offset], numBytes, MPI_BYTE, ppRank, 0, comm_, request);
+
+#else
+    GMX_UNUSED_VALUE(sendbuf);
+    GMX_UNUSED_VALUE(offset);
+    GMX_UNUSED_VALUE(numBytes);
+    GMX_UNUSED_VALUE(ppRank);
+    GMX_UNUSED_VALUE(request);
+#endif
+}
+
 PmeForceSenderGpu::PmeForceSenderGpu(GpuEventSynchronizer*  pmeForcesReady,
                                      MPI_Comm               comm,
                                      gmx::ArrayRef<PpRanks> ppRanks) :
@@ -123,5 +153,14 @@ void PmeForceSenderGpu::sendFSynchronizerToPpCudaDirect(int ppRank)
     impl_->sendFSynchronizerToPpCudaDirect(ppRank);
 }
 
+void PmeForceSenderGpu::sendFToPpCudaMpi(DeviceBuffer<RVec> sendbuf,
+                                         int                offset,
+                                         int                numBytes,
+                                         int                ppRank,
+                                         MPI_Request*       request)
+{
+    impl_->sendFToPpCudaMpi(sendbuf, offset, numBytes, ppRank, request);
+}
+
 
 } // namespace gmx
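
The lib-MPI force path above has one ordering requirement worth noting: the
forces must already be resident in GPU memory before the device pointer is
handed to MPI, which is why sendFToPpCudaMpi() waits on the pmeForcesReady
event before MPI_Isend. A self-contained sketch of that pattern (plain
CUDA + MPI; cudaEventSynchronize stands in for GpuEventSynchronizer's
waitForEvent(), and all names are invented for the example):

    #include <mpi.h>
    #include <cuda_runtime.h> // for float3, cudaEventSynchronize

    void sendForcesToPpRank(float3* d_forces, int numAtoms, cudaEvent_t forcesReady,
                            int ppRank, MPI_Comm comm, MPI_Request* request)
    {
        // Block the host until the kernels/copies that produce the forces,
        // recorded in forcesReady, have completed; MPI knows nothing about the
        // CUDA stream that fills the buffer.
        cudaEventSynchronize(forcesReady);

        // Asynchronous send straight from GPU memory; the caller completes the
        // request (e.g. with MPI_Waitall) before reusing the buffer.
        MPI_Isend(d_forces, numAtoms * 3, MPI_FLOAT, ppRank, 0, comm, request);
    }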
index 9ff0a15a9ad0c92ae47907c20e7579c0fcb9ea0e..0e0ad8122cbdccf5ce6bcd3bbd1d73570f1ae700 100644 (file)
@@ -74,11 +74,21 @@ public:
     void sendForceBufferAddressToPpRanks(DeviceBuffer<Float3> d_f);
 
     /*! \brief
-     * Send force synchronizer to PP rank
+     * Send force synchronizer to PP rank (used with Thread-MPI)
      * \param[in] ppRank           PP rank to receive data
      */
     void sendFSynchronizerToPpCudaDirect(int ppRank);
 
+    /*! \brief
+     * Send force to PP rank (used with Lib-MPI)
+     * \param[in] sendbuf  force buffer in GPU memory
+     * \param[in] offset   starting element in buffer
+     * \param[in] numBytes number of bytes to transfer
+     * \param[in] ppRank   PP rank to receive data
+     * \param[in] request  MPI request to track asynchronous MPI call status
+     */
+    void sendFToPpCudaMpi(DeviceBuffer<RVec> sendbuf, int offset, int numBytes, int ppRank, MPI_Request* request);
+
 private:
     //! Event indicating when PME forces are ready on the GPU in order for PP stream to sync with the PME stream
     GpuEventSynchronizer* pmeForcesReady_;
index 5130034a9fb3c06b2fc1a1ba565c91ac2e743674..64f685ab44a61090250d34e98340f0fed5428669 100644 (file)
 #include "pme_output.h"
 #include "pme_pp_communication.h"
 
-/*! \brief environment variable to enable GPU P2P communication */
-static const bool c_enableGpuPmePpComms =
-        GMX_GPU_CUDA && GMX_THREAD_MPI && (getenv("GMX_GPU_PME_PP_COMMS") != nullptr);
-
 /*! \brief Master PP-PME communication data structure */
 struct gmx_pme_pp
 {
@@ -466,8 +462,16 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
                 {
                     if (pme_pp->useGpuDirectComm)
                     {
-                        pme_pp->pmeCoordinateReceiverGpu->receiveCoordinatesSynchronizerFromPpCudaDirect(
-                                sender.rankId);
+                        if (GMX_THREAD_MPI)
+                        {
+                            pme_pp->pmeCoordinateReceiverGpu->receiveCoordinatesSynchronizerFromPpCudaDirect(
+                                    sender.rankId);
+                        }
+                        else
+                        {
+                            pme_pp->pmeCoordinateReceiverGpu->launchReceiveCoordinatesFromPpCudaMpi(
+                                    stateGpu->getCoordinates(), nat, sender.numAtoms * sizeof(rvec), sender.rankId);
+                        }
                     }
                     else
                     {
@@ -493,7 +497,7 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
 
             if (pme_pp->useGpuDirectComm)
             {
-                pme_pp->pmeCoordinateReceiverGpu->enqueueWaitReceiveCoordinatesFromPpCudaDirect();
+                pme_pp->pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromPpRanks();
             }
 
             status = pmerecvqxX;
@@ -531,7 +535,8 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
 }
 
 /*! \brief Send the PME mesh force, virial and energy to the PP-only ranks. */
-static void gmx_pme_send_force_vir_ener(gmx_pme_pp*      pme_pp,
+static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
+                                        gmx_pme_pp*      pme_pp,
                                         const PmeOutput& output,
                                         real             dvdlambda_q,
                                         real             dvdlambda_lj,
@@ -547,18 +552,32 @@ static void gmx_pme_send_force_vir_ener(gmx_pme_pp*      pme_pp,
     ind_end  = 0;
     for (const auto& receiver : pme_pp->ppRanks)
     {
-        ind_start     = ind_end;
-        ind_end       = ind_start + receiver.numAtoms;
-        void* sendbuf = const_cast<void*>(static_cast<const void*>(output.forces_[ind_start]));
+        ind_start = ind_end;
+        ind_end   = ind_start + receiver.numAtoms;
         if (pme_pp->useGpuDirectComm)
         {
             GMX_ASSERT((pme_pp->pmeForceSenderGpu != nullptr),
                        "The use of GPU direct communication for PME-PP is enabled, "
                        "but the PME GPU force reciever object does not exist");
-            pme_pp->pmeForceSenderGpu->sendFSynchronizerToPpCudaDirect(receiver.rankId);
+
+            if (GMX_THREAD_MPI)
+            {
+                pme_pp->pmeForceSenderGpu->sendFSynchronizerToPpCudaDirect(receiver.rankId);
+            }
+            else
+            {
+                pme_pp->pmeForceSenderGpu->sendFToPpCudaMpi(pme_gpu_get_device_f(&pme),
+                                                            ind_start,
+                                                            receiver.numAtoms * sizeof(rvec),
+                                                            receiver.rankId,
+                                                            &pme_pp->req[messages]);
+
+                messages++;
+            }
         }
         else
         {
+            void* sendbuf = const_cast<void*>(static_cast<const void*>(output.forces_[ind_start]));
             // Send using MPI
             MPI_Isend(sendbuf,
                       receiver.numAtoms * sizeof(rvec),
@@ -593,6 +612,7 @@ static void gmx_pme_send_force_vir_ener(gmx_pme_pp*      pme_pp,
     MPI_Waitall(messages, pme_pp->req.data(), pme_pp->stat.data());
 #else
     GMX_RELEASE_ASSERT(false, "Invalid call to gmx_pme_send_force_vir_ener");
+    GMX_UNUSED_VALUE(pme);
     GMX_UNUSED_VALUE(pme_pp);
     GMX_UNUSED_VALUE(output);
     GMX_UNUSED_VALUE(dvdlambda_q);
@@ -608,6 +628,7 @@ int gmx_pmeonly(struct gmx_pme_t*               pme,
                 gmx_walltime_accounting_t       walltime_accounting,
                 t_inputrec*                     ir,
                 PmeRunMode                      runMode,
+                bool                            useGpuPmePpCommunication,
                 const gmx::DeviceStreamManager* deviceStreamManager)
 {
     int     ret;
@@ -640,7 +661,7 @@ int gmx_pmeonly(struct gmx_pme_t*               pme,
                            "Device stream can not be nullptr when using GPU in PME-only rank");
         changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy());
         changePinningPolicy(&pme_pp->x, pme_get_pinning_policy());
-        if (c_enableGpuPmePpComms)
+        if (useGpuPmePpCommunication)
         {
             pme_pp->pmeCoordinateReceiverGpu = std::make_unique<gmx::PmeCoordinateReceiverGpu>(
                     deviceStreamManager->stream(gmx::DeviceStreamType::Pme),
@@ -780,7 +801,7 @@ int gmx_pmeonly(struct gmx_pme_t*               pme,
         }
 
         cycles = wallcycle_stop(wcycle, WallCycleCounter::PmeMesh);
-        gmx_pme_send_force_vir_ener(pme_pp.get(), output, dvdlambda_q, dvdlambda_lj, cycles);
+        gmx_pme_send_force_vir_ener(*pme, pme_pp.get(), output, dvdlambda_q, dvdlambda_lj, cycles);
 
         count++;
     } /***** end of quasi-loop, we stop with the break above */
index 1a71ea195c1b74ad44d168bd20753cfb0706e4d4..150a5dad550439b534ce36b8f4c69334f8dda7d6 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2020, by the GROMACS development team, led by
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -69,6 +69,7 @@ int gmx_pmeonly(gmx_pme_t*                      pme,
                 gmx_walltime_accounting_t       walltime_accounting,
                 t_inputrec*                     ir,
                 PmeRunMode                      runMode,
+                bool                            useGpuPmePpCommunication,
                 const gmx::DeviceStreamManager* deviceStreamManager);
 
 #endif
index 63693ed6a631eefa32d71151cc46d928b5cb89ec..11ca5faf720ce072b7c85911a94d9204bf20cd72 100644 (file)
@@ -265,11 +265,16 @@ static void gmx_pme_send_coeffs_coords(t_forcerec*         fr,
             real* xRealPtr = const_cast<real*>(x[0]);
             if (useGpuPmePpComms && (fr != nullptr))
             {
-                void* sendPtr = sendCoordinatesFromGpu
-                                        ? static_cast<void*>(fr->stateGpu->getCoordinates())
-                                        : static_cast<void*>(xRealPtr);
-                fr->pmePpCommGpu->sendCoordinatesToPmeCudaDirect(
-                        sendPtr, n, sendCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
+                if (sendCoordinatesFromGpu)
+                {
+                    fr->pmePpCommGpu->sendCoordinatesToPmeFromGpu(
+                            fr->stateGpu->getCoordinates(), n, coordinatesReadyOnDeviceEvent);
+                }
+                else
+                {
+                    fr->pmePpCommGpu->sendCoordinatesToPmeFromCpu(
+                            reinterpret_cast<gmx::RVec*>(xRealPtr), n, coordinatesReadyOnDeviceEvent);
+                }
             }
             else
             {
@@ -509,8 +514,8 @@ static void recvFFromPme(gmx::PmePpCommGpu* pmePpCommGpu,
     if (useGpuPmePpComms)
     {
         GMX_ASSERT(pmePpCommGpu != nullptr, "Need valid pmePpCommGpu");
-        // Receive directly using CUDA memory copy
-        pmePpCommGpu->receiveForceFromPmeCudaDirect(recvptr, n, receivePmeForceToGpu);
+        // Receive forces from PME rank
+        pmePpCommGpu->receiveForceFromPme(static_cast<gmx::RVec*>(recvptr), n, receivePmeForceToGpu);
     }
     else
     {
index 886e0c221b8f8a456b99ef4e40fcea45070da3e9..1e4e614cd99920f7a50af358ed7c963dff8ee96b 100644 (file)
@@ -45,6 +45,7 @@
 #include <memory>
 
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/math/vectypes.h"
 #include "gromacs/utility/gmxmpi.h"
 
 class DeviceContext;
@@ -84,18 +85,25 @@ public:
      * \param[in]  recvSize Number of elements to receive
      * \param[in] recvPmeForceToGpu Whether receive is to GPU, otherwise CPU
      */
-    void receiveForceFromPmeCudaDirect(void* recvPtr, int recvSize, bool recvPmeForceToGpu);
+    void receiveForceFromPme(RVec* recvPtr, int recvSize, bool recvPmeForceToGpu);
 
     /*! \brief Push coordinates buffer directly to GPU memory on PME task
      * \param[in] sendPtr Buffer with coordinate data
      * \param[in] sendSize Number of elements to send
-     * \param[in] sendPmeCoordinatesFromGpu Whether send is from GPU, otherwise CPU
      * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
      */
-    void sendCoordinatesToPmeCudaDirect(void*                 sendPtr,
-                                        int                   sendSize,
-                                        bool                  sendPmeCoordinatesFromGpu,
-                                        GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+    void sendCoordinatesToPmeFromGpu(DeviceBuffer<RVec>    sendPtr,
+                                     int                   sendSize,
+                                     GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+
+    /*! \brief Push coordinates buffer from host memory directly to GPU memory on PME task
+     * \param[in] sendPtr Buffer with coordinate data
+     * \param[in] sendSize Number of elements to send
+     * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
+     */
+    void sendCoordinatesToPmeFromCpu(RVec*                 sendPtr,
+                                     int                   sendSize,
+                                     GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
 
     /*! \brief
      * Return pointer to buffer used for staging PME force on GPU
index d31b976c10f5a9de691420b747c26d9c0574d8ec..e7339f9c7ea49f03821aa1653ada6ecd6ec9df6e 100644 (file)
@@ -83,19 +83,25 @@ void PmePpCommGpu::reinit(int /* size */)
                "correct implementation.");
 }
 
-void PmePpCommGpu::receiveForceFromPmeCudaDirect(void* /* recvPtr */,
-                                                 int /* recvSize */,
-                                                 bool /* receivePmeForceToGpu */)
+void PmePpCommGpu::receiveForceFromPme(RVec* /* recvPtr */, int /* recvSize */, bool /* receivePmeForceToGpu */)
 {
     GMX_ASSERT(!impl_,
                "A CPU stub for PME-PP GPU communication was called instead of the correct "
                "implementation.");
 }
 
-void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void* /* sendPtr */,
-                                                  int /* sendSize */,
-                                                  bool /* sendPmeCoordinatesFromGpu */,
-                                                  GpuEventSynchronizer* /* coordinatesOnDeviceEvent */)
+void PmePpCommGpu::sendCoordinatesToPmeFromGpu(DeviceBuffer<RVec> /* sendPtr */,
+                                               int /* sendSize */,
+                                               GpuEventSynchronizer* /* coordinatesOnDeviceEvent */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub for PME-PP GPU communication was called instead of the correct "
+               "implementation.");
+}
+
+void PmePpCommGpu::sendCoordinatesToPmeFromCpu(RVec* /* sendPtr */,
+                                               int /* sendSize */,
+                                               GpuEventSynchronizer* /* coordinatesOnDeviceEvent */)
 {
     GMX_ASSERT(!impl_,
                "A CPU stub for PME-PP GPU communication was called instead of the correct "
index cb9e787c446836d27549d3e1c6775017520bc07f..2e242a074f91676abbc75da860a49c96754f5cb1 100644 (file)
@@ -43,6 +43,7 @@
  */
 #include "gmxpre.h"
 
+#include "gromacs/ewald/pme_pp_communication.h"
 #include "pme_pp_comm_gpu_impl.h"
 
 #include "config.h"
@@ -52,6 +53,7 @@
 #include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/gpu_utils/typecasts.cuh"
 #include "gromacs/utility/gmxmpi.h"
 
 namespace gmx
@@ -67,9 +69,6 @@ PmePpCommGpu::Impl::Impl(MPI_Comm             comm,
     pmeRank_(pmeRank),
     d_pmeForces_(nullptr)
 {
-    GMX_RELEASE_ASSERT(
-            GMX_THREAD_MPI,
-            "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
 }
 
 PmePpCommGpu::Impl::~Impl() = default;
@@ -78,20 +77,22 @@ void PmePpCommGpu::Impl::reinit(int size)
 {
     // This rank will access PME rank memory directly, so needs to receive the remote PME buffer addresses.
 #if GMX_MPI
-    MPI_Recv(&remotePmeXBuffer_, sizeof(void**), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
-    MPI_Recv(&remotePmeFBuffer_, sizeof(void**), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
+
+    if (GMX_THREAD_MPI)
+    {
+        // receive device buffer address from PME rank
+        MPI_Recv(&remotePmeXBuffer_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
+        MPI_Recv(&remotePmeFBuffer_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
+    }
+
+#endif
 
     // Reallocate buffer used for staging PME force on GPU
     reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, deviceContext_);
-#else
-    GMX_UNUSED_VALUE(size);
-#endif
     return;
 }
 
-// TODO make this asynchronous by splitting into this into
-// launchRecvForceFromPmeCudaDirect() and sycnRecvForceFromPmeCudaDirect()
-void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSize, bool receivePmeForceToGpu)
+void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(float3* pmeForcePtr, int recvSize, bool receivePmeForceToGpu)
 {
 #if GMX_MPI
     // Receive event from PME task and add to stream, to ensure pull of data doesn't
@@ -99,10 +100,10 @@ void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSi
     GpuEventSynchronizer* pmeSync;
     MPI_Recv(&pmeSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
     pmeSync->enqueueWaitEvent(pmePpCommStream_);
+#endif
 
     // Pull force data from remote GPU
-    void*       pmeForcePtr = receivePmeForceToGpu ? static_cast<void*>(d_pmeForces_) : recvPtr;
-    cudaError_t stat        = cudaMemcpyAsync(pmeForcePtr,
+    cudaError_t stat = cudaMemcpyAsync(pmeForcePtr,
                                        remotePmeFBuffer_,
                                        recvSize * DIM * sizeof(float),
                                        cudaMemcpyDefault,
@@ -120,21 +121,37 @@ void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSi
     {
         // Ensure CPU waits for PME forces to be copied before reducing
         // them with other forces on the CPU
-        cudaStreamSynchronize(pmePpCommStream_.stream());
+        pmePpCommStream_.synchronize();
     }
+}
+
+void PmePpCommGpu::Impl::receiveForceFromPmeCudaMpi(float3* pmeForcePtr, int recvSize)
+{
+#if GMX_MPI
+    MPI_Recv(pmeForcePtr, recvSize * DIM, MPI_FLOAT, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
 #else
-    GMX_UNUSED_VALUE(recvPtr);
+    GMX_UNUSED_VALUE(pmeForcePtr);
     GMX_UNUSED_VALUE(recvSize);
-    GMX_UNUSED_VALUE(receivePmeForceToGpu);
 #endif
 }
 
-void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr,
-                                                        int   sendSize,
-                                                        bool gmx_unused sendPmeCoordinatesFromGpu,
+void PmePpCommGpu::Impl::receiveForceFromPme(float3* recvPtr, int recvSize, bool receivePmeForceToGpu)
+{
+    float3* pmeForcePtr = receivePmeForceToGpu ? asFloat3(d_pmeForces_) : recvPtr;
+    if (GMX_THREAD_MPI)
+    {
+        receiveForceFromPmeCudaDirect(pmeForcePtr, recvSize, receivePmeForceToGpu);
+    }
+    else
+    {
+        receiveForceFromPmeCudaMpi(pmeForcePtr, recvSize);
+    }
+}
+
+void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(float3*               sendPtr,
+                                                        int                   sendSize,
                                                         GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
 {
-#if GMX_MPI
     // ensure stream waits until coordinate data is available on device
     coordinatesReadyOnDeviceEvent->enqueueWaitEvent(pmePpCommStream_);
 
@@ -145,18 +162,44 @@ void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr,
                                        pmePpCommStream_.stream());
     CU_RET_ERR(stat, "cudaMemcpyAsync on Send to PME CUDA direct data transfer failed");
 
+#if GMX_MPI
     // Record and send event to allow PME task to sync to above transfer before commencing force calculations
     pmeCoordinatesSynchronizer_.markEvent(pmePpCommStream_);
     GpuEventSynchronizer* pmeSync = &pmeCoordinatesSynchronizer_;
     MPI_Send(&pmeSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_);
+#endif
+}
+
+void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaMpi(float3*               sendPtr,
+                                                     int                   sendSize,
+                                                     GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+{
+    // ensure coordinate data is available on device before we start transfer
+    coordinatesReadyOnDeviceEvent->waitForEvent();
+
+#if GMX_MPI
+    float3* sendptr_x = sendPtr;
+
+    MPI_Send(sendptr_x, sendSize * DIM, MPI_FLOAT, pmeRank_, eCommType_COORD_GPU, comm_);
 #else
     GMX_UNUSED_VALUE(sendPtr);
     GMX_UNUSED_VALUE(sendSize);
-    GMX_UNUSED_VALUE(sendPmeCoordinatesFromGpu);
-    GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
 #endif
 }
 
+void PmePpCommGpu::Impl::sendCoordinatesToPme(float3*               sendPtr,
+                                              int                   sendSize,
+                                              GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+{
+    if (GMX_THREAD_MPI)
+    {
+        sendCoordinatesToPmeCudaDirect(sendPtr, sendSize, coordinatesReadyOnDeviceEvent);
+    }
+    else
+    {
+        sendCoordinatesToPmeCudaMpi(sendPtr, sendSize, coordinatesReadyOnDeviceEvent);
+    }
+}
 DeviceBuffer<Float3> PmePpCommGpu::Impl::getGpuForceStagingPtr()
 {
     return d_pmeForces_;
@@ -164,7 +207,14 @@ DeviceBuffer<Float3> PmePpCommGpu::Impl::getGpuForceStagingPtr()
 
 GpuEventSynchronizer* PmePpCommGpu::Impl::getForcesReadySynchronizer()
 {
-    return &forcesReadySynchronizer_;
+    if (GMX_THREAD_MPI)
+    {
+        return &forcesReadySynchronizer_;
+    }
+    else
+    {
+        return nullptr;
+    }
 }
 
 PmePpCommGpu::PmePpCommGpu(MPI_Comm             comm,
@@ -182,21 +232,26 @@ void PmePpCommGpu::reinit(int size)
     impl_->reinit(size);
 }
 
-void PmePpCommGpu::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSize, bool receivePmeForceToGpu)
+void PmePpCommGpu::receiveForceFromPme(RVec* recvPtr, int recvSize, bool receivePmeForceToGpu)
+{
+    impl_->receiveForceFromPme(asFloat3(recvPtr), recvSize, receivePmeForceToGpu);
+}
+
+void PmePpCommGpu::sendCoordinatesToPmeFromGpu(DeviceBuffer<RVec>    sendPtr,
+                                               int                   sendSize,
+                                               GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
 {
-    impl_->receiveForceFromPmeCudaDirect(recvPtr, recvSize, receivePmeForceToGpu);
+    impl_->sendCoordinatesToPme(asFloat3(sendPtr), sendSize, coordinatesReadyOnDeviceEvent);
 }
 
-void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void*                 sendPtr,
-                                                  int                   sendSize,
-                                                  bool                  sendPmeCoordinatesFromGpu,
-                                                  GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+void PmePpCommGpu::sendCoordinatesToPmeFromCpu(RVec*                 sendPtr,
+                                               int                   sendSize,
+                                               GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
 {
-    impl_->sendCoordinatesToPmeCudaDirect(
-            sendPtr, sendSize, sendPmeCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
+    impl_->sendCoordinatesToPme(asFloat3(sendPtr), sendSize, coordinatesReadyOnDeviceEvent);
 }
 
-DeviceBuffer<gmx::RVec> PmePpCommGpu::getGpuForceStagingPtr()
+DeviceBuffer<Float3> PmePpCommGpu::getGpuForceStagingPtr()
 {
     return impl_->getGpuForceStagingPtr();
 }
index 70ef8f937c3476b20e91e2d048fb759d4b36f9fb..d4ee85872e9aa5c3c1a6372574b1aaaf8313ecfc 100644 (file)
@@ -44,9 +44,7 @@
 #define GMX_PME_PP_COMM_GPU_IMPL_H
 
 #include "gromacs/ewald/pme_pp_comm_gpu.h"
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
-#include "gromacs/gpu_utils/gputraits.h"
 #include "gromacs/math/vectypes.h"
 #include "gromacs/utility/gmxmpi.h"
 
@@ -75,7 +73,7 @@ public:
 
     /*! \brief Pull force buffer directly from GPU memory on PME
      * rank to either GPU or CPU memory on PP task using CUDA
-     * Memory copy.
+     * Memory copy or CUDA-aware MPI.
      *
      * recvPtr should be in GPU or CPU memory if recvPmeForceToGpu
      * is true or false, respectively. If receiving to GPU, this
@@ -89,25 +87,20 @@ public:
      * \param[in] recvSize Number of elements to receive
      * \param[in] receivePmeForceToGpu Whether receive is to GPU, otherwise CPU
      */
-    void receiveForceFromPmeCudaDirect(void* recvPtr, int recvSize, bool receivePmeForceToGpu);
+    void receiveForceFromPme(float3* recvPtr, int recvSize, bool receivePmeForceToGpu);
 
 
     /*! \brief Push coordinates buffer directly to GPU memory on PME
      * task, from either GPU or CPU memory on PP task using CUDA
-     * Memory copy. sendPtr should be in GPU or CPU memory if
-     * sendPmeCoordinatesFromGpu is true or false respectively. If
-     * sending from GPU, this method should be called after the
-     * local GPU coordinate buffer operations. The remote PME task will
-     * automatically wait for data to be copied before commencing PME force calculations.
+     * Memory copy or CUDA-aware MPI. If sending from GPU, this method should
+     * be called after the local GPU coordinate buffer operations.
+     * The remote PME task will automatically wait for data to be copied
+     * before commencing PME force calculations.
      * \param[in] sendPtr Buffer with coordinate data
      * \param[in] sendSize Number of elements to send
-     * \param[in] sendPmeCoordinatesFromGpu Whether send is from GPU, otherwise CPU
      * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
      */
-    void sendCoordinatesToPmeCudaDirect(void*                 sendPtr,
-                                        int                   sendSize,
-                                        bool                  sendPmeCoordinatesFromGpu,
-                                        GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+    void sendCoordinatesToPme(float3* sendPtr, int sendSize, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
 
     /*! \brief
      * Return pointer to buffer used for staging PME force on GPU
@@ -119,15 +112,55 @@ public:
      */
     GpuEventSynchronizer* getForcesReadySynchronizer();
 
+private:
+    /*! \brief Pull force buffer directly from GPU memory on PME
+     * rank to either GPU or CPU memory on PP task using CUDA
+     * Memory copy. This method is used with Thread-MPI.
+     * \param[out] recvPtr buffer (in GPU or CPU memory) to receive PME force data
+     * \param[in] recvSize Number of elements to receive
+     * \param[in] receivePmeForceToGpu Whether receive is to GPU, otherwise CPU
+     */
+    void receiveForceFromPmeCudaDirect(float3* recvPtr, int recvSize, bool receivePmeForceToGpu);
+
+    /*! \brief Pull force buffer directly from GPU memory on PME
+     * rank to either GPU or CPU memory on PP task using CUDA-aware
+     * MPI. This method is used with process-MPI.
+     * \param[out] recvPtr buffer (in GPU or CPU memory) to receive PME force data
+     * \param[in] recvSize Number of elements to receive
+     */
+    void receiveForceFromPmeCudaMpi(float3* recvPtr, int recvSize);
+
+    /*! \brief Push coordinates buffer directly to GPU memory on PME
+     * task, from either GPU or CPU memory on PP task using CUDA Memory copy.
+     * This method is used with Thread-MPI.
+     * \param[in] sendPtr Buffer with coordinate data
+     * \param[in] sendSize Number of elements to send
+     * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
+     */
+    void sendCoordinatesToPmeCudaDirect(float3*               sendPtr,
+                                        int                   sendSize,
+                                        GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+
+    /*! \brief Push coordinates buffer directly to GPU memory on PME
+     * task, from either GPU or CPU memory on PP task using CUDA-aware MPI.
+     * This method is used with process-MPI.
+     * \param[in] sendPtr Buffer with coordinate data
+     * \param[in] sendSize Number of elements to send
+     * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
+     */
+    void sendCoordinatesToPmeCudaMpi(float3*               sendPtr,
+                                     int                   sendSize,
+                                     GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+
 private:
     //! GPU context handle (not used in CUDA)
     const DeviceContext& deviceContext_;
     //! Handle for CUDA stream used for the communication operations in this class
     const DeviceStream& pmePpCommStream_;
     //! Remote location of PME coordinate data buffer
-    void* remotePmeXBuffer_ = nullptr;
+    float3* remotePmeXBuffer_ = nullptr;
     //! Remote location of PME force data buffer
-    void* remotePmeFBuffer_ = nullptr;
+    float3* remotePmeFBuffer_ = nullptr;
     //! communicator for simulation
     MPI_Comm comm_;
     //! Rank of PME task
index 329284d4e4404e88feebf06a1d7636d8727c866c..9b6c5a9ce99494c7d3f6779e5cab8bec024d1c4c 100644 (file)
@@ -4,7 +4,7 @@
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -62,6 +62,7 @@ enum
     eCommType_SigmaB,
     eCommType_NR,
     eCommType_COORD,
+    eCommType_COORD_GPU,
     eCommType_CNB
 };
 
index 45fbace426dd5d59b788f85f29ace0233511cabc..e53d3c16c597eefc47527768dc427ee669a1986c 100644 (file)
@@ -209,14 +209,13 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md
             GMX_GPU_CUDA && useGpuForNonbonded && (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
     devFlags.enableGpuHaloExchange = GMX_GPU_CUDA && getenv("GMX_GPU_DD_COMMS") != nullptr;
     devFlags.forceGpuUpdateDefault = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr) || GMX_FAHCORE;
-    devFlags.enableGpuPmePPComm =
-            GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
+    devFlags.enableGpuPmePPComm = GMX_GPU_CUDA && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
 
 #pragma GCC diagnostic pop
 
     // Direct GPU comm path is being used with CUDA_AWARE_MPI
     // make sure underlying MPI implementation is CUDA-aware
-    if (!GMX_THREAD_MPI && devFlags.enableGpuHaloExchange)
+    if (!GMX_THREAD_MPI && (devFlags.enableGpuPmePPComm || devFlags.enableGpuHaloExchange))
     {
         const bool haveDetectedCudaAwareMpi =
                 (checkMpiCudaAwareSupport() == CudaAwareMpiStatus::Supported);
@@ -241,7 +240,9 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md
             devFlags.usingCudaAwareMpi = true;
             GMX_LOG(mdlog.warning)
                     .asParagraph()
-                    .appendTextFormatted("Using CUDA-aware MPI for 'GPU halo exchange' feature.");
+                    .appendTextFormatted(
+                            "Using CUDA-aware MPI for 'GPU halo exchange' or 'GPU PME-PP "
+                            "communications' feature.");
         }
         else
         {
@@ -255,6 +256,17 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md
                                 "detect CUDA_aware support in underlying MPI implementation.");
                 devFlags.enableGpuHaloExchange = false;
             }
+            if (devFlags.enableGpuPmePPComm)
+            {
+                GMX_LOG(mdlog.warning)
+                        .asParagraph()
+                        .appendText(
+                                "GMX_GPU_PME_PP_COMMS environment variable detected, but the "
+                                "'GPU PME-PP communications' feature will not be enabled as "
+                                "GROMACS couldn't detect CUDA-aware support in the underlying "
+                                "MPI implementation.");
+                devFlags.enableGpuPmePPComm = false;
+            }
 
             GMX_LOG(mdlog.warning)
                     .asParagraph()
@@ -2037,6 +2049,7 @@ int Mdrunner::mdrunner()
                     walltime_accounting,
                     inputrec.get(),
                     pmeRunMode,
+                    runScheduleWork.simulationWork.useGpuPmePpCommunication,
                     deviceStreamManager.get());
     }