Remove thread-MPI limitation for GPU direct PME-PP communication
authorGaurav Garg <gaugarg@nvidia.com>
Wed, 14 Apr 2021 05:50:07 +0000 (11:20 +0530)
committerMark Abraham <mark.j.abraham@gmail.com>
Fri, 16 Apr 2021 13:10:08 +0000 (13:10 +0000)
Allows direct GPU communication for PP-PME data exchange when running
with "real" (library) MPI, including across multiple compute nodes,
through new CUDA-aware MPI communication code paths.

Implements part of #2891
Refs: #2915 #3960

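The new code paths rely on passing device (GPU) pointers directly to MPI calls.
As a minimal, hedged illustration of that technique (a standalone sketch, not
GROMACS code; the rank roles, sizes, and names are invented for the example),
a CUDA-aware MPI implementation lets one rank send a device buffer straight
into another rank's device buffer:

    // Minimal sketch of CUDA-aware MPI: device pointers are passed directly to MPI,
    // so no staging through host memory is needed (requires a CUDA-aware MPI build).
    #include <cuda_runtime.h>
    #include <mpi.h>

    int main(int argc, char** argv)
    {
        MPI_Init(&argc, &argv);
        int rank = 0;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        const int numReals = 3 * 1024; // e.g. x/y/z components for 1024 atoms
        float*    d_buf    = nullptr;
        cudaMalloc(&d_buf, numReals * sizeof(float));

        if (rank == 0) // "PP"-like rank: send coordinates straight from GPU memory
        {
            MPI_Send(d_buf, numReals, MPI_FLOAT, 1, 0, MPI_COMM_WORLD);
        }
        else if (rank == 1) // "PME"-like rank: receive directly into GPU memory
        {
            MPI_Recv(d_buf, numReals, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }

        cudaFree(d_buf);
        MPI_Finalize();
        return 0;
    }
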
18 files changed:
docs/install-guide/index.rst
src/gromacs/ewald/pme_coordinate_receiver_gpu.h
src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp
src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu
src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h
src/gromacs/ewald/pme_force_sender_gpu.h
src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
src/gromacs/ewald/pme_force_sender_gpu_impl.cu
src/gromacs/ewald/pme_force_sender_gpu_impl.h
src/gromacs/ewald/pme_only.cpp
src/gromacs/ewald/pme_only.h
src/gromacs/ewald/pme_pp.cpp
src/gromacs/ewald/pme_pp_comm_gpu.h
src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
src/gromacs/ewald/pme_pp_comm_gpu_impl.h
src/gromacs/ewald/pme_pp_communication.h
src/gromacs/mdrun/runner.cpp

index c9666bf24665d1384cf6ac5a23d6435120afc2b3..27859f51f7d955ce66740c64a07ec2a7db4acd2d 100644 (file)
@@ -223,6 +223,32 @@ and add ``-DGMX_MPI=on`` to the cmake options. It is possible to set
 the compiler to the MPI compiler wrapper but it is neither necessary
 nor recommended.
 
+CUDA-Aware MPI support
+~~~~~~~~~~~~~~~~~~~~~~
+
+In simulations using multiple NVIDIA GPUs, an MPI implementation with CUDA support
+(also called "CUDA-aware") allows communication to be performed directly between the
+distinct GPU memory spaces without staging through CPU memory, often
+resulting in higher bandwidth and lower latency communication.  For
+more details, see `Introduction to CUDA-aware MPI
+<https://developer.nvidia.com/blog/introduction-cuda-aware-mpi/>`_.
+
+To use CUDA-aware MPI for direct GPU communication, we recommend using a
+recent OpenMPI version (>=4.1.0) together with a recent UCX version
+(>=1.10), since most GROMACS internal testing of CUDA-aware support has
+been performed using these versions. OpenMPI with CUDA-aware support can
+be built following the procedure in `these OpenMPI build instructions
+<https://www.open-mpi.org/faq/?category=buildcuda>`_.
+
+With ``GMX_MPI=ON``, GROMACS attempts to automatically detect CUDA support
+in the underlying MPI library at compile time, and enables direct GPU
+communication when this is detected. However, in some cases GROMACS may
+fail to detect existing CUDA-aware support, in which case it can be enabled
+manually by setting the environment variable ``GMX_FORCE_CUDA_AWARE_MPI=1``
+at runtime. Such cases still lack substantial testing, so we urge users to
+carefully check the correctness of results against those obtained with
+default build options, and to report any issues.
+
 CMake
 ^^^^^
 
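The documentation added above states that GROMACS tries to detect CUDA-aware
support in the underlying MPI library. For reference, Open MPI exposes this
through the MPIX_CUDA_AWARE_SUPPORT macro and the MPIX_Query_cuda_support()
extension declared in mpi-ext.h; the sketch below shows the kind of probe an
application can perform (it is not the GROMACS implementation, and other MPI
libraries may not provide these extensions):

    // Sketch: probing Open MPI for CUDA-aware support at compile time and run time.
    #include <cstdio>
    #include <mpi.h>
    #if defined(OPEN_MPI) && OPEN_MPI
    #    include <mpi-ext.h> // Open MPI extensions, e.g. MPIX_Query_cuda_support()
    #endif

    int main(int argc, char** argv)
    {
        MPI_Init(&argc, &argv);
    #if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
        std::printf("CUDA-aware support compiled in; run-time query says: %s\n",
                    MPIX_Query_cuda_support() ? "yes" : "no");
    #else
        std::printf("This MPI library does not advertise CUDA-aware support.\n");
    #endif
        MPI_Finalize();
        return 0;
    }
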
index 94aefe8501a1d15d082a04dabf33c023e7767d90..81f640df409a4110f8c71796e598114e06a2dac4 100644 (file)
@@ -83,9 +83,19 @@ public:
     void receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank);
 
     /*! \brief
-     * enqueue wait for coordinate data from PP ranks
+     * Used with lib MPI; receives coordinates from a PP rank
+     * \param[in] recvbuf   coordinates buffer in GPU memory
+     * \param[in] numAtoms  starting element (offset) in the buffer
+     * \param[in] numBytes  number of bytes to transfer
+     * \param[in] ppRank    PP rank that sends the data
      */
-    void enqueueWaitReceiveCoordinatesFromPpCudaDirect();
+    void launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf, int numAtoms, int numBytes, int ppRank);
+
+    /*! \brief
+     * For lib MPI, wait for the coordinates from the PP ranks;
+     * for thread MPI, enqueue the PP coordinate transfer events into the PME stream
+     */
+    void synchronizeOnCoordinatesFromPpRanks();
 
 private:
     class Impl;
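A hedged, generic sketch of the pattern the two new methods implement on the
PME side (illustrative names and signatures, not the GROMACS class): post one
non-blocking receive per PP rank directly into the device coordinate buffer,
then wait on all of them before PME work may consume the coordinates:

    // Generic sketch of the lib-MPI receive path: MPI_Irecv into GPU memory for each
    // PP rank, then MPI_Waitall before the coordinates are used.
    #include <vector>
    #include <cuda_runtime.h> // for float3
    #include <mpi.h>

    void receiveCoordinatesFromPpRanks(float3*                 d_x,        // device-side coordinate buffer
                                       const std::vector<int>& ppRanks,    // sending ranks
                                       const std::vector<int>& atomOffset, // per-rank start index
                                       const std::vector<int>& atomCount,  // per-rank atom count
                                       MPI_Comm                comm)
    {
        std::vector<MPI_Request> requests(ppRanks.size());
        for (size_t i = 0; i < ppRanks.size(); ++i)
        {
            // With CUDA-aware MPI the device address &d_x[offset] is passed directly
            MPI_Irecv(&d_x[atomOffset[i]],
                      atomCount[i] * static_cast<int>(sizeof(float3)),
                      MPI_BYTE,
                      ppRanks[i],
                      0,
                      comm,
                      &requests[i]);
        }
        // Block until all coordinate messages have arrived in GPU memory
        MPI_Waitall(static_cast<int>(requests.size()), requests.data(), MPI_STATUSES_IGNORE);
    }
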
index ae68eedca2e7ad8aab63a47ab494284cf9a2c196..4e997d319bd42cd32e9d32879178a99f3ca4a5fb 100644 (file)
@@ -89,7 +89,17 @@ void PmeCoordinateReceiverGpu::receiveCoordinatesSynchronizerFromPpCudaDirect(in
                "implementation.");
 }
 
-void PmeCoordinateReceiverGpu::enqueueWaitReceiveCoordinatesFromPpCudaDirect()
+void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> /* recvbuf */,
+                                                                     int /* numAtoms */,
+                                                                     int /* numBytes */,
+                                                                     int /* ppRank */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub for PME-PP GPU communication was called instead of the correct "
+               "implementation.");
+}
+
+void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRanks()
 {
     GMX_ASSERT(!impl_,
                "A CPU stub for PME-PP GPU communication was called instead of the correct "
index 10f48f1d979085e75572f4bf91ed8ed228acb971..7fa2122dfb8a1cc5a65f03084e9356de114fc7a9 100644 (file)
@@ -43,6 +43,7 @@
  */
 #include "gmxpre.h"
 
+#include "gromacs/ewald/pme_pp_communication.h"
 #include "pme_coordinate_receiver_gpu_impl.h"
 
 #include "config.h"
@@ -62,9 +63,6 @@ PmeCoordinateReceiverGpu::Impl::Impl(const DeviceStream&    pmeStream,
     comm_(comm),
     ppRanks_(ppRanks)
 {
-    GMX_RELEASE_ASSERT(
-            GMX_THREAD_MPI,
-            "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
     request_.resize(ppRanks.size());
     ppSync_.resize(ppRanks.size());
 }
@@ -73,28 +71,34 @@ PmeCoordinateReceiverGpu::Impl::~Impl() = default;
 
 void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x)
 {
-
-    int ind_start = 0;
-    int ind_end   = 0;
-    for (const auto& receiver : ppRanks_)
+    // Need to send the address to the PP rank only for thread-MPI, as the PP rank pushes data using cudaMemcpy
+    if (GMX_THREAD_MPI)
     {
-        ind_start = ind_end;
-        ind_end   = ind_start + receiver.numAtoms;
-
-        // Data will be transferred directly from GPU.
-        void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start]);
+        int ind_start = 0;
+        int ind_end   = 0;
+        for (const auto& receiver : ppRanks_)
+        {
+            ind_start = ind_end;
+            ind_end   = ind_start + receiver.numAtoms;
 
+            // Data will be transferred directly from GPU.
+            void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start]);
 #if GMX_MPI
-        MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_);
+            MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_);
 #else
-        GMX_UNUSED_VALUE(sendBuf);
+            GMX_UNUSED_VALUE(sendBuf);
 #endif
+        }
     }
 }
 
 /*! \brief Receive coordinate synchronizer pointer from the PP ranks. */
 void PmeCoordinateReceiverGpu::Impl::receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank)
 {
+    GMX_ASSERT(GMX_THREAD_MPI,
+               "receiveCoordinatesSynchronizerFromPpCudaDirect is expected to be called only for "
+               "Thread-MPI");
+
     // Data will be pushed directly from PP task
 
 #if GMX_MPI
@@ -106,18 +110,44 @@ void PmeCoordinateReceiverGpu::Impl::receiveCoordinatesSynchronizerFromPpCudaDir
 #endif
 }
 
-void PmeCoordinateReceiverGpu::Impl::enqueueWaitReceiveCoordinatesFromPpCudaDirect()
+/*! \brief Receive coordinate data using CUDA-aware MPI */
+void PmeCoordinateReceiverGpu::Impl::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf,
+                                                                           int numAtoms,
+                                                                           int numBytes,
+                                                                           int ppRank)
+{
+    GMX_ASSERT(GMX_LIB_MPI,
+               "launchReceiveCoordinatesFromPpCudaMpi is expected to be called only for Lib-MPI");
+
+#if GMX_MPI
+    MPI_Irecv(&recvbuf[numAtoms], numBytes, MPI_BYTE, ppRank, eCommType_COORD_GPU, comm_, &request_[recvCount_++]);
+#else
+    GMX_UNUSED_VALUE(recvbuf);
+    GMX_UNUSED_VALUE(numAtoms);
+    GMX_UNUSED_VALUE(numBytes);
+    GMX_UNUSED_VALUE(ppRank);
+#endif
+}
+
+void PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromPpRanks()
 {
     if (recvCount_ > 0)
     {
-        // ensure PME calculation doesn't commence until coordinate data has been transferred
+        // ensure PME calculation doesn't commence until coordinate data/remote events
+        // have been transferred
 #if GMX_MPI
         MPI_Waitall(recvCount_, request_.data(), MPI_STATUS_IGNORE);
 #endif
-        for (int i = 0; i < recvCount_; i++)
+
+        // Make PME stream wait on PP to PME data transfer events
+        if (GMX_THREAD_MPI)
         {
-            ppSync_[i]->enqueueWaitEvent(pmeStream_);
+            for (int i = 0; i < recvCount_; i++)
+            {
+                ppSync_[i]->enqueueWaitEvent(pmeStream_);
+            }
         }
+
         // reset receive counter
         recvCount_ = 0;
     }
@@ -142,9 +172,17 @@ void PmeCoordinateReceiverGpu::receiveCoordinatesSynchronizerFromPpCudaDirect(in
     impl_->receiveCoordinatesSynchronizerFromPpCudaDirect(ppRank);
 }
 
-void PmeCoordinateReceiverGpu::enqueueWaitReceiveCoordinatesFromPpCudaDirect()
+void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf,
+                                                                     int                numAtoms,
+                                                                     int                numBytes,
+                                                                     int                ppRank)
+{
+    impl_->launchReceiveCoordinatesFromPpCudaMpi(recvbuf, numAtoms, numBytes, ppRank);
+}
+
+void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRanks()
 {
-    impl_->enqueueWaitReceiveCoordinatesFromPpCudaDirect();
+    impl_->synchronizeOnCoordinatesFromPpRanks();
 }
 
 } // namespace gmx
index 0f4ca21fa35aed98fcb4b3f6049ae79df0425e95..604079c0b0fbcead4bebf03ae2a99eea132c5962 100644 (file)
@@ -79,9 +79,19 @@ public:
     void receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank);
 
     /*! \brief
-     * enqueue wait for coordinate data from PP ranks
+     * Used with lib MPI; receives coordinates from a PP rank
+     * \param[in] recvbuf   coordinates buffer in GPU memory
+     * \param[in] numAtoms  starting element (offset) in the buffer
+     * \param[in] numBytes  number of bytes to transfer
+     * \param[in] ppRank    PP rank that sends the data
      */
-    void enqueueWaitReceiveCoordinatesFromPpCudaDirect();
+    void launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf, int numAtoms, int numBytes, int ppRank);
+
+    /*! \brief
+     * For lib MPI, wait for the coordinates from the PP ranks;
+     * for thread MPI, enqueue the PP coordinate transfer events into the PME stream
+     */
+    void synchronizeOnCoordinatesFromPpRanks();
 
 private:
     //! CUDA stream for PME operations
index edced615046b956b804c0b776316a20b84013d60..e06f582ae8c948ab8f22bccae677ddbf5f40d923 100644 (file)
@@ -87,11 +87,21 @@ public:
     void sendForceBufferAddressToPpRanks(DeviceBuffer<RVec> d_f);
 
     /*! \brief
-     * Send force synchronizer to PP rank
+     * Send force synchronizer to PP rank (used with Thread-MPI)
      * \param[in] ppRank           PP rank to receive data
      */
     void sendFSynchronizerToPpCudaDirect(int ppRank);
 
+    /*! \brief
+     * Send force to PP rank (used with Lib-MPI)
+     * \param[in] sendbuf  force buffer in GPU memory
+     * \param[in] offset   starting element in buffer
+     * \param[in] numBytes number of bytes to transfer
+     * \param[in] ppRank   PP rank to receive data
+     * \param[in] request  MPI request to track asynchronous MPI call status
+     */
+    void sendFToPpCudaMpi(DeviceBuffer<RVec> sendbuf, int offset, int numBytes, int ppRank, MPI_Request* request);
+
 private:
     class Impl;
     std::unique_ptr<Impl> impl_;
index 0365a5563c74d777b24d129fa5329d49fc9c9564..f7cd9c5cc47c1e6e1e9f964d34e07723b1f2bab3 100644 (file)
@@ -90,6 +90,17 @@ void PmeForceSenderGpu::sendFSynchronizerToPpCudaDirect(int /* ppRank */)
                "implementation.");
 }
 
+void PmeForceSenderGpu::sendFToPpCudaMpi(DeviceBuffer<RVec> /* sendbuf */,
+                                         int /* offset */,
+                                         int /* numBytes */,
+                                         int /* ppRank */,
+                                         MPI_Request* /* request */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub for PME-PP GPU communication was called instead of the correct "
+               "implementation.");
+}
+
 } // namespace gmx
 
 #endif // !GMX_GPU_CUDA
index 753ac483f595fa2c542ef55f756e5a59c5265101..b124c03136e19cf66ab0f2ca729566ead93546a6 100644 (file)
@@ -62,9 +62,6 @@ PmeForceSenderGpu::Impl::Impl(GpuEventSynchronizer*  pmeForcesReady,
     comm_(comm),
     ppRanks_(ppRanks)
 {
-    GMX_RELEASE_ASSERT(
-            GMX_THREAD_MPI,
-            "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
 }
 
 PmeForceSenderGpu::Impl::~Impl() = default;
@@ -72,6 +69,13 @@ PmeForceSenderGpu::Impl::~Impl() = default;
 /*! \brief  sends force buffer address to PP ranks */
 void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(DeviceBuffer<Float3> d_f)
 {
+    // Need to send the address to the PP rank only for thread-MPI, as the PP rank pulls
+    // data using cudaMemcpy
+    if (!GMX_THREAD_MPI)
+    {
+        return;
+    }
+#if GMX_MPI
     int ind_start = 0;
     int ind_end   = 0;
     for (const auto& receiver : ppRanks_)
@@ -80,30 +84,56 @@ void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(DeviceBuffer<Float
         ind_end   = ind_start + receiver.numAtoms;
 
         // Data will be transferred directly from GPU.
-        void* sendBuf = reinterpret_cast<void*>(&d_f[ind_start]);
+        Float3* sendBuf = &d_f[ind_start];
 
-#if GMX_MPI
-        MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_);
+        MPI_Send(&sendBuf, sizeof(Float3*), MPI_BYTE, receiver.rankId, 0, comm_);
+    }
 #else
-        GMX_UNUSED_VALUE(sendBuf);
+    GMX_UNUSED_VALUE(d_f);
 #endif
-    }
 }
 
 /*! \brief Send PME synchronizer directly using CUDA memory copy */
 void PmeForceSenderGpu::Impl::sendFSynchronizerToPpCudaDirect(int ppRank)
 {
+    GMX_ASSERT(GMX_THREAD_MPI,
+               "sendFSynchronizerToPpCudaDirect is expected to be called only for Thread-MPI");
+
     // Data will be pulled directly from PP task
 #if GMX_MPI
     // TODO Using MPI_Isend would be more efficient, particularly when
     // sending to multiple PP ranks
     MPI_Send(&pmeForcesReady_, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_);
 #else
-    GMX_UNUSED_VALUE(pmeSyncPtr);
     GMX_UNUSED_VALUE(ppRank);
 #endif
 }
 
+/*! \brief Send PME data directly using CUDA-aware MPI */
+void PmeForceSenderGpu::Impl::sendFToPpCudaMpi(DeviceBuffer<RVec> sendbuf,
+                                               int                offset,
+                                               int                numBytes,
+                                               int                ppRank,
+                                               MPI_Request*       request)
+{
+    GMX_ASSERT(GMX_LIB_MPI, "sendFToPpCudaMpi is expected to be called only for Lib-MPI");
+
+#if GMX_MPI
+    // if using GPU direct comm with CUDA-aware MPI, make sure forces are ready on device
+    // before sending them to PP ranks
+    pmeForcesReady_->waitForEvent();
+
+    MPI_Isend(sendbuf[offset], numBytes, MPI_BYTE, ppRank, 0, comm_, request);
+
+#else
+    GMX_UNUSED_VALUE(sendbuf);
+    GMX_UNUSED_VALUE(offset);
+    GMX_UNUSED_VALUE(numBytes);
+    GMX_UNUSED_VALUE(ppRank);
+    GMX_UNUSED_VALUE(request);
+#endif
+}
+
 PmeForceSenderGpu::PmeForceSenderGpu(GpuEventSynchronizer*  pmeForcesReady,
                                      MPI_Comm               comm,
                                      gmx::ArrayRef<PpRanks> ppRanks) :
@@ -123,5 +153,14 @@ void PmeForceSenderGpu::sendFSynchronizerToPpCudaDirect(int ppRank)
     impl_->sendFSynchronizerToPpCudaDirect(ppRank);
 }
 
+void PmeForceSenderGpu::sendFToPpCudaMpi(DeviceBuffer<RVec> sendbuf,
+                                         int                offset,
+                                         int                numBytes,
+                                         int                ppRank,
+                                         MPI_Request*       request)
+{
+    impl_->sendFToPpCudaMpi(sendbuf, offset, numBytes, ppRank, request);
+}
+
 
 } // namespace gmx
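The new sendFToPpCudaMpi() above first waits on the forces-ready event and only
then hands the device pointer to MPI_Isend. A hedged, generic sketch of that
ordering (illustrative names, not GROMACS API):

    // Generic sketch: synchronize on the CUDA event that marks the forces as complete,
    // then post a non-blocking CUDA-aware MPI send straight from device memory.
    #include <cuda_runtime.h>
    #include <mpi.h>

    void sendForcesWhenReady(const float3* d_f,         // device force buffer
                             int           numAtoms,    // number of atoms to send
                             cudaEvent_t   forcesReady, // recorded after the force kernel/copy
                             int           ppRank,      // destination PP rank
                             MPI_Comm      comm,
                             MPI_Request*  request)
    {
        // CPU-side wait until the GPU has finished producing the forces
        cudaEventSynchronize(forcesReady);

        // With CUDA-aware MPI the device buffer is sent without host staging
        MPI_Isend(d_f, numAtoms * static_cast<int>(sizeof(float3)), MPI_BYTE, ppRank, 0, comm, request);
    }
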
index 9ff0a15a9ad0c92ae47907c20e7579c0fcb9ea0e..0e0ad8122cbdccf5ce6bcd3bbd1d73570f1ae700 100644 (file)
@@ -74,11 +74,21 @@ public:
     void sendForceBufferAddressToPpRanks(DeviceBuffer<Float3> d_f);
 
     /*! \brief
-     * Send force synchronizer to PP rank
+     * Send force synchronizer to PP rank (used with Thread-MPI)
      * \param[in] ppRank           PP rank to receive data
      */
     void sendFSynchronizerToPpCudaDirect(int ppRank);
 
+    /*! \brief
+     * Send force to PP rank (used with Lib-MPI)
+     * \param[in] sendbuf  force buffer in GPU memory
+     * \param[in] offset   starting element in buffer
+     * \param[in] numBytes number of bytes to transfer
+     * \param[in] ppRank   PP rank to receive data
+     * \param[in] request  MPI request to track asynchronous MPI call status
+     */
+    void sendFToPpCudaMpi(DeviceBuffer<RVec> sendbuf, int offset, int numBytes, int ppRank, MPI_Request* request);
+
 private:
     //! Event indicating when PME forces are ready on the GPU in order for PP stream to sync with the PME stream
     GpuEventSynchronizer* pmeForcesReady_;
index 5130034a9fb3c06b2fc1a1ba565c91ac2e743674..64f685ab44a61090250d34e98340f0fed5428669 100644 (file)
 #include "pme_output.h"
 #include "pme_pp_communication.h"
 
-/*! \brief environment variable to enable GPU P2P communication */
-static const bool c_enableGpuPmePpComms =
-        GMX_GPU_CUDA && GMX_THREAD_MPI && (getenv("GMX_GPU_PME_PP_COMMS") != nullptr);
-
 /*! \brief Master PP-PME communication data structure */
 struct gmx_pme_pp
 {
@@ -466,8 +462,16 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
                 {
                     if (pme_pp->useGpuDirectComm)
                     {
-                        pme_pp->pmeCoordinateReceiverGpu->receiveCoordinatesSynchronizerFromPpCudaDirect(
-                                sender.rankId);
+                        if (GMX_THREAD_MPI)
+                        {
+                            pme_pp->pmeCoordinateReceiverGpu->receiveCoordinatesSynchronizerFromPpCudaDirect(
+                                    sender.rankId);
+                        }
+                        else
+                        {
+                            pme_pp->pmeCoordinateReceiverGpu->launchReceiveCoordinatesFromPpCudaMpi(
+                                    stateGpu->getCoordinates(), nat, sender.numAtoms * sizeof(rvec), sender.rankId);
+                        }
                     }
                     else
                     {
@@ -493,7 +497,7 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
 
             if (pme_pp->useGpuDirectComm)
             {
-                pme_pp->pmeCoordinateReceiverGpu->enqueueWaitReceiveCoordinatesFromPpCudaDirect();
+                pme_pp->pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromPpRanks();
             }
 
             status = pmerecvqxX;
@@ -531,7 +535,8 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
 }
 
 /*! \brief Send the PME mesh force, virial and energy to the PP-only ranks. */
-static void gmx_pme_send_force_vir_ener(gmx_pme_pp*      pme_pp,
+static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
+                                        gmx_pme_pp*      pme_pp,
                                         const PmeOutput& output,
                                         real             dvdlambda_q,
                                         real             dvdlambda_lj,
@@ -547,18 +552,32 @@ static void gmx_pme_send_force_vir_ener(gmx_pme_pp*      pme_pp,
     ind_end  = 0;
     for (const auto& receiver : pme_pp->ppRanks)
     {
-        ind_start     = ind_end;
-        ind_end       = ind_start + receiver.numAtoms;
-        void* sendbuf = const_cast<void*>(static_cast<const void*>(output.forces_[ind_start]));
+        ind_start = ind_end;
+        ind_end   = ind_start + receiver.numAtoms;
         if (pme_pp->useGpuDirectComm)
         {
             GMX_ASSERT((pme_pp->pmeForceSenderGpu != nullptr),
                        "The use of GPU direct communication for PME-PP is enabled, "
                        "but the PME GPU force reciever object does not exist");
-            pme_pp->pmeForceSenderGpu->sendFSynchronizerToPpCudaDirect(receiver.rankId);
+
+            if (GMX_THREAD_MPI)
+            {
+                pme_pp->pmeForceSenderGpu->sendFSynchronizerToPpCudaDirect(receiver.rankId);
+            }
+            else
+            {
+                pme_pp->pmeForceSenderGpu->sendFToPpCudaMpi(pme_gpu_get_device_f(&pme),
+                                                            ind_start,
+                                                            receiver.numAtoms * sizeof(rvec),
+                                                            receiver.rankId,
+                                                            &pme_pp->req[messages]);
+
+                messages++;
+            }
         }
         else
         {
+            void* sendbuf = const_cast<void*>(static_cast<const void*>(output.forces_[ind_start]));
             // Send using MPI
             MPI_Isend(sendbuf,
                       receiver.numAtoms * sizeof(rvec),
@@ -593,6 +612,7 @@ static void gmx_pme_send_force_vir_ener(gmx_pme_pp*      pme_pp,
     MPI_Waitall(messages, pme_pp->req.data(), pme_pp->stat.data());
 #else
     GMX_RELEASE_ASSERT(false, "Invalid call to gmx_pme_send_force_vir_ener");
+    GMX_UNUSED_VALUE(pme);
     GMX_UNUSED_VALUE(pme_pp);
     GMX_UNUSED_VALUE(output);
     GMX_UNUSED_VALUE(dvdlambda_q);
@@ -608,6 +628,7 @@ int gmx_pmeonly(struct gmx_pme_t*               pme,
                 gmx_walltime_accounting_t       walltime_accounting,
                 t_inputrec*                     ir,
                 PmeRunMode                      runMode,
+                bool                            useGpuPmePpCommunication,
                 const gmx::DeviceStreamManager* deviceStreamManager)
 {
     int     ret;
@@ -640,7 +661,7 @@ int gmx_pmeonly(struct gmx_pme_t*               pme,
                            "Device stream can not be nullptr when using GPU in PME-only rank");
         changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy());
         changePinningPolicy(&pme_pp->x, pme_get_pinning_policy());
-        if (c_enableGpuPmePpComms)
+        if (useGpuPmePpCommunication)
         {
             pme_pp->pmeCoordinateReceiverGpu = std::make_unique<gmx::PmeCoordinateReceiverGpu>(
                     deviceStreamManager->stream(gmx::DeviceStreamType::Pme),
@@ -780,7 +801,7 @@ int gmx_pmeonly(struct gmx_pme_t*               pme,
         }
 
         cycles = wallcycle_stop(wcycle, WallCycleCounter::PmeMesh);
-        gmx_pme_send_force_vir_ener(pme_pp.get(), output, dvdlambda_q, dvdlambda_lj, cycles);
+        gmx_pme_send_force_vir_ener(*pme, pme_pp.get(), output, dvdlambda_q, dvdlambda_lj, cycles);
 
         count++;
     } /***** end of quasi-loop, we stop with the break above */
index 1a71ea195c1b74ad44d168bd20753cfb0706e4d4..150a5dad550439b534ce36b8f4c69334f8dda7d6 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2020, by the GROMACS development team, led by
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -69,6 +69,7 @@ int gmx_pmeonly(gmx_pme_t*                      pme,
                 gmx_walltime_accounting_t       walltime_accounting,
                 t_inputrec*                     ir,
                 PmeRunMode                      runMode,
+                bool                            useGpuPmePpCommunication,
                 const gmx::DeviceStreamManager* deviceStreamManager);
 
 #endif
index 63693ed6a631eefa32d71151cc46d928b5cb89ec..11ca5faf720ce072b7c85911a94d9204bf20cd72 100644 (file)
@@ -265,11 +265,16 @@ static void gmx_pme_send_coeffs_coords(t_forcerec*         fr,
             real* xRealPtr = const_cast<real*>(x[0]);
             if (useGpuPmePpComms && (fr != nullptr))
             {
-                void* sendPtr = sendCoordinatesFromGpu
-                                        ? static_cast<void*>(fr->stateGpu->getCoordinates())
-                                        : static_cast<void*>(xRealPtr);
-                fr->pmePpCommGpu->sendCoordinatesToPmeCudaDirect(
-                        sendPtr, n, sendCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
+                if (sendCoordinatesFromGpu)
+                {
+                    fr->pmePpCommGpu->sendCoordinatesToPmeFromGpu(
+                            fr->stateGpu->getCoordinates(), n, coordinatesReadyOnDeviceEvent);
+                }
+                else
+                {
+                    fr->pmePpCommGpu->sendCoordinatesToPmeFromCpu(
+                            reinterpret_cast<gmx::RVec*>(xRealPtr), n, coordinatesReadyOnDeviceEvent);
+                }
             }
             else
             {
@@ -509,8 +514,8 @@ static void recvFFromPme(gmx::PmePpCommGpu* pmePpCommGpu,
     if (useGpuPmePpComms)
     {
         GMX_ASSERT(pmePpCommGpu != nullptr, "Need valid pmePpCommGpu");
-        // Receive directly using CUDA memory copy
-        pmePpCommGpu->receiveForceFromPmeCudaDirect(recvptr, n, receivePmeForceToGpu);
+        // Receive forces from PME rank
+        pmePpCommGpu->receiveForceFromPme(static_cast<gmx::RVec*>(recvptr), n, receivePmeForceToGpu);
     }
     else
     {
index 886e0c221b8f8a456b99ef4e40fcea45070da3e9..1e4e614cd99920f7a50af358ed7c963dff8ee96b 100644 (file)
@@ -45,6 +45,7 @@
 #include <memory>
 
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/math/vectypes.h"
 #include "gromacs/utility/gmxmpi.h"
 
 class DeviceContext;
@@ -84,18 +85,25 @@ public:
      * \param[in]  recvSize Number of elements to receive
      * \param[in] recvPmeForceToGpu Whether receive is to GPU, otherwise CPU
      */
-    void receiveForceFromPmeCudaDirect(void* recvPtr, int recvSize, bool recvPmeForceToGpu);
+    void receiveForceFromPme(RVec* recvPtr, int recvSize, bool recvPmeForceToGpu);
 
     /*! \brief Push coordinates buffer directly to GPU memory on PME task
      * \param[in] sendPtr Buffer with coordinate data
      * \param[in] sendSize Number of elements to send
-     * \param[in] sendPmeCoordinatesFromGpu Whether send is from GPU, otherwise CPU
      * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
      */
-    void sendCoordinatesToPmeCudaDirect(void*                 sendPtr,
-                                        int                   sendSize,
-                                        bool                  sendPmeCoordinatesFromGpu,
-                                        GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+    void sendCoordinatesToPmeFromGpu(DeviceBuffer<RVec>    sendPtr,
+                                     int                   sendSize,
+                                     GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+
+    /*! \brief Push coordinates buffer from host memory directly to GPU memory on PME task
+     * \param[in] sendPtr Buffer with coordinate data
+     * \param[in] sendSize Number of elements to send
+     * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
+     */
+    void sendCoordinatesToPmeFromCpu(RVec*                 sendPtr,
+                                     int                   sendSize,
+                                     GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
 
     /*! \brief
      * Return pointer to buffer used for staging PME force on GPU
index d31b976c10f5a9de691420b747c26d9c0574d8ec..e7339f9c7ea49f03821aa1653ada6ecd6ec9df6e 100644 (file)
@@ -83,19 +83,25 @@ void PmePpCommGpu::reinit(int /* size */)
                "correct implementation.");
 }
 
-void PmePpCommGpu::receiveForceFromPmeCudaDirect(void* /* recvPtr */,
-                                                 int /* recvSize */,
-                                                 bool /* receivePmeForceToGpu */)
+void PmePpCommGpu::receiveForceFromPme(RVec* /* recvPtr */, int /* recvSize */, bool /* receivePmeForceToGpu */)
 {
     GMX_ASSERT(!impl_,
                "A CPU stub for PME-PP GPU communication was called instead of the correct "
                "implementation.");
 }
 
-void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void* /* sendPtr */,
-                                                  int /* sendSize */,
-                                                  bool /* sendPmeCoordinatesFromGpu */,
-                                                  GpuEventSynchronizer* /* coordinatesOnDeviceEvent */)
+void PmePpCommGpu::sendCoordinatesToPmeFromGpu(DeviceBuffer<RVec> /* sendPtr */,
+                                               int /* sendSize */,
+                                               GpuEventSynchronizer* /* coordinatesOnDeviceEvent */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub for PME-PP GPU communication was called instead of the correct "
+               "implementation.");
+}
+
+void PmePpCommGpu::sendCoordinatesToPmeFromCpu(RVec* /* sendPtr */,
+                                               int /* sendSize */,
+                                               GpuEventSynchronizer* /* coordinatesOnDeviceEvent */)
 {
     GMX_ASSERT(!impl_,
                "A CPU stub for PME-PP GPU communication was called instead of the correct "
index cb9e787c446836d27549d3e1c6775017520bc07f..2e242a074f91676abbc75da860a49c96754f5cb1 100644 (file)
@@ -43,6 +43,7 @@
  */
 #include "gmxpre.h"
 
+#include "gromacs/ewald/pme_pp_communication.h"
 #include "pme_pp_comm_gpu_impl.h"
 
 #include "config.h"
@@ -52,6 +53,7 @@
 #include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/gpu_utils/typecasts.cuh"
 #include "gromacs/utility/gmxmpi.h"
 
 namespace gmx
@@ -67,9 +69,6 @@ PmePpCommGpu::Impl::Impl(MPI_Comm             comm,
     pmeRank_(pmeRank),
     d_pmeForces_(nullptr)
 {
-    GMX_RELEASE_ASSERT(
-            GMX_THREAD_MPI,
-            "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
 }
 
 PmePpCommGpu::Impl::~Impl() = default;
@@ -78,20 +77,22 @@ void PmePpCommGpu::Impl::reinit(int size)
 {
     // This rank will access PME rank memory directly, so needs to receive the remote PME buffer addresses.
 #if GMX_MPI
-    MPI_Recv(&remotePmeXBuffer_, sizeof(void**), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
-    MPI_Recv(&remotePmeFBuffer_, sizeof(void**), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
+
+    if (GMX_THREAD_MPI)
+    {
+        // receive device buffer address from PME rank
+        MPI_Recv(&remotePmeXBuffer_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
+        MPI_Recv(&remotePmeFBuffer_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
+    }
+
+#endif
 
     // Reallocate buffer used for staging PME force on GPU
     reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, deviceContext_);
-#else
-    GMX_UNUSED_VALUE(size);
-#endif
     return;
 }
 
-// TODO make this asynchronous by splitting into this into
-// launchRecvForceFromPmeCudaDirect() and sycnRecvForceFromPmeCudaDirect()
-void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSize, bool receivePmeForceToGpu)
+void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(float3* pmeForcePtr, int recvSize, bool receivePmeForceToGpu)
 {
 #if GMX_MPI
     // Receive event from PME task and add to stream, to ensure pull of data doesn't
@@ -99,10 +100,10 @@ void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSi
     GpuEventSynchronizer* pmeSync;
     MPI_Recv(&pmeSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
     pmeSync->enqueueWaitEvent(pmePpCommStream_);
+#endif
 
     // Pull force data from remote GPU
-    void*       pmeForcePtr = receivePmeForceToGpu ? static_cast<void*>(d_pmeForces_) : recvPtr;
-    cudaError_t stat        = cudaMemcpyAsync(pmeForcePtr,
+    cudaError_t stat = cudaMemcpyAsync(pmeForcePtr,
                                        remotePmeFBuffer_,
                                        recvSize * DIM * sizeof(float),
                                        cudaMemcpyDefault,
@@ -120,21 +121,37 @@ void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSi
     {
         // Ensure CPU waits for PME forces to be copied before reducing
         // them with other forces on the CPU
-        cudaStreamSynchronize(pmePpCommStream_.stream());
+        pmePpCommStream_.synchronize();
     }
+}
+
+void PmePpCommGpu::Impl::receiveForceFromPmeCudaMpi(float3* pmeForcePtr, int recvSize)
+{
+#if GMX_MPI
+    MPI_Recv(pmeForcePtr, recvSize * DIM, MPI_FLOAT, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
 #else
-    GMX_UNUSED_VALUE(recvPtr);
+    GMX_UNUSED_VALUE(pmeForcePtr);
     GMX_UNUSED_VALUE(recvSize);
-    GMX_UNUSED_VALUE(receivePmeForceToGpu);
 #endif
 }
 
-void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr,
-                                                        int   sendSize,
-                                                        bool gmx_unused sendPmeCoordinatesFromGpu,
+void PmePpCommGpu::Impl::receiveForceFromPme(float3* recvPtr, int recvSize, bool receivePmeForceToGpu)
+{
+    float3* pmeForcePtr = receivePmeForceToGpu ? asFloat3(d_pmeForces_) : recvPtr;
+    if (GMX_THREAD_MPI)
+    {
+        receiveForceFromPmeCudaDirect(pmeForcePtr, recvSize, receivePmeForceToGpu);
+    }
+    else
+    {
+        receiveForceFromPmeCudaMpi(pmeForcePtr, recvSize);
+    }
+}
+
+void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(float3*               sendPtr,
+                                                        int                   sendSize,
                                                         GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
 {
-#if GMX_MPI
     // ensure stream waits until coordinate data is available on device
     coordinatesReadyOnDeviceEvent->enqueueWaitEvent(pmePpCommStream_);
 
@@ -145,18 +162,44 @@ void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr,
                                        pmePpCommStream_.stream());
     CU_RET_ERR(stat, "cudaMemcpyAsync on Send to PME CUDA direct data transfer failed");
 
+#if GMX_MPI
     // Record and send event to allow PME task to sync to above transfer before commencing force calculations
     pmeCoordinatesSynchronizer_.markEvent(pmePpCommStream_);
     GpuEventSynchronizer* pmeSync = &pmeCoordinatesSynchronizer_;
     MPI_Send(&pmeSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_);
+#endif
+}
+
+void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaMpi(float3*               sendPtr,
+                                                     int                   sendSize,
+                                                     GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+{
+    // ensure coordinate data is available on device before we start transfer
+    coordinatesReadyOnDeviceEvent->waitForEvent();
+
+#if GMX_MPI
+    float3* sendptr_x = sendPtr;
+
+    MPI_Send(sendptr_x, sendSize * DIM, MPI_FLOAT, pmeRank_, eCommType_COORD_GPU, comm_);
 #else
     GMX_UNUSED_VALUE(sendPtr);
     GMX_UNUSED_VALUE(sendSize);
-    GMX_UNUSED_VALUE(sendPmeCoordinatesFromGpu);
-    GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
 #endif
 }
 
+void PmePpCommGpu::Impl::sendCoordinatesToPme(float3*               sendPtr,
+                                              int                   sendSize,
+                                              GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+{
+    if (GMX_THREAD_MPI)
+    {
+        sendCoordinatesToPmeCudaDirect(sendPtr, sendSize, coordinatesReadyOnDeviceEvent);
+    }
+    else
+    {
+        sendCoordinatesToPmeCudaMpi(sendPtr, sendSize, coordinatesReadyOnDeviceEvent);
+    }
+}
 DeviceBuffer<Float3> PmePpCommGpu::Impl::getGpuForceStagingPtr()
 {
     return d_pmeForces_;
@@ -164,7 +207,14 @@ DeviceBuffer<Float3> PmePpCommGpu::Impl::getGpuForceStagingPtr()
 
 GpuEventSynchronizer* PmePpCommGpu::Impl::getForcesReadySynchronizer()
 {
-    return &forcesReadySynchronizer_;
+    if (GMX_THREAD_MPI)
+    {
+        return &forcesReadySynchronizer_;
+    }
+    else
+    {
+        return nullptr;
+    }
 }
 
 PmePpCommGpu::PmePpCommGpu(MPI_Comm             comm,
@@ -182,21 +232,26 @@ void PmePpCommGpu::reinit(int size)
     impl_->reinit(size);
 }
 
-void PmePpCommGpu::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSize, bool receivePmeForceToGpu)
+void PmePpCommGpu::receiveForceFromPme(RVec* recvPtr, int recvSize, bool receivePmeForceToGpu)
+{
+    impl_->receiveForceFromPme(asFloat3(recvPtr), recvSize, receivePmeForceToGpu);
+}
+
+void PmePpCommGpu::sendCoordinatesToPmeFromGpu(DeviceBuffer<RVec>    sendPtr,
+                                               int                   sendSize,
+                                               GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
 {
-    impl_->receiveForceFromPmeCudaDirect(recvPtr, recvSize, receivePmeForceToGpu);
+    impl_->sendCoordinatesToPme(asFloat3(sendPtr), sendSize, coordinatesReadyOnDeviceEvent);
 }
 
-void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void*                 sendPtr,
-                                                  int                   sendSize,
-                                                  bool                  sendPmeCoordinatesFromGpu,
-                                                  GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+void PmePpCommGpu::sendCoordinatesToPmeFromCpu(RVec*                 sendPtr,
+                                               int                   sendSize,
+                                               GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
 {
-    impl_->sendCoordinatesToPmeCudaDirect(
-            sendPtr, sendSize, sendPmeCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
+    impl_->sendCoordinatesToPme(asFloat3(sendPtr), sendSize, coordinatesReadyOnDeviceEvent);
 }
 
-DeviceBuffer<gmx::RVec> PmePpCommGpu::getGpuForceStagingPtr()
+DeviceBuffer<Float3> PmePpCommGpu::getGpuForceStagingPtr()
 {
     return impl_->getGpuForceStagingPtr();
 }
index 70ef8f937c3476b20e91e2d048fb759d4b36f9fb..d4ee85872e9aa5c3c1a6372574b1aaaf8313ecfc 100644 (file)
@@ -44,9 +44,7 @@
 #define GMX_PME_PP_COMM_GPU_IMPL_H
 
 #include "gromacs/ewald/pme_pp_comm_gpu.h"
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
-#include "gromacs/gpu_utils/gputraits.h"
 #include "gromacs/math/vectypes.h"
 #include "gromacs/utility/gmxmpi.h"
 
@@ -75,7 +73,7 @@ public:
 
     /*! \brief Pull force buffer directly from GPU memory on PME
      * rank to either GPU or CPU memory on PP task using CUDA
-     * Memory copy.
+     * Memory copy or CUDA-aware MPI.
      *
      * recvPtr should be in GPU or CPU memory if recvPmeForceToGpu
      * is true or false, respectively. If receiving to GPU, this
@@ -89,25 +87,20 @@ public:
      * \param[in] recvSize Number of elements to receive
      * \param[in] receivePmeForceToGpu Whether receive is to GPU, otherwise CPU
      */
-    void receiveForceFromPmeCudaDirect(void* recvPtr, int recvSize, bool receivePmeForceToGpu);
+    void receiveForceFromPme(float3* recvPtr, int recvSize, bool receivePmeForceToGpu);
 
 
     /*! \brief Push coordinates buffer directly to GPU memory on PME
      * task, from either GPU or CPU memory on PP task using CUDA
-     * Memory copy. sendPtr should be in GPU or CPU memory if
-     * sendPmeCoordinatesFromGpu is true or false respectively. If
-     * sending from GPU, this method should be called after the
-     * local GPU coordinate buffer operations. The remote PME task will
-     * automatically wait for data to be copied before commencing PME force calculations.
+     * Memory copy or CUDA-aware MPI. If sending from GPU, this method should
+     * be called after the local GPU coordinate buffer operations.
+     * The remote PME task will automatically wait for data to be copied
+     * before commencing PME force calculations.
      * \param[in] sendPtr Buffer with coordinate data
      * \param[in] sendSize Number of elements to send
      * \param[in] sendPtr Buffer with coordinate data
      * \param[in] sendSize Number of elements to send
-     * \param[in] sendPmeCoordinatesFromGpu Whether send is from GPU, otherwise CPU
      * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
      */
      * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
      */
-    void sendCoordinatesToPmeCudaDirect(void*                 sendPtr,
-                                        int                   sendSize,
-                                        bool                  sendPmeCoordinatesFromGpu,
-                                        GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+    void sendCoordinatesToPme(float3* sendPtr, int sendSize, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
 
     /*! \brief
      * Return pointer to buffer used for staging PME force on GPU
 
      */
     GpuEventSynchronizer* getForcesReadySynchronizer();
 
      */
     GpuEventSynchronizer* getForcesReadySynchronizer();
 
+private:
+    /*! \brief Pull force buffer directly from GPU memory on PME
+     * rank to either GPU or CPU memory on PP task using CUDA
+     * Memory copy. This method is used with Thread-MPI.
+     * \param[out] recvPtr CPU buffer to receive PME force data
+     * \param[in] recvSize Number of elements to receive
+     * \param[in] receivePmeForceToGpu Whether receive is to GPU, otherwise CPU
+     */
+    void receiveForceFromPmeCudaDirect(float3* recvPtr, int recvSize, bool receivePmeForceToGpu);
+
+    /*! \brief Pull force buffer directly from GPU memory on PME
+     * rank to either GPU or CPU memory on PP task using CUDA-aware
+     * MPI. This method is used with process-MPI.
+     * \param[out] recvPtr CPU buffer to receive PME force data
+     * \param[in] recvSize Number of elements to receive
+     */
+    void receiveForceFromPmeCudaMpi(float3* recvPtr, int recvSize);
+
+    /*! \brief Push coordinates buffer directly to GPU memory on PME
+     * task, from either GPU or CPU memory on PP task using CUDA Memory copy.
+     * This method is used with Thread-MPI.
+     * \param[in] sendPtr Buffer with coordinate data
+     * \param[in] sendSize Number of elements to send
+     * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
+     */
+    void sendCoordinatesToPmeCudaDirect(float3*               sendPtr,
+                                        int                   sendSize,
+                                        GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+
+    /*! \brief Push coordinates buffer directly to GPU memory on PME
+     * task, from either GPU or CPU memory on PP task using CUDA-aware MPI.
+     * This method is used with process-MPI.
+     * \param[in] sendPtr Buffer with coordinate data
+     * \param[in] sendSize Number of elements to send
+     * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
+     */
+    void sendCoordinatesToPmeCudaMpi(float3*               sendPtr,
+                                     int                   sendSize,
+                                     GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+
 private:
     //! GPU context handle (not used in CUDA)
     const DeviceContext& deviceContext_;
     //! Handle for CUDA stream used for the communication operations in this class
     const DeviceStream& pmePpCommStream_;
     //! Remote location of PME coordinate data buffer
-    void* remotePmeXBuffer_ = nullptr;
+    float3* remotePmeXBuffer_ = nullptr;
     //! Remote location of PME force data buffer
-    void* remotePmeFBuffer_ = nullptr;
+    float3* remotePmeFBuffer_ = nullptr;
     //! communicator for simulation
     MPI_Comm comm_;
     //! Rank of PME task
index 329284d4e4404e88feebf06a1d7636d8727c866c..9b6c5a9ce99494c7d3f6779e5cab8bec024d1c4c 100644 (file)
@@ -4,7 +4,7 @@
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -62,6 +62,7 @@ enum
     eCommType_SigmaB,
     eCommType_NR,
     eCommType_COORD,
+    eCommType_COORD_GPU,
     eCommType_CNB
 };
 
index 45fbace426dd5d59b788f85f29ace0233511cabc..e53d3c16c597eefc47527768dc427ee669a1986c 100644 (file)
@@ -209,14 +209,13 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md
             GMX_GPU_CUDA && useGpuForNonbonded && (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
     devFlags.enableGpuHaloExchange = GMX_GPU_CUDA && getenv("GMX_GPU_DD_COMMS") != nullptr;
     devFlags.forceGpuUpdateDefault = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr) || GMX_FAHCORE;
-    devFlags.enableGpuPmePPComm =
-            GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
+    devFlags.enableGpuPmePPComm = GMX_GPU_CUDA && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
 
 #pragma GCC diagnostic pop
 
     // Direct GPU comm path is being used with CUDA_AWARE_MPI
     // make sure underlying MPI implementation is CUDA-aware
-    if (!GMX_THREAD_MPI && devFlags.enableGpuHaloExchange)
+    if (!GMX_THREAD_MPI && (devFlags.enableGpuPmePPComm || devFlags.enableGpuHaloExchange))
     {
         const bool haveDetectedCudaAwareMpi =
                 (checkMpiCudaAwareSupport() == CudaAwareMpiStatus::Supported);
@@ -241,7 +240,9 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md
             devFlags.usingCudaAwareMpi = true;
             GMX_LOG(mdlog.warning)
                     .asParagraph()
-                    .appendTextFormatted("Using CUDA-aware MPI for 'GPU halo exchange' feature.");
+                    .appendTextFormatted(
+                            "Using CUDA-aware MPI for 'GPU halo exchange' or 'GPU PME-PP "
+                            "communications' feature.");
         }
         else
         {
@@ -255,6 +256,17 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md
                                 "detect CUDA_aware support in underlying MPI implementation.");
                 devFlags.enableGpuHaloExchange = false;
             }
+            if (devFlags.enableGpuPmePPComm)
+            {
+                GMX_LOG(mdlog.warning)
+                        .asParagraph()
+                        .appendText(
+                                "GMX_GPU_PME_PP_COMMS environment variable detected, but the "
+                                "'GPU PME-PP communications' feature will not be enabled as "
+                                "GROMACS couldn't "
+                                "detect CUDA_aware support in underlying MPI implementation.");
+                devFlags.enableGpuPmePPComm = false;
+            }
 
             GMX_LOG(mdlog.warning)
                     .asParagraph()
@@ -2037,6 +2049,7 @@ int Mdrunner::mdrunner()
                     walltime_accounting,
                     inputrec.get(),
                     pmeRunMode,
+                    runScheduleWork.simulationWork.useGpuPmePpCommunication,
                     deviceStreamManager.get());
     }
 