Pipeline GPU PME Spline/Spread with PP Comms

author Alan Gray <alangray3@gmail.com>

Thu, 14 Oct 2021 13:29:40 +0000 (13:29 +0000)

committer Mark Abraham <mark.j.abraham@gmail.com>

Thu, 14 Oct 2021 13:29:40 +0000 (13:29 +0000)
author Alan Gray <alangray3@gmail.com>
Thu, 14 Oct 2021 13:29:40 +0000 (13:29 +0000)
committer Mark Abraham <mark.j.abraham@gmail.com>
Thu, 14 Oct 2021 13:29:40 +0000 (13:29 +0000)
diff --git a/docs/release-notes/2022/major/performance.rst b/docs/release-notes/2022/major/performance.rst

index 580ef631a373f50839e0344d252424e58a974a0f..247715ff239945f794d9eaabebac97acbb9baaf1 100644 (file)
--- a/docs/release-notes/2022/major/performance.rst
+++ b/docs/release-notes/2022/major/performance.rst
@@ -26,3 +26,19 @@ up to a factor of 3.
  
  :issue:`2875`
  :issue:`742`
  
  :issue:`2875`
  :issue:`742`
+
+       
+PME-PP GPU Direct Communication Pipelining
+""""""""""""""""""""""""""""""""""""""
+
+For multi-GPU runs with direct PME-PP GPU comunication enabled, the
+PME rank can now pipeline the coordinate transfers with computation in
+the PME Spread and Spline kernel (where the coordinates are
+consumed). The data from each transfer is handled seperately, allowing
+computation and communication to be overlapped. This is expected to
+have most benefit on systems where hardware communication interfaces
+are shared between multiple GPUs, e.g. PCIe within multi-GPU servers
+or Infiniband across multiple nodes.
+
+:issue:`3969`
+
diff --git a/src/gromacs/ewald/pme.cuh b/src/gromacs/ewald/pme.cuh

index af0e258ae98fff341a239bfa03446f9a44f67ff5..d166b22653ee91c67344563212a8f969e6a88dc7 100644 (file)
--- a/src/gromacs/ewald/pme.cuh
+++ b/src/gromacs/ewald/pme.cuh
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h

index 93145d4e1e9807b6aa92197e912eff175e9a2c6f..f8653e20dda06838fb2647cc432ca8f23eb8ae56 100644 (file)
--- a/src/gromacs/ewald/pme.h
+++ b/src/gromacs/ewald/pme.h
@@ -119,6 +119,7 @@ private:
      std::vector<std::string> reasons_;
  };
  
      std::vector<std::string> reasons_;
  };
  
+class PmeCoordinateReceiverGpu;
  } // namespace gmx
  
  enum
  } // namespace gmx
  
  enum
@@ -373,17 +374,24 @@ GPU_FUNC_QUALIFIER void pme_gpu_prepare_computation(gmx_pme_t*     GPU_FUNC_ARGU
  /*! \brief
   * Launches first stage of PME on GPU - spreading kernel.
   *
  /*! \brief
   * Launches first stage of PME on GPU - spreading kernel.
   *
- * \param[in] pme                The PME data structure.
- * \param[in] xReadyOnDevice     Event synchronizer indicating that the coordinates
- * are ready in the device memory; nullptr allowed only on separate PME ranks.
- * \param[in] wcycle             The wallclock counter.
- * \param[in] lambdaQ            The Coulomb lambda of the current state of the
- * system. Only used if FEP of Coulomb is active.
+ * \param[in] pme                            The PME data structure.
+ * \param[in] xReadyOnDevice                 Event synchronizer indicating that the coordinates
+ *                                           are ready in the device memory; nullptr allowed only
+ *                                           on separate PME ranks.
+ * \param[in] wcycle                         The wallclock counter.
+ * \param[in] lambdaQ                        The Coulomb lambda of the current state of the
+ *                                           system. Only used if FEP of Coulomb is active.
+ * \param[in] useGpuDirectComm               Whether direct GPU PME-PP communication is active
+ * \param[in]  pmeCoordinateReceiverGpu      Coordinate receiver object, which must be valid when
+ *                                           direct GPU PME-PP communication is active
   */
   */
-GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t*            GPU_FUNC_ARGUMENT(pme),
-                                              GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice),
-                                              gmx_wallcycle*        GPU_FUNC_ARGUMENT(wcycle),
-                                              real GPU_FUNC_ARGUMENT(lambdaQ)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(
+        gmx_pme_t*                     GPU_FUNC_ARGUMENT(pme),
+        GpuEventSynchronizer*          GPU_FUNC_ARGUMENT(xReadyOnDevice),
+        gmx_wallcycle*                 GPU_FUNC_ARGUMENT(wcycle),
+        real                           GPU_FUNC_ARGUMENT(lambdaQ),
+        const bool                     GPU_FUNC_ARGUMENT(useGpuDirectComm),
+        gmx::PmeCoordinateReceiverGpu* GPU_FUNC_ARGUMENT(pmeCoordinateReceiverGpu)) GPU_FUNC_TERM;
  
  /*! \brief
   * Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.
  
  /*! \brief
   * Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.
diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h

index 81f640df409a4110f8c71796e598114e06a2dac4..d285c1995439c76a0982dcc420c881c4f4b114f7 100644 (file)
--- a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h
+++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h
@@ -49,6 +49,8 @@
  #include "gromacs/utility/gmxmpi.h"
  
  class DeviceStream;
  #include "gromacs/utility/gmxmpi.h"
  
  class DeviceStream;
+class DeviceContext;
+
  struct PpRanks;
  
  namespace gmx
  struct PpRanks;
  
  namespace gmx
@@ -62,18 +64,30 @@ class PmeCoordinateReceiverGpu
  
  public:
      /*! \brief Creates PME GPU coordinate receiver object
  
  public:
      /*! \brief Creates PME GPU coordinate receiver object
-     * \param[in] pmeStream       CUDA stream used for PME computations
+     *
+     * For multi-GPU runs, the PME GPU can receive coordinates from
+     * multiple PP GPUs. Data from these distinct communications can
+     * be handled separately in the PME spline/spread kernel, allowing
+     * pipelining which overlaps computation and communication. The
+     * class methods are designed to called seperately for each remote
+     * PP rank, and internally a different stream is used for each
+     * remote PP rank to allow overlapping.
+     *
       * \param[in] comm            Communicator used for simulation
       * \param[in] comm            Communicator used for simulation
+     * \param[in] deviceContext   GPU context
       * \param[in] ppRanks         List of PP ranks
       */
       * \param[in] ppRanks         List of PP ranks
       */
-    PmeCoordinateReceiverGpu(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+    PmeCoordinateReceiverGpu(MPI_Comm comm, const DeviceContext& deviceContext, gmx::ArrayRef<PpRanks> ppRanks);
      ~PmeCoordinateReceiverGpu();
  
      /*! \brief
      ~PmeCoordinateReceiverGpu();
  
      /*! \brief
+     * Re-initialize: set atom ranges and, for thread-MPI case,
       * send coordinates buffer address to PP rank
       * send coordinates buffer address to PP rank
+     * This is required after repartitioning since atom ranges and
+     * buffer allocations may have changed.
       * \param[in] d_x   coordinates buffer in GPU memory
       */
       * \param[in] d_x   coordinates buffer in GPU memory
       */
-    void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x);
+    void reinitCoordinateReceiver(DeviceBuffer<RVec> d_x);
  
  
      /*! \brief
  
  
      /*! \brief
@@ -92,10 +106,37 @@ public:
      void launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf, int numAtoms, int numBytes, int ppRank);
  
      /*! \brief
      void launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf, int numAtoms, int numBytes, int ppRank);
  
      /*! \brief
-     * For lib MPI, wait for coordinates from PP ranks
-     * For thread MPI, enqueue PP co-ordinate transfer event into PME stream
+     * For lib MPI, wait for coordinates from any PP rank
+     * For thread MPI, enqueue PP co-ordinate transfer event received from PP
+     * rank determined from pipeline stage into given stream
+     * \param[in] pipelineStage  stage of pipeline corresponding to this transfer
+     * \param[in] deviceStream   stream in which to enqueue the wait event.
+     * \returns                  rank of sending PP task
+     */
+    int synchronizeOnCoordinatesFromPpRank(int pipelineStage, const DeviceStream& deviceStream);
+
+    /*! \brief Perform above synchronizeOnCoordinatesFromPpRanks for all PP ranks,
+     * enqueueing all events to a single stream
+     * \param[in] deviceStream   stream in which to enqueue the wait events.
+     */
+    void synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream);
+
+    /*! \brief
+     * Return pointer to stream associated with specific PP rank sender index
+     * \param[in] senderIndex    Index of sender PP rank.
+     */
+    DeviceStream* ppCommStream(int senderIndex);
+
+    /*! \brief
+     * Returns range of atoms involved in communication associated with specific PP rank sender
+     * index \param[in] senderIndex    Index of sender PP rank.
+     */
+    std::tuple<int, int> ppCommAtomRange(int senderIndex);
+
+    /*! \brief
+     * Return number of PP ranks involved in PME-PP communication
       */
       */
-    void synchronizeOnCoordinatesFromPpRanks();
+    int ppCommNumSenderRanks();
  
  private:
      class Impl;
  
  private:
      class Impl;
diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp

index 4e997d319bd42cd32e9d32879178a99f3ca4a5fb..9d727d14b696bbb9d69258ae101f77bae00f5079 100644 (file)
--- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp
+++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp
@@ -62,8 +62,8 @@ class PmeCoordinateReceiverGpu::Impl
  };
  
  /*!\brief Constructor stub. */
  };
  
  /*!\brief Constructor stub. */
-PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const DeviceStream& /* pmeStream */,
-                                                   MPI_Comm /* comm */,
+PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(MPI_Comm /* comm */,
+                                                   const DeviceContext& /* deviceContext */,
                                                     gmx::ArrayRef<PpRanks> /* ppRanks */) :
      impl_(nullptr)
  {
                                                     gmx::ArrayRef<PpRanks> /* ppRanks */) :
      impl_(nullptr)
  {
@@ -75,7 +75,7 @@ PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const DeviceStream& /* pmeStr
  PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default;
  
  /*!\brief init PME-PP GPU communication stub */
  PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default;
  
  /*!\brief init PME-PP GPU communication stub */
-void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> /* d_x */)
+void PmeCoordinateReceiverGpu::reinitCoordinateReceiver(DeviceBuffer<RVec> /* d_x */)
  {
      GMX_ASSERT(!impl_,
                 "A CPU stub for PME-PP GPU communication initialization was called instead of the "
  {
      GMX_ASSERT(!impl_,
                 "A CPU stub for PME-PP GPU communication initialization was called instead of the "
@@ -99,13 +99,47 @@ void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffe
                 "implementation.");
  }
  
                 "implementation.");
  }
  
-void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRanks()
+int PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRank(int /* pipelineStage */,
+                                                                 const DeviceStream& /* deviceStream */)
  {
      GMX_ASSERT(!impl_,
                 "A CPU stub for PME-PP GPU communication was called instead of the correct "
                 "implementation.");
  {
      GMX_ASSERT(!impl_,
                 "A CPU stub for PME-PP GPU communication was called instead of the correct "
                 "implementation.");
+    return 0;
  }
  
  }
  
+void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& /* deviceStream */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub for PME-PP GPU communication was called instead of the correct "
+               "implementation.");
+}
+
+DeviceStream* PmeCoordinateReceiverGpu::ppCommStream(int /* senderIndex */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub for PME-PP GPU communication was called instead of the correct "
+               "implementation.");
+    return nullptr;
+}
+
+std::tuple<int, int> PmeCoordinateReceiverGpu::ppCommAtomRange(int /* senderIndex */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub for PME-PP GPU communication was called instead of the correct "
+               "implementation.");
+    return std::make_tuple(0, 0);
+}
+
+int PmeCoordinateReceiverGpu::ppCommNumSenderRanks()
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub for PME-PP GPU communication was called instead of the correct "
+               "implementation.");
+    return 0;
+}
+
+
  } // namespace gmx
  
  #endif // !GMX_GPU_CUDA
  } // namespace gmx
  
  #endif // !GMX_GPU_CUDA
diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu

index 9de7c6676f53eea9b228e46a1562599b9d5001b9..5b1fa48adef94e3ae66b5269898606aaf3d899c3 100644 (file)
--- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu
+++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu
@@ -56,33 +56,42 @@
  namespace gmx
  {
  
  namespace gmx
  {
  
-PmeCoordinateReceiverGpu::Impl::Impl(const DeviceStream&    pmeStream,
-                                     MPI_Comm               comm,
-                                     gmx::ArrayRef<PpRanks> ppRanks) :
-    pmeStream_(pmeStream), comm_(comm), ppRanks_(ppRanks)
+PmeCoordinateReceiverGpu::Impl::Impl(MPI_Comm                     comm,
+                                     const DeviceContext&         deviceContext,
+                                     gmx::ArrayRef<const PpRanks> ppRanks) :
+    comm_(comm), requests_(ppRanks.size(), MPI_REQUEST_NULL), deviceContext_(deviceContext)
  {
  {
-    request_.resize(ppRanks.size());
-    ppSync_.resize(ppRanks.size());
+    // Create streams to manage pipelining
+    ppCommManagers_.reserve(ppRanks.size());
+    for (auto& ppRank : ppRanks)
+    {
+        ppCommManagers_.emplace_back(PpCommManager{
+                ppRank,
+                std::make_unique<DeviceStream>(deviceContext_, DeviceStreamPriority::High, false),
+                nullptr,
+                { 0, 0 } });
+    }
  }
  
  PmeCoordinateReceiverGpu::Impl::~Impl() = default;
  
  }
  
  PmeCoordinateReceiverGpu::Impl::~Impl() = default;
  
-void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x)
+void PmeCoordinateReceiverGpu::Impl::reinitCoordinateReceiver(DeviceBuffer<RVec> d_x)
  {
  {
-    // Need to send address to PP rank only for thread-MPI as PP rank pushes data using cudamemcpy
-    if (GMX_THREAD_MPI)
+    int indEnd = 0;
+    for (auto& ppCommManager : ppCommManagers_)
      {
      {
-        int ind_start = 0;
-        int ind_end   = 0;
-        for (const auto& receiver : ppRanks_)
-        {
-            ind_start = ind_end;
-            ind_end   = ind_start + receiver.numAtoms;
+        int indStart = indEnd;
+        indEnd       = indStart + ppCommManager.ppRank.numAtoms;
  
  
+        ppCommManager.atomRange = std::make_tuple(indStart, indEnd);
+
+        // Need to send address to PP rank only for thread-MPI as PP rank pushes data using cudamemcpy
+        if (GMX_THREAD_MPI)
+        {
              // Data will be transferred directly from GPU.
              // Data will be transferred directly from GPU.
-            void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start]);
+            void* sendBuf = reinterpret_cast<void*>(&d_x[indStart]);
  #if GMX_MPI
  #if GMX_MPI
-            MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_);
+            MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, ppCommManager.ppRank.rankId, 0, comm_);
  #else
              GMX_UNUSED_VALUE(sendBuf);
  #endif
  #else
              GMX_UNUSED_VALUE(sendBuf);
  #endif
@@ -102,8 +111,13 @@ void PmeCoordinateReceiverGpu::Impl::receiveCoordinatesSynchronizerFromPpCudaDir
  #if GMX_MPI
      // Receive event from PP task
      // NOLINTNEXTLINE(bugprone-sizeof-expression)
  #if GMX_MPI
      // Receive event from PP task
      // NOLINTNEXTLINE(bugprone-sizeof-expression)
-    MPI_Irecv(&ppSync_[recvCount_], sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_, &request_[recvCount_]);
-    recvCount_++;
+    MPI_Irecv(&ppCommManagers_[ppRank].sync,
+              sizeof(GpuEventSynchronizer*),
+              MPI_BYTE,
+              ppRank,
+              0,
+              comm_,
+              &(requests_[ppRank]));
  #else
      GMX_UNUSED_VALUE(ppRank);
  #endif
  #else
      GMX_UNUSED_VALUE(ppRank);
  #endif
@@ -119,7 +133,7 @@ void PmeCoordinateReceiverGpu::Impl::launchReceiveCoordinatesFromPpCudaMpi(Devic
                 "launchReceiveCoordinatesFromPpCudaMpi is expected to be called only for Lib-MPI");
  
  #if GMX_MPI
                 "launchReceiveCoordinatesFromPpCudaMpi is expected to be called only for Lib-MPI");
  
  #if GMX_MPI
-    MPI_Irecv(&recvbuf[numAtoms], numBytes, MPI_BYTE, ppRank, eCommType_COORD_GPU, comm_, &request_[recvCount_++]);
+    MPI_Irecv(&recvbuf[numAtoms], numBytes, MPI_BYTE, ppRank, eCommType_COORD_GPU, comm_, &(requests_[ppRank]));
  #else
      GMX_UNUSED_VALUE(recvbuf);
      GMX_UNUSED_VALUE(numAtoms);
  #else
      GMX_UNUSED_VALUE(recvbuf);
      GMX_UNUSED_VALUE(numAtoms);
@@ -128,42 +142,70 @@ void PmeCoordinateReceiverGpu::Impl::launchReceiveCoordinatesFromPpCudaMpi(Devic
  #endif
  }
  
  #endif
  }
  
-void PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromPpRanks()
+int PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromPpRank(int pipelineStage,
+                                                                       const DeviceStream& deviceStream)
  {
  {
-    if (recvCount_ > 0)
-    {
-        // ensure PME calculation doesn't commence until coordinate data/remote events
-        // has been transferred
  #if GMX_MPI
  #if GMX_MPI
-        MPI_Waitall(recvCount_, request_.data(), MPI_STATUS_IGNORE);
+    int senderRank = -1; // Rank of PP task that is associated with this invocation.
+#    if (!GMX_THREAD_MPI)
+    // Wait on data from any one of the PP sender GPUs
+    MPI_Waitany(requests_.size(), requests_.data(), &senderRank, MPI_STATUS_IGNORE);
+    GMX_ASSERT(senderRank >= 0, "Rank of sending PP task must be 0 or greater");
+    GMX_UNUSED_VALUE(pipelineStage);
+    GMX_UNUSED_VALUE(deviceStream);
+#    else
+    // MPI_Waitany is not available in thread-MPI. However, the
+    // MPI_Wait here is not associated with data but is host-side
+    // scheduling code to receive a CUDA event, and will be executed
+    // in advance of the actual data transfer. Therefore we can
+    // receive in order of pipeline stage, still allowing the
+    // scheduled GPU-direct comms to initiate out-of-order in their
+    // respective streams. For cases with CPU force computations, the
+    // scheduling is less asynchronous (done on a per-step basis), so
+    // host-side improvements should be investigated as tracked in
+    // issue #4047
+    senderRank = pipelineStage;
+    MPI_Wait(&(requests_[senderRank]), MPI_STATUS_IGNORE);
+    ppCommManagers_[senderRank].sync->enqueueWaitEvent(deviceStream);
+#    endif
+    return senderRank;
  #endif
  #endif
+}
  
  
-        // Make PME stream wait on PP to PME data trasnfer events
-        if (GMX_THREAD_MPI)
-        {
-            for (int i = 0; i < recvCount_; i++)
-            {
-                ppSync_[i]->enqueueWaitEvent(pmeStream_);
-            }
-        }
-
-        // reset receive counter
-        recvCount_ = 0;
+void PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream)
+{
+    for (int i = 0; i < static_cast<int>(ppCommManagers_.size()); i++)
+    {
+        synchronizeOnCoordinatesFromPpRank(i, deviceStream);
      }
  }
      }
  }
+DeviceStream* PmeCoordinateReceiverGpu::Impl::ppCommStream(int senderIndex)
+{
+    return ppCommManagers_[senderIndex].stream.get();
+}
  
  
-PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const DeviceStream&    pmeStream,
-                                                   MPI_Comm               comm,
+std::tuple<int, int> PmeCoordinateReceiverGpu::Impl::ppCommAtomRange(int senderIndex)
+{
+    return ppCommManagers_[senderIndex].atomRange;
+}
+
+int PmeCoordinateReceiverGpu::Impl::ppCommNumSenderRanks()
+{
+    return ppCommManagers_.size();
+}
+
+PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(MPI_Comm               comm,
+                                                   const DeviceContext&   deviceContext,
                                                     gmx::ArrayRef<PpRanks> ppRanks) :
                                                     gmx::ArrayRef<PpRanks> ppRanks) :
-    impl_(new Impl(pmeStream, comm, ppRanks))
+    impl_(new Impl(comm, deviceContext, ppRanks))
  {
  }
  
  PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default;
  
  {
  }
  
  PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default;
  
-void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x)
+void PmeCoordinateReceiverGpu::reinitCoordinateReceiver(DeviceBuffer<RVec> d_x)
  {
  {
-    impl_->sendCoordinateBufferAddressToPpRanks(d_x);
+    impl_->reinitCoordinateReceiver(d_x);
  }
  
  void PmeCoordinateReceiverGpu::receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank)
  }
  
  void PmeCoordinateReceiverGpu::receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank)
@@ -179,9 +221,31 @@ void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffe
      impl_->launchReceiveCoordinatesFromPpCudaMpi(recvbuf, numAtoms, numBytes, ppRank);
  }
  
      impl_->launchReceiveCoordinatesFromPpCudaMpi(recvbuf, numAtoms, numBytes, ppRank);
  }
  
-void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRanks()
+int PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRank(int                 senderIndex,
+                                                                 const DeviceStream& deviceStream)
  {
  {
-    impl_->synchronizeOnCoordinatesFromPpRanks();
+    return impl_->synchronizeOnCoordinatesFromPpRank(senderIndex, deviceStream);
  }
  
  }
  
+void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream)
+{
+    impl_->synchronizeOnCoordinatesFromAllPpRanks(deviceStream);
+}
+
+DeviceStream* PmeCoordinateReceiverGpu::ppCommStream(int senderIndex)
+{
+    return impl_->ppCommStream(senderIndex);
+}
+
+std::tuple<int, int> PmeCoordinateReceiverGpu::ppCommAtomRange(int senderIndex)
+{
+    return impl_->ppCommAtomRange(senderIndex);
+}
+
+int PmeCoordinateReceiverGpu::ppCommNumSenderRanks()
+{
+    return impl_->ppCommNumSenderRanks();
+}
+
+
  } // namespace gmx
  } // namespace gmx
diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h

index 604079c0b0fbcead4bebf03ae2a99eea132c5962..d268091771bab038053f1b3b4ad8031da141d880 100644 (file)
--- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h
+++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h
@@ -52,25 +52,41 @@ class GpuEventSynchronizer;
  
  namespace gmx
  {
  
  namespace gmx
  {
-/*! \internal \brief Class with interfaces and data for CUDA version of PME coordinate receiving functionality */
  
  
+/*! \brief Object to manage communications with a specific PP rank */
+struct PpCommManager
+{
+    //! Details of PP rank that may be updated after repartitioning
+    const PpRanks& ppRank;
+    //! Stream used communication with for PP rank
+    std::unique_ptr<DeviceStream> stream;
+    //! Synchronization event to receive from PP rank
+    GpuEventSynchronizer* sync = nullptr;
+    //! Range of atoms corresponding to PP rank
+    std::tuple<int, int> atomRange = { 0, 0 };
+};
+
+/*! \internal \brief Class with interfaces and data for CUDA version of PME coordinate receiving functionality */
  class PmeCoordinateReceiverGpu::Impl
  {
  
  public:
      /*! \brief Creates PME GPU coordinate receiver object
  class PmeCoordinateReceiverGpu::Impl
  {
  
  public:
      /*! \brief Creates PME GPU coordinate receiver object
-     * \param[in] pmeStream       CUDA stream used for PME computations
       * \param[in] comm            Communicator used for simulation
       * \param[in] comm            Communicator used for simulation
+     * \param[in] deviceContext   GPU context
       * \param[in] ppRanks         List of PP ranks
       */
       * \param[in] ppRanks         List of PP ranks
       */
-    Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+    Impl(MPI_Comm comm, const DeviceContext& deviceContext, gmx::ArrayRef<const PpRanks> ppRanks);
      ~Impl();
  
      /*! \brief
      ~Impl();
  
      /*! \brief
-     * send coordinates buffer address to PP rank
+     * Re-initialize: set atom ranges and, for thread-MPI case,
+     * send coordinates buffer address to PP rank.
+     * This is required after repartitioning since atom ranges and
+     * buffer allocations may have changed.
       * \param[in] d_x   coordinates buffer in GPU memory
       */
       * \param[in] d_x   coordinates buffer in GPU memory
       */
-    void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x);
+    void reinitCoordinateReceiver(DeviceBuffer<RVec> d_x);
  
      /*! \brief
       * Receive coordinate synchronizer pointer from the PP ranks.
  
      /*! \brief
       * Receive coordinate synchronizer pointer from the PP ranks.
@@ -88,24 +104,47 @@ public:
      void launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf, int numAtoms, int numBytes, int ppRank);
  
      /*! \brief
      void launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf, int numAtoms, int numBytes, int ppRank);
  
      /*! \brief
-     * For lib MPI, wait for coordinates from PP ranks
-     * For thread MPI, enqueue PP co-ordinate transfer event into PME stream
+     * For lib MPI, wait for coordinates from any PP rank
+     * For thread MPI, enqueue PP co-ordinate transfer event received from PP
+     * rank determined from pipeline stage into given stream
+     * \param[in] pipelineStage  stage of pipeline corresponding to this transfer
+     * \param[in] deviceStream   stream in which to enqueue the wait event.
+     * \returns                  rank of sending PP task
+     */
+    int synchronizeOnCoordinatesFromPpRank(int pipelineStage, const DeviceStream& deviceStream);
+
+    /*! \brief Perform above synchronizeOnCoordinatesFromPpRanks for all PP ranks,
+     * enqueueing all events to a single stream
+     * \param[in] deviceStream   stream in which to enqueue the wait events.
+     */
+    void synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream);
+
+    /*! \brief
+     * Return pointer to stream associated with specific PP rank sender index
+     * \param[in] senderIndex    Index of sender PP rank.
+     */
+    DeviceStream* ppCommStream(int senderIndex);
+
+    /*! \brief
+     * Returns range of atoms involved in communication associated with specific PP rank sender
+     * index \param[in] senderIndex    Index of sender PP rank.
+     */
+    std::tuple<int, int> ppCommAtomRange(int senderIndex);
+
+    /*! \brief
+     * Return number of PP ranks involved in PME-PP communication
       */
       */
-    void synchronizeOnCoordinatesFromPpRanks();
+    int ppCommNumSenderRanks();
  
  private:
  
  private:
-    //! CUDA stream for PME operations
-    const DeviceStream& pmeStream_;
      //! communicator for simulation
      MPI_Comm comm_;
      //! communicator for simulation
      MPI_Comm comm_;
-    //! list of PP ranks
-    gmx::ArrayRef<PpRanks> ppRanks_;
-    //! vector of MPI requests
-    std::vector<MPI_Request> request_;
-    //! vector of synchronization events to receive from PP tasks
-    std::vector<GpuEventSynchronizer*> ppSync_;
-    //! counter of messages to receive
-    int recvCount_ = 0;
+    //! MPI requests, one per PP rank
+    std::vector<MPI_Request> requests_;
+    //! GPU context handle (not used in CUDA)
+    const DeviceContext& deviceContext_;
+    //! Communication manager objects corresponding to multiple sending PP ranks
+    std::vector<PpCommManager> ppCommManagers_;
  };
  
  } // namespace gmx
  };
  
  } // namespace gmx
diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp

index 084bb3f6cc53f81355e63e7e225006b25304601c..680fcec0fc24d809eb17dca3f251b5e30a387aed 100644 (file)
--- a/src/gromacs/ewald/pme_gpu.cpp
+++ b/src/gromacs/ewald/pme_gpu.cpp
@@ -59,6 +59,7 @@
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/stringutil.h"
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/stringutil.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
  
  #include "pme_gpu_internal.h"
  #include "pme_gpu_settings.h"
  
  #include "pme_gpu_internal.h"
  #include "pme_gpu_settings.h"
@@ -189,10 +190,12 @@ void pme_gpu_prepare_computation(gmx_pme_t*               pme,
      }
  }
  
      }
  }
  
-void pme_gpu_launch_spread(gmx_pme_t*            pme,
-                           GpuEventSynchronizer* xReadyOnDevice,
-                           gmx_wallcycle*        wcycle,
-                           const real            lambdaQ)
+void pme_gpu_launch_spread(gmx_pme_t*                     pme,
+                           GpuEventSynchronizer*          xReadyOnDevice,
+                           gmx_wallcycle*                 wcycle,
+                           const real                     lambdaQ,
+                           const bool                     useGpuDirectComm,
+                           gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu)
  {
      GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
      GMX_ASSERT(!GMX_GPU_CUDA || xReadyOnDevice || !pme->bPPnode,
  {
      GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
      GMX_ASSERT(!GMX_GPU_CUDA || xReadyOnDevice || !pme->bPPnode,
@@ -215,7 +218,8 @@ void pme_gpu_launch_spread(gmx_pme_t*            pme,
      const bool spreadCharges  = true;
      wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
      wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
      const bool spreadCharges  = true;
      wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
      wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
-    pme_gpu_spread(pmeGpu, xReadyOnDevice, fftgrids, computeSplines, spreadCharges, lambdaQ);
+    pme_gpu_spread(
+            pmeGpu, xReadyOnDevice, fftgrids, computeSplines, spreadCharges, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu);
      wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
      wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
  }
      wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
      wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
  }
diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp

index 96dc1f4db95a1b1953fca13a89f6c664a50e5e54..2f7da67a3a0524ec5a60e4761c7cbd361b733e0d 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -75,6 +75,7 @@
  #include "gromacs/utility/logger.h"
  #include "gromacs/utility/stringutil.h"
  #include "gromacs/ewald/pme.h"
  #include "gromacs/utility/logger.h"
  #include "gromacs/utility/stringutil.h"
  #include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
  
  #if GMX_GPU_CUDA
  #    include "pme.cuh"
  
  #if GMX_GPU_CUDA
  #    include "pme.cuh"
@@ -593,7 +594,10 @@ static void pme_gpu_init_internal(PmeGpu* pmeGpu, const DeviceContext& deviceCon
       */
  
  #if GMX_GPU_CUDA
       */
  
  #if GMX_GPU_CUDA
-    pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0];
+    pmeGpu->kernelParams->usePipeline       = false;
+    pmeGpu->kernelParams->pipelineAtomStart = 0;
+    pmeGpu->kernelParams->pipelineAtomEnd   = 0;
+    pmeGpu->maxGridWidthX                   = deviceContext.deviceInfo().prop.maxGridSize[0];
  #else
      // Use this path for any non-CUDA GPU acceleration
      // TODO: is there no really global work size limit in OpenCL?
  #else
      // Use this path for any non-CUDA GPU acceleration
      // TODO: is there no really global work size limit in OpenCL?
@@ -1276,12 +1280,14 @@ static auto selectSpreadKernelPtr(const PmeGpu*  pmeGpu,
      return kernelPtr;
  }
  
      return kernelPtr;
  }
  
-void pme_gpu_spread(const PmeGpu*         pmeGpu,
-                    GpuEventSynchronizer* xReadyOnDevice,
-                    real**                h_grids,
-                    bool                  computeSplines,
-                    bool                  spreadCharges,
-                    const real            lambda)
+void pme_gpu_spread(const PmeGpu*                  pmeGpu,
+                    GpuEventSynchronizer*          xReadyOnDevice,
+                    real**                         h_grids,
+                    bool                           computeSplines,
+                    bool                           spreadCharges,
+                    const real                     lambda,
+                    const bool                     useGpuDirectComm,
+                    gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu)
  {
      GMX_ASSERT(
              pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
  {
      GMX_ASSERT(
              pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
@@ -1350,6 +1356,7 @@ void pme_gpu_spread(const PmeGpu*         pmeGpu,
  
      PmeStage                           timingId;
      PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
  
      PmeStage                           timingId;
      PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
+    const bool writeGlobalOrSaveSplines          = writeGlobal || (!recalculateSplines);
      if (computeSplines)
      {
          if (spreadCharges)
      if (computeSplines)
      {
          if (spreadCharges)
@@ -1357,7 +1364,7 @@ void pme_gpu_spread(const PmeGpu*         pmeGpu,
              timingId  = PmeStage::SplineAndSpread;
              kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu,
                                                         pmeGpu->settings.threadsPerAtom,
              timingId  = PmeStage::SplineAndSpread;
              kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu,
                                                         pmeGpu->settings.threadsPerAtom,
-                                                       writeGlobal || (!recalculateSplines),
+                                                       writeGlobalOrSaveSplines,
                                                         pmeGpu->common->ngrids);
          }
          else
                                                         pmeGpu->common->ngrids);
          }
          else
@@ -1365,43 +1372,116 @@ void pme_gpu_spread(const PmeGpu*         pmeGpu,
              timingId  = PmeStage::Spline;
              kernelPtr = selectSplineKernelPtr(pmeGpu,
                                                pmeGpu->settings.threadsPerAtom,
              timingId  = PmeStage::Spline;
              kernelPtr = selectSplineKernelPtr(pmeGpu,
                                                pmeGpu->settings.threadsPerAtom,
-                                              writeGlobal || (!recalculateSplines),
+                                              writeGlobalOrSaveSplines,
                                                pmeGpu->common->ngrids);
          }
      }
      else
      {
          timingId  = PmeStage::Spread;
                                                pmeGpu->common->ngrids);
          }
      }
      else
      {
          timingId  = PmeStage::Spread;
-        kernelPtr = selectSpreadKernelPtr(pmeGpu,
-                                          pmeGpu->settings.threadsPerAtom,
-                                          writeGlobal || (!recalculateSplines),
-                                          pmeGpu->common->ngrids);
+        kernelPtr = selectSpreadKernelPtr(
+                pmeGpu, pmeGpu->settings.threadsPerAtom, writeGlobalOrSaveSplines, pmeGpu->common->ngrids);
      }
  
  
      pme_gpu_start_timing(pmeGpu, timingId);
      auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
      }
  
  
      pme_gpu_start_timing(pmeGpu, timingId);
      auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
+
+    kernelParamsPtr->usePipeline = computeSplines && spreadCharges && useGpuDirectComm
+                                   && (pmeCoordinateReceiverGpu->ppCommNumSenderRanks() > 1)
+                                   && !writeGlobalOrSaveSplines;
+    if (kernelParamsPtr->usePipeline)
+    {
+        int numStagesInPipeline = pmeCoordinateReceiverGpu->ppCommNumSenderRanks();
+
+        for (int i = 0; i < numStagesInPipeline; i++)
+        {
+            int senderRank;
+            if (useGpuDirectComm)
+            {
+                senderRank = pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromPpRank(
+                        i, *(pmeCoordinateReceiverGpu->ppCommStream(i)));
+            }
+            else
+            {
+                senderRank = i;
+            }
+
+            // set kernel configuration options specific to this stage of the pipeline
+            std::tie(kernelParamsPtr->pipelineAtomStart, kernelParamsPtr->pipelineAtomEnd) =
+                    pmeCoordinateReceiverGpu->ppCommAtomRange(senderRank);
+            const int blockCount       = static_cast<int>(std::ceil(
+                    static_cast<float>(kernelParamsPtr->pipelineAtomEnd - kernelParamsPtr->pipelineAtomStart)
+                    / atomsPerBlock));
+            auto      dimGrid          = pmeGpuCreateGrid(pmeGpu, blockCount);
+            config.gridSize[0]         = dimGrid.first;
+            config.gridSize[1]         = dimGrid.second;
+            DeviceStream* launchStream = pmeCoordinateReceiverGpu->ppCommStream(senderRank);
+
+
  #if c_canEmbedBuffers
  #if c_canEmbedBuffers
-    const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+            const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
  #else
  #else
-    const auto kernelArgs =
-            prepareGpuKernelArguments(kernelPtr,
-                                      config,
-                                      kernelParamsPtr,
-                                      &kernelParamsPtr->atoms.d_theta,
-                                      &kernelParamsPtr->atoms.d_dtheta,
-                                      &kernelParamsPtr->atoms.d_gridlineIndices,
-                                      &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
-                                      &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
-                                      &kernelParamsPtr->grid.d_fractShiftsTable,
-                                      &kernelParamsPtr->grid.d_gridlineIndicesTable,
-                                      &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
-                                      &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
-                                      &kernelParamsPtr->atoms.d_coordinates);
+            const auto kernelArgs =
+                    prepareGpuKernelArguments(kernelPtr,
+                                              config,
+                                              kernelParamsPtr,
+                                              &kernelParamsPtr->atoms.d_theta,
+                                              &kernelParamsPtr->atoms.d_dtheta,
+                                              &kernelParamsPtr->atoms.d_gridlineIndices,
+                                              &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
+                                              &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+                                              &kernelParamsPtr->grid.d_fractShiftsTable,
+                                              &kernelParamsPtr->grid.d_gridlineIndicesTable,
+                                              &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+                                              &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+                                              &kernelParamsPtr->atoms.d_coordinates);
  #endif
  
  #endif
  
-    launchGpuKernel(
-            kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME spline/spread", kernelArgs);
+            launchGpuKernel(kernelPtr, config, *launchStream, timingEvent, "PME spline/spread", kernelArgs);
+        }
+        // Set dependencies for PME stream on all pipeline streams
+        for (int i = 0; i < pmeCoordinateReceiverGpu->ppCommNumSenderRanks(); i++)
+        {
+            GpuEventSynchronizer event;
+            event.markEvent(*(pmeCoordinateReceiverGpu->ppCommStream(i)));
+            event.enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_);
+        }
+    }
+    else // pipelining is not in use
+    {
+        if (useGpuDirectComm) // Sync all PME-PP communications to PME stream
+        {
+            pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromAllPpRanks(pmeGpu->archSpecific->pmeStream_);
+        }
+
+#if c_canEmbedBuffers
+        const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+#else
+        const auto kernelArgs =
+                prepareGpuKernelArguments(kernelPtr,
+                                          config,
+                                          kernelParamsPtr,
+                                          &kernelParamsPtr->atoms.d_theta,
+                                          &kernelParamsPtr->atoms.d_dtheta,
+                                          &kernelParamsPtr->atoms.d_gridlineIndices,
+                                          &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
+                                          &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+                                          &kernelParamsPtr->grid.d_fractShiftsTable,
+                                          &kernelParamsPtr->grid.d_gridlineIndicesTable,
+                                          &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+                                          &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+                                          &kernelParamsPtr->atoms.d_coordinates);
+#endif
+
+        launchGpuKernel(kernelPtr,
+                        config,
+                        pmeGpu->archSpecific->pmeStream_,
+                        timingEvent,
+                        "PME spline/spread",
+                        kernelArgs);
+    }
+
      pme_gpu_stop_timing(pmeGpu, timingId);
  
      const auto& settings    = pmeGpu->settings;
      pme_gpu_stop_timing(pmeGpu, timingId);
  
      const auto& settings    = pmeGpu->settings;
diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h

index 7baa6bd3475eb1a79ff5e7f498c0554a7eb5f735..0a6ee2a5d29a1b901e85bf828421c9c8f048d92a 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_internal.h
+++ b/src/gromacs/ewald/pme_gpu_internal.h
@@ -340,21 +340,30 @@ void pme_gpu_destroy_3dfft(const PmeGpu* pmeGpu);
  /*! \libinternal \brief
   * A GPU spline computation and charge spreading function.
   *
  /*! \libinternal \brief
   * A GPU spline computation and charge spreading function.
   *
- * \param[in]  pmeGpu                 The PME GPU structure.
- * \param[in]  xReadyOnDevice         Event synchronizer indicating that the coordinates are ready in the device memory;
- *                                    can be nullptr when invoked on a separate PME rank or from PME tests.
- * \param[out] h_grids                The host-side grid buffers (used only if the result of the spread is expected on the host,
- *                                    e.g. testing or host-side FFT)
- * \param[in]  computeSplines         Should the computation of spline parameters and gridline indices be performed.
- * \param[in]  spreadCharges          Should the charges/coefficients be spread on the grid.
- * \param[in]  lambda                 The lambda value of the current system state.
- */
-GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu*         GPU_FUNC_ARGUMENT(pmeGpu),
-                                       GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice),
-                                       float**               GPU_FUNC_ARGUMENT(h_grids),
-                                       bool                  GPU_FUNC_ARGUMENT(computeSplines),
-                                       bool                  GPU_FUNC_ARGUMENT(spreadCharges),
-                                       real GPU_FUNC_ARGUMENT(lambda)) GPU_FUNC_TERM;
+ * \param[in]  pmeGpu                    The PME GPU structure.
+ * \param[in]  xReadyOnDevice            Event synchronizer indicating that the coordinates are
+ *                                       ready in the device memory; can be nullptr when invoked
+ *                                       on a separate PME rank or from PME tests.
+ * \param[out] h_grids                   The host-side grid buffers (used only if the result
+ *                                       of the spread is expected on the host, e.g. testing
+ *                                       or host-side FFT)
+ * \param[in]  computeSplines            Should the computation of spline parameters and gridline
+ *                                       indices be performed.
+ * \param[in]  spreadCharges             Should the charges/coefficients be spread on the grid.
+ * \param[in]  lambda                    The lambda value of the current system state.
+ * \param[in]  useGpuDirectComm          Whether direct GPU PME-PP communication is active
+ * \param[in]  pmeCoordinateReceiverGpu  Coordinate receiver object, which must be valid when
+ *                                       direct GPU PME-PP communication is active
+ */
+GPU_FUNC_QUALIFIER void
+pme_gpu_spread(const PmeGpu*                  GPU_FUNC_ARGUMENT(pmeGpu),
+               GpuEventSynchronizer*          GPU_FUNC_ARGUMENT(xReadyOnDevice),
+               float**                        GPU_FUNC_ARGUMENT(h_grids),
+               bool                           GPU_FUNC_ARGUMENT(computeSplines),
+               bool                           GPU_FUNC_ARGUMENT(spreadCharges),
+               real                           GPU_FUNC_ARGUMENT(lambda),
+               const bool                     GPU_FUNC_ARGUMENT(useGpuDirectComm),
+               gmx::PmeCoordinateReceiverGpu* GPU_FUNC_ARGUMENT(pmeCoordinateReceiverGpu)) GPU_FUNC_TERM;
  
  /*! \libinternal \brief
   * 3D FFT R2C/C2R routine.
  
  /*! \libinternal \brief
   * 3D FFT R2C/C2R routine.
diff --git a/src/gromacs/ewald/pme_gpu_types.h b/src/gromacs/ewald/pme_gpu_types.h

index e2c067390ad2c82f28d514e1951ba00d0a3595d9..76e111e7656dee2425874980843a6671d559fa89 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_types.h
+++ b/src/gromacs/ewald/pme_gpu_types.h
@@ -224,6 +224,15 @@ struct PmeGpuKernelParamsBase
       * before launching spreading.
       */
      struct PmeGpuDynamicParams current;
       * before launching spreading.
       */
      struct PmeGpuDynamicParams current;
+
+    /*! \brief Whether pipelining with PP communications is active
+     * char rather than bool to avoid problem with OpenCL compiler */
+    char usePipeline;
+    /*! \brief Start atom for this stage of pipeline */
+    int pipelineAtomStart;
+    /*! \brief End atom for this stage of pipeline */
+    int pipelineAtomEnd;
+
      /* These texture objects are only used in CUDA and are related to the grid size. */
      /*! \brief Texture object for accessing grid.d_fractShiftsTable */
      HIDE_FROM_OPENCL_COMPILER(DeviceTexture) fractShiftsTableTexture;
      /* These texture objects are only used in CUDA and are related to the grid size. */
      /*! \brief Texture object for accessing grid.d_fractShiftsTable */
      HIDE_FROM_OPENCL_COMPILER(DeviceTexture) fractShiftsTableTexture;
diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp

index 3151ec1ca719e906edeae6acc140d6a03f73b51a..56002966be057185ae0bf3924073cd6962b43759 100644 (file)
--- a/src/gromacs/ewald/pme_only.cpp
+++ b/src/gromacs/ewald/pme_only.cpp
@@ -218,6 +218,9 @@ static gmx_pme_t* gmx_pmeonly_switch(std::vector<gmx_pme_t*>* pmedata,
  }
  
  /*! \brief Called by PME-only ranks to receive coefficients and coordinates
  }
  
  /*! \brief Called by PME-only ranks to receive coefficients and coordinates
+ *
+ * Note that with GPU direct communication the transfer is only initiated, it is the responsibility
+ * of the caller to synchronize prior to launching spread.
   *
   * \param[in] pme                     PME data structure.
   * \param[in,out] pme_pp              PME-PP communication structure.
   *
   * \param[in] pme                     PME data structure.
   * \param[in,out] pme_pp              PME-PP communication structure.
@@ -438,9 +441,8 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
                                 "GPU Direct PME-PP communication has been enabled, "
                                 "but PME run mode is not PmeRunMode::GPU\n");
  
                                 "GPU Direct PME-PP communication has been enabled, "
                                 "but PME run mode is not PmeRunMode::GPU\n");
  
-                    // This rank will have its data accessed directly by PP rank, so needs to send the remote addresses.
-                    pme_pp->pmeCoordinateReceiverGpu->sendCoordinateBufferAddressToPpRanks(
-                            stateGpu->getCoordinates());
+                    // This rank will have its data accessed directly by PP rank, so needs to send the remote addresses and re-set atom ranges associated with transfers.
+                    pme_pp->pmeCoordinateReceiverGpu->reinitCoordinateReceiver(stateGpu->getCoordinates());
                      pme_pp->pmeForceSenderGpu->setForceSendBuffer(pme_gpu_get_device_f(pme));
                  }
              }
                      pme_pp->pmeForceSenderGpu->setForceSendBuffer(pme_gpu_get_device_f(pme));
                  }
              }
@@ -495,11 +497,6 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
                  }
              }
  
                  }
              }
  
-            if (pme_pp->useGpuDirectComm)
-            {
-                pme_pp->pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromPpRanks();
-            }
-
              status = pmerecvqxX;
          }
  
              status = pmerecvqxX;
          }
  
@@ -673,9 +670,7 @@ int gmx_pmeonly(struct gmx_pme_t*               pme,
          if (useGpuPmePpCommunication)
          {
              pme_pp->pmeCoordinateReceiverGpu = std::make_unique<gmx::PmeCoordinateReceiverGpu>(
          if (useGpuPmePpCommunication)
          {
              pme_pp->pmeCoordinateReceiverGpu = std::make_unique<gmx::PmeCoordinateReceiverGpu>(
-                    deviceStreamManager->stream(gmx::DeviceStreamType::Pme),
-                    pme_pp->mpi_comm_mysim,
-                    pme_pp->ppRanks);
+                    pme_pp->mpi_comm_mysim, deviceStreamManager->context(), pme_pp->ppRanks);
              pme_pp->pmeForceSenderGpu =
                      std::make_unique<gmx::PmeForceSenderGpu>(pme_gpu_get_f_ready_synchronizer(pme),
                                                               pme_pp->mpi_comm_mysim,
              pme_pp->pmeForceSenderGpu =
                      std::make_unique<gmx::PmeForceSenderGpu>(pme_gpu_get_f_ready_synchronizer(pme),
                                                               pme_pp->mpi_comm_mysim,
@@ -775,7 +770,12 @@ int gmx_pmeonly(struct gmx_pme_t*               pme,
              // TODO: with pme on GPU the receive should make a list of synchronizers and pass it here #3157
              auto xReadyOnDevice = nullptr;
  
              // TODO: with pme on GPU the receive should make a list of synchronizers and pass it here #3157
              auto xReadyOnDevice = nullptr;
  
-            pme_gpu_launch_spread(pme, xReadyOnDevice, wcycle, lambda_q);
+            pme_gpu_launch_spread(pme,
+                                  xReadyOnDevice,
+                                  wcycle,
+                                  lambda_q,
+                                  pme_pp->useGpuDirectComm,
+                                  pme_pp->pmeCoordinateReceiverGpu.get());
              pme_gpu_launch_complex_transforms(pme, wcycle, stepWork);
              pme_gpu_launch_gather(pme, wcycle, lambda_q);
              output = pme_gpu_wait_finish_task(pme, computeEnergyAndVirial, lambda_q, wcycle);
              pme_gpu_launch_complex_transforms(pme, wcycle, stepWork);
              pme_gpu_launch_gather(pme, wcycle, lambda_q);
              output = pme_gpu_wait_finish_task(pme, computeEnergyAndVirial, lambda_q, wcycle);
diff --git a/src/gromacs/ewald/pme_spread.cu b/src/gromacs/ewald/pme_spread.cu

index d0856602a9e4574e0081a8231f8ec3073e6d2d12..f5ba3451817b105856e4a4b7e803e5425b8c027e 100644 (file)
--- a/src/gromacs/ewald/pme_spread.cu
+++ b/src/gromacs/ewald/pme_spread.cu
@@ -200,7 +200,7 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU
      float  atomCharge;
  
      const int blockIndex      = blockIdx.y * gridDim.x + blockIdx.x;
      float  atomCharge;
  
      const int blockIndex      = blockIdx.y * gridDim.x + blockIdx.x;
-    const int atomIndexOffset = blockIndex * atomsPerBlock;
+    const int atomIndexOffset = blockIndex * atomsPerBlock + kernelParams.pipelineAtomStart;
  
      /* Thread index w.r.t. block */
      const int threadLocalId =
  
      /* Thread index w.r.t. block */
      const int threadLocalId =
@@ -225,8 +225,8 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU
      /* Charges, required for both spline and spread */
      if (c_useAtomDataPrefetch)
      {
      /* Charges, required for both spline and spread */
      if (c_useAtomDataPrefetch)
      {
-        pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients,
-                                                         kernelParams.atoms.d_coefficients[0]);
+        pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(
+                sm_coefficients, &kernelParams.atoms.d_coefficients[0][kernelParams.pipelineAtomStart]);
          __syncthreads();
          atomCharge = sm_coefficients[atomIndexLocal];
      }
          __syncthreads();
          atomCharge = sm_coefficients[atomIndexLocal];
      }
@@ -237,7 +237,8 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU
  
      if (computeSplines)
      {
  
      if (computeSplines)
      {
-        const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates);
+        const float3* __restrict__ gm_coordinates =
+                asFloat3(&kernelParams.atoms.d_coordinates[kernelParams.pipelineAtomStart]);
          if (c_useAtomDataPrefetch)
          {
              // Coordinates
          if (c_useAtomDataPrefetch)
          {
              // Coordinates
@@ -274,8 +275,12 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU
      /* Spreading */
      if (spreadCharges)
      {
      /* Spreading */
      if (spreadCharges)
      {
-        spread_charges<order, wrapX, wrapY, 0, threadsPerAtom>(
-                kernelParams, &atomCharge, sm_gridlineIndices, sm_theta);
+
+        if (!kernelParams.usePipeline || (atomIndexGlobal < kernelParams.pipelineAtomEnd))
+        {
+            spread_charges<order, wrapX, wrapY, 0, threadsPerAtom>(
+                    kernelParams, &atomCharge, sm_gridlineIndices, sm_theta);
+        }
      }
      if (numGrids == 2)
      {
      }
      if (numGrids == 2)
      {
@@ -293,8 +298,11 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU
          }
          if (spreadCharges)
          {
          }
          if (spreadCharges)
          {
-            spread_charges<order, wrapX, wrapY, 1, threadsPerAtom>(
-                    kernelParams, &atomCharge, sm_gridlineIndices, sm_theta);
+            if (!kernelParams.usePipeline || (atomIndexGlobal < kernelParams.pipelineAtomEnd))
+            {
+                spread_charges<order, wrapX, wrapY, 1, threadsPerAtom>(
+                        kernelParams, &atomCharge, sm_gridlineIndices, sm_theta);
+            }
          }
      }
  }
          }
      }
  }
diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp

index a3c4409fca0ac70314de02aa2217d452999f8d2b..0016cd80241e43f304af1595ddd7fab1b694d799 100644 (file)
--- a/src/gromacs/ewald/tests/pmetestcommon.cpp
+++ b/src/gromacs/ewald/tests/pmetestcommon.cpp
@@ -70,6 +70,7 @@
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/logger.h"
  #include "gromacs/utility/stringutil.h"
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/logger.h"
  #include "gromacs/utility/stringutil.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
  
  #include "testutils/test_hardware_environment.h"
  #include "testutils/testasserts.h"
  
  #include "testutils/test_hardware_environment.h"
  #include "testutils/testasserts.h"
@@ -342,7 +343,18 @@ void pmePerformSplineAndSpread(gmx_pme_t* pme,
              const real lambdaQ = 1.0;
              // no synchronization needed as x is transferred in the PME stream
              GpuEventSynchronizer* xReadyOnDevice = nullptr;
              const real lambdaQ = 1.0;
              // no synchronization needed as x is transferred in the PME stream
              GpuEventSynchronizer* xReadyOnDevice = nullptr;
-            pme_gpu_spread(pme->gpu, xReadyOnDevice, fftgrid, computeSplines, spreadCharges, lambdaQ);
+
+            bool                           useGpuDirectComm         = false;
+            gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr;
+
+            pme_gpu_spread(pme->gpu,
+                           xReadyOnDevice,
+                           fftgrid,
+                           computeSplines,
+                           spreadCharges,
+                           lambdaQ,
+                           useGpuDirectComm,
+                           pmeCoordinateReceiverGpu);
          }
          break;
  #endif
          }
          break;
  #endif
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index 241cc3b9fc3036f0267a7439981ce462c65aa328..a2ccaed47b15b75440558e9d7ad4f433f519a765 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -54,6 +54,7 @@
  #include "gromacs/domdec/partition.h"
  #include "gromacs/essentialdynamics/edsam.h"
  #include "gromacs/ewald/pme.h"
  #include "gromacs/domdec/partition.h"
  #include "gromacs/essentialdynamics/edsam.h"
  #include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
  #include "gromacs/ewald/pme_pp.h"
  #include "gromacs/ewald/pme_pp_comm_gpu.h"
  #include "gromacs/gmxlib/network.h"
  #include "gromacs/ewald/pme_pp.h"
  #include "gromacs/ewald/pme_pp_comm_gpu.h"
  #include "gromacs/gmxlib/network.h"
@@ -746,7 +747,10 @@ static inline void launchPmeGpuSpread(gmx_pme_t*            pmedata,
                                        gmx_wallcycle*        wcycle)
  {
      pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
                                        gmx_wallcycle*        wcycle)
  {
      pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
-    pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle, lambdaQ);
+    bool                           useGpuDirectComm         = false;
+    gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr;
+    pme_gpu_launch_spread(
+            pmedata, xReadyOnDevice, wcycle, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu);
  }
  
  /*! \brief Launch the FFT and gather stages of PME GPU
  }
  
  /*! \brief Launch the FFT and gather stages of PME GPU
author	Alan Gray <alangray3@gmail.com>
	Thu, 14 Oct 2021 13:29:40 +0000 (13:29 +0000)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Thu, 14 Oct 2021 13:29:40 +0000 (13:29 +0000)
docs/release-notes/2022/major/performance.rst		patch \| blob \| history
src/gromacs/ewald/pme.cuh		patch \| blob \| history
src/gromacs/ewald/pme.h		patch \| blob \| history
src/gromacs/ewald/pme_coordinate_receiver_gpu.h		patch \| blob \| history
src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp		patch \| blob \| history
src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu		patch \| blob \| history
src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h		patch \| blob \| history
src/gromacs/ewald/pme_gpu.cpp		patch \| blob \| history
src/gromacs/ewald/pme_gpu_internal.cpp		patch \| blob \| history
src/gromacs/ewald/pme_gpu_internal.h		patch \| blob \| history
src/gromacs/ewald/pme_gpu_types.h		patch \| blob \| history
src/gromacs/ewald/pme_only.cpp		patch \| blob \| history
src/gromacs/ewald/pme_spread.cu		patch \| blob \| history
src/gromacs/ewald/tests/pmetestcommon.cpp		patch \| blob \| history
src/gromacs/mdlib/sim_util.cpp		patch \| blob \| history