/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#ifndef GMX_PMECOORDINATERECEIVERGPU_H
#define GMX_PMECOORDINATERECEIVERGPU_H
+#include <memory>
+#include <tuple>
+
#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/math/vectypes.h"
-#include "gromacs/utility/classhelpers.h"
#include "gromacs/utility/gmxmpi.h"
class DeviceStream;
+class DeviceContext;
+
struct PpRanks;
namespace gmx
public:
/*! \brief Creates PME GPU coordinate receiver object
- * \param[in] pmeStream CUDA stream used for PME computations
+ *
+ * For multi-GPU runs, the PME GPU can receive coordinates from
+ * multiple PP GPUs. Data from these distinct communications can
+ * be handled separately in the PME spline/spread kernel, allowing
+ * pipelining which overlaps computation and communication. The
+ * class methods are designed to be called separately for each remote
+ * PP rank, and internally a different stream is used for each
+ * remote PP rank to allow overlapping (see the usage sketch below).
+ *
* \param[in] comm Communicator used for simulation
+ * \param[in] deviceContext GPU context
* \param[in] ppRanks List of PP ranks
*/
- PmeCoordinateReceiverGpu(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+ PmeCoordinateReceiverGpu(MPI_Comm comm, const DeviceContext& deviceContext, gmx::ArrayRef<PpRanks> ppRanks);
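+
+ /* A minimal usage sketch of the pipelined receive path (illustrative only:
+ * the calling loop, the spread launch, variable names such as comm,
+ * deviceContext, ppRanks, d_x, and the assumption of one pipeline stage per
+ * sender rank belong to the caller, not to this class):
+ *
+ *   PmeCoordinateReceiverGpu receiver(comm, deviceContext, ppRanks);
+ *   receiver.reinitCoordinateReceiver(d_x); // after each repartitioning
+ *   for (int stage = 0; stage < receiver.ppCommNumSenderRanks(); stage++)
+ *   {
+ *       const DeviceStream& senderStream = *receiver.ppCommStream(stage);
+ *       // lib-MPI: wait for arrival; thread-MPI: enqueue an event wait in senderStream
+ *       receiver.synchronizeOnCoordinatesFromPpRank(stage, senderStream);
+ *       auto [atomStart, atomEnd] = receiver.ppCommAtomRange(stage);
+ *       // launch spline/spread work for this stage's atom range in senderStream
+ *   }
+ */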
~PmeCoordinateReceiverGpu();
/*! \brief
+ * Re-initialize: set atom ranges and, for the thread-MPI case,
* send coordinates buffer address to PP rank
+ * This is required after repartitioning since atom ranges and
+ * buffer allocations may have changed.
* \param[in] d_x coordinates buffer in GPU memory
*/
- void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x);
+ void reinitCoordinateReceiver(DeviceBuffer<RVec> d_x);
+
+
+ /*! \brief
+ * For thread-MPI, receive the coordinate synchronizer pointer from a PP rank.
+ * \param[in] ppRank PP rank to receive the synchronizer from.
+ */
+ void receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank);
+
+ /*! \brief
+ * Used for lib-MPI: launch receive of coordinates from a PP rank
+ * (see the sketch below the declaration).
+ * \param[in] recvbuf coordinates buffer in GPU memory
+ * \param[in] numAtoms starting element (atom offset) in the buffer
+ * \param[in] numBytes number of bytes to transfer
+ * \param[in] ppRank PP rank that sends the data
+ */
+ void launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf, int numAtoms, int numBytes, int ppRank);
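+
+ /* A hedged sketch of the lib-MPI receive launches (illustrative; the loop,
+ * the offset bookkeeping, and the assumption that each PpRanks entry exposes
+ * a rank id and an atom count are the caller's, not part of this class;
+ * receiver, d_x and pmeStream are reused from the sketch above):
+ *
+ *   int atomOffset = 0;
+ *   for (const auto& sender : ppRanks)
+ *   {
+ *       receiver.launchReceiveCoordinatesFromPpCudaMpi(
+ *               d_x, atomOffset, sender.numAtoms * sizeof(RVec), sender.rankId);
+ *       atomOffset += sender.numAtoms;
+ *   }
+ *   receiver.synchronizeOnCoordinatesFromAllPpRanks(pmeStream);
+ */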
+
+ /*! \brief
+ * For lib-MPI, wait for coordinates from any PP rank.
+ * For thread-MPI, enqueue into the given stream a wait on the coordinate
+ * transfer event received from the PP rank determined by the pipeline stage.
+ * \param[in] pipelineStage stage of pipeline corresponding to this transfer
+ * \param[in] deviceStream stream in which to enqueue the wait event.
+ * \returns rank of sending PP task
+ */
+ int synchronizeOnCoordinatesFromPpRank(int pipelineStage, const DeviceStream& deviceStream);
+ /*! \brief Perform the above synchronizeOnCoordinatesFromPpRank() for all PP ranks,
+ * enqueueing all events into a single stream
+ * \param[in] deviceStream stream in which to enqueue the wait events.
+ */
+ void synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream);
+
+ /*! \brief
+ * Return pointer to stream associated with specific PP rank sender index
+ * \param[in] senderIndex Index of sender PP rank.
+ */
+ DeviceStream* ppCommStream(int senderIndex);
/*! \brief
- * launch receive of coordinate data from PP rank
- * \param[in] ppRank PP rank to send data
+ * Returns range of atoms involved in communication associated with specific
+ * PP rank sender index
+ * \param[in] senderIndex Index of sender PP rank.
*/
- void launchReceiveCoordinatesFromPpCudaDirect(int ppRank);
+ std::tuple<int, int> ppCommAtomRange(int senderIndex);
/*! \brief
- * enqueue wait for coordinate data from PP ranks
+ * Return number of PP ranks involved in PME-PP communication
*/
- void enqueueWaitReceiveCoordinatesFromPpCudaDirect();
+ int ppCommNumSenderRanks();
private:
class Impl;
- gmx::PrivateImplPointer<Impl> impl_;
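+ //! Pointer to the implementation (PIMPL). std::unique_ptr to the incomplete
+ //! Impl type works because the destructor is declared above and defined in
+ //! the source file, where Impl is a complete type.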
+ std::unique_ptr<Impl> impl_;
};
} // namespace gmx