Remove thread-MPI limitation for GPU direct PME-PP communication

[alexxy/gromacs.git] / src / gromacs / ewald / pme_force_sender_gpu_impl.h
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.h b/src/gromacs/ewald/pme_force_sender_gpu_impl.h

index 9ff0a15a9ad0c92ae47907c20e7579c0fcb9ea0e..0e0ad8122cbdccf5ce6bcd3bbd1d73570f1ae700 100644 (file)
--- a/src/gromacs/ewald/pme_force_sender_gpu_impl.h
+++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.h
@@ -74,11 +74,21 @@ public:
      void sendForceBufferAddressToPpRanks(DeviceBuffer<Float3> d_f);
  
      /*! \brief
-     * Send force synchronizer to PP rank
+     * Send force synchronizer to PP rank (used with thread-MPI)
       * \param[in] ppRank           PP rank to receive data
       */
      void sendFSynchronizerToPpCudaDirect(int ppRank);
  
+    /*! \brief
+     * Send force buffer data to PP rank with an asynchronous MPI call (used with lib-MPI)
+     * \param[in] sendbuf  force buffer in GPU memory
+     * \param[in] offset   starting element in sendbuf (an element index, not a byte offset)
+     * \param[in] numBytes number of bytes to transfer (a byte count, unlike offset)
+     * \param[in] ppRank   PP rank to receive data
+     * \param[in] request  MPI request to track asynchronous MPI call status
+     */
+    void sendFToPpCudaMpi(DeviceBuffer<RVec> sendbuf, int offset, int numBytes, int ppRank, MPI_Request* request);
+
  private:
      //! Event indicating when PME forces are ready on the GPU in order for PP stream to sync with the PME stream
      GpuEventSynchronizer* pmeForcesReady_;