Make DeviceStream into a class
author     Artem Zhmurov <zhmurov@gmail.com>
           Tue, 18 Feb 2020 00:36:38 +0000 (01:36 +0100)
committer  Paul Bauer <paul.bauer.q@gmail.com>
           Fri, 13 Mar 2020 07:36:27 +0000 (08:36 +0100)
Refs #3314
Refs #3311

Change-Id: Ic270864f0e82af63f91a91c9951bf678795680fa
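
In short: raw stream handles (cudaStream_t / cl_command_queue), previously
passed around as void* and cast at every consumer, are now wrapped in a
DeviceStream class that is passed by const reference. A minimal sketch of
the interface the new class provides (variable names such as deviceInfo,
deviceContext and d_buffer are illustrative; most call sites in this commit
still create the raw stream themselves and hand it over via setStream()):

    DeviceStream stream;
    stream.init(deviceInfo, deviceContext, DeviceStreamPriority::High,
                /* useTiming = */ false);

    // Consumers receive a const DeviceStream& and stay platform-agnostic:
    copyToDeviceBuffer(&d_buffer, h_buffer, 0, numElements, stream,
                       GpuApiCallBehavior::Async, nullptr);
    stream.synchronize(); // blocks until all enqueued work has completed

    // The raw handle remains reachable where a native API call is unavoidable:
    cudaMemsetAsync(d_ptr, 0, numBytes, stream.stream());

Because the destructor releases the underlying stream, the explicit teardown
in pme_gpu_destroy_specific() becomes redundant and is removed.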

76 files changed:
src/gromacs/domdec/domdec.cpp
src/gromacs/domdec/domdec.h
src/gromacs/domdec/gpuhaloexchange.h
src/gromacs/domdec/gpuhaloexchange_impl.cpp
src/gromacs/domdec/gpuhaloexchange_impl.cu
src/gromacs/domdec/gpuhaloexchange_impl.cuh
src/gromacs/ewald/pme.h
src/gromacs/ewald/pme_coordinate_receiver_gpu.h
src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp
src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu
src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h
src/gromacs/ewald/pme_force_sender_gpu.h
src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
src/gromacs/ewald/pme_force_sender_gpu_impl.cu
src/gromacs/ewald/pme_force_sender_gpu_impl.h
src/gromacs/ewald/pme_gpu.cpp
src/gromacs/ewald/pme_gpu_3dfft.cu
src/gromacs/ewald/pme_gpu_3dfft.h
src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp
src/gromacs/ewald/pme_gpu_internal.cpp
src/gromacs/ewald/pme_gpu_internal.h
src/gromacs/ewald/pme_gpu_timings.cpp
src/gromacs/ewald/pme_gpu_types_host_impl.h
src/gromacs/ewald/pme_only.cpp
src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
src/gromacs/ewald/pme_pp_comm_gpu_impl.h
src/gromacs/gpu_utils/CMakeLists.txt
src/gromacs/gpu_utils/cudautils.cuh
src/gromacs/gpu_utils/device_stream.cpp [new file with mode: 0644]
src/gromacs/gpu_utils/device_stream.cu [new file with mode: 0644]
src/gromacs/gpu_utils/device_stream.h [new file with mode: 0644]
src/gromacs/gpu_utils/device_stream_ocl.cpp [new file with mode: 0644]
src/gromacs/gpu_utils/devicebuffer.cuh
src/gromacs/gpu_utils/devicebuffer_ocl.h
src/gromacs/gpu_utils/gpueventsynchronizer.cuh
src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h
src/gromacs/gpu_utils/gpuregiontimer.cuh
src/gromacs/gpu_utils/gpuregiontimer.h
src/gromacs/gpu_utils/gpuregiontimer_ocl.h
src/gromacs/gpu_utils/gputraits.cuh
src/gromacs/gpu_utils/gputraits.h
src/gromacs/gpu_utils/gputraits_ocl.h
src/gromacs/gpu_utils/oclutils.h
src/gromacs/gpu_utils/tests/typecasts_runner.cu
src/gromacs/listed_forces/gpubonded.h
src/gromacs/listed_forces/gpubonded_impl.cpp
src/gromacs/listed_forces/gpubonded_impl.cu
src/gromacs/listed_forces/gpubonded_impl.h
src/gromacs/listed_forces/gpubondedkernels.cu
src/gromacs/mdlib/leapfrog_gpu.cu
src/gromacs/mdlib/leapfrog_gpu.cuh
src/gromacs/mdlib/lincs_gpu.cu
src/gromacs/mdlib/lincs_gpu.cuh
src/gromacs/mdlib/settle_gpu.cu
src/gromacs/mdlib/settle_gpu.cuh
src/gromacs/mdlib/tests/constrtestrunners.cu
src/gromacs/mdlib/tests/leapfrogtestrunners.cu
src/gromacs/mdlib/tests/settletestrunners.cu
src/gromacs/mdlib/update_constrain_gpu.h
src/gromacs/mdlib/update_constrain_gpu_impl.cpp
src/gromacs/mdlib/update_constrain_gpu_impl.cu
src/gromacs/mdlib/update_constrain_gpu_impl.h
src/gromacs/mdrun/md.cpp
src/gromacs/mdrun/runner.cpp
src/gromacs/mdtypes/state_propagator_data_gpu.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
src/gromacs/nbnxm/gpu_common.h
src/gromacs/nbnxm/gpu_data_mgmt.h
src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
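
The per-file changes below follow one mechanical pattern. Condensed into a
hypothetical consumer (the class names are stand-ins, not code from this
commit):

    // Before: an untyped handle crosses the interface and is cast at the boundary.
    class ConsumerBefore
    {
    public:
        ConsumerBefore(void* stream) : stream_(*static_cast<cudaStream_t*>(stream)) {}
    private:
        cudaStream_t stream_ = nullptr;
    };

    // After: a typed const reference, bound for the lifetime of the consumer;
    // the producer owns the stream and is responsible for its destruction.
    class ConsumerAfter
    {
    public:
        ConsumerAfter(const DeviceStream& stream) : stream_(stream) {}
    private:
        const DeviceStream& stream_;
    };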

index e020a1405ba3dd9069e2c30d3e2f3be8e5344eef..ebcc92bf2ea1949d794860cc22f0db393771fefb 100644 (file)
@@ -3203,8 +3203,8 @@ gmx_bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const g
 void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
                               const t_commrec&     cr,
                               const DeviceContext& deviceContext,
-                              void*                streamLocal,
-                              void*                streamNonLocal)
+                              const DeviceStream&  streamLocal,
+                              const DeviceStream&  streamNonLocal)
 {
 
     int gpuHaloExchangeSize = 0;
index 0a7aa3202ef0c6eb57957a3540f95671bb96592a..da617607a64a1acf500d3c83338e4da823beced7 100644 (file)
@@ -84,6 +84,7 @@ struct t_mdatoms;
 struct t_nrnb;
 struct gmx_wallcycle;
 enum class PbcType : int;
+class DeviceStream;
 class t_state;
 class DeviceContext;
 class GpuEventSynchronizer;
@@ -322,8 +323,8 @@ void dd_bonded_cg_distance(const gmx::MDLogger& mdlog,
 void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
                               const t_commrec&     cr,
                               const DeviceContext& deviceContext,
-                              void*                streamLocal,
-                              void*                streamNonLocal);
+                              const DeviceStream&  streamLocal,
+                              const DeviceStream&  streamNonLocal);
 
 /*! \brief
  * (Re-) Initialization for GPU halo exchange
index 851e3d19833031fd2e669039ae1731eaa8dd4365..b20ad0a1808003f32255ddfe86208312f57bdb13 100644 (file)
@@ -50,6 +50,7 @@
 
 struct gmx_domdec_t;
 class DeviceContext;
+class DeviceStream;
 class GpuEventSynchronizer;
 
 namespace gmx
@@ -89,8 +90,8 @@ public:
     GpuHaloExchange(gmx_domdec_t*        dd,
                     MPI_Comm             mpi_comm_mysim,
                     const DeviceContext& deviceContext,
-                    void*                streamLocal,
-                    void*                streamNonLocal,
+                    const DeviceStream&  streamLocal,
+                    const DeviceStream&  streamNonLocal,
                     int                  pulse);
     ~GpuHaloExchange();
 
index c8ca5df8c21381c18dfcf7d86e97a9029fb920ff..1e22b4016eef582c1ac4ba0b7e8697328ef9a1f0 100644 (file)
@@ -63,8 +63,8 @@ class GpuHaloExchange::Impl
 GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */,
                                  MPI_Comm /* mpi_comm_mysim */,
                                  const DeviceContext& /* deviceContext */,
-                                 void* /*streamLocal */,
-                                 void* /*streamNonLocal */,
+                                 const DeviceStream& /*streamLocal */,
+                                 const DeviceStream& /*streamNonLocal */,
                                  int /*pulse */) :
     impl_(nullptr)
 {
index 4a44beb3e69945b3fc2517d38edbec9b656e716e..0829800111069b96822579e3b486b4750f1af0ff 100644 (file)
@@ -135,7 +135,6 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo
     d_x_ = d_coordinatesBuffer;
     d_f_ = d_forcesBuffer;
 
-    cudaStream_t                 stream  = nonLocalStream_;
     const gmx_domdec_comm_t&     comm    = *dd_->comm;
     const gmx_domdec_comm_dim_t& cd      = comm.cd[0];
     const gmx_domdec_ind_t&      ind     = cd.ind[pulse_];
@@ -167,7 +166,7 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo
     GMX_ASSERT(ind.index.size() == h_indexMap_.size(), "Size mismatch");
     std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin());
 
-    copyToDeviceBuffer(&d_indexMap_, h_indexMap_.data(), 0, newSize, stream,
+    copyToDeviceBuffer(&d_indexMap_, h_indexMap_.data(), 0, newSize, nonLocalStream_,
                        GpuApiCallBehavior::Async, nullptr);
 
     // This rank will push data to its neighbor, so needs to know
@@ -215,7 +214,7 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box
     config.gridSize[1]      = 1;
     config.gridSize[2]      = 1;
     config.sharedMemorySize = 0;
-    config.stream           = nonLocalStream_;
+    config.stream           = nonLocalStream_.stream();
 
     const float3* sendBuf  = d_sendBuf_;
     const float3* d_x      = d_x_;
@@ -264,7 +263,7 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
         if (!accumulateForces)
         {
             // Clear local portion of force array (in local stream)
-            cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_);
+            cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_.stream());
         }
 
         // ensure non-local stream waits for local stream, due to dependence on
@@ -286,7 +285,7 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
     config.gridSize[1]      = 1;
     config.gridSize[2]      = 1;
     config.sharedMemorySize = 0;
-    config.stream           = nonLocalStream_;
+    config.stream           = nonLocalStream_.stream();
 
     const float3* recvBuf  = d_recvBuf_;
     const int*    indexMap = d_indexMap_;
@@ -373,8 +372,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr,
                                                               int   recvRank)
 {
 
-    cudaError_t  stat;
-    cudaStream_t stream = nonLocalStream_;
+    cudaError_t stat;
 
     // We asynchronously push data to remote rank. The remote
     // destination pointer has already been set in the init fn.  We
@@ -386,7 +384,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr,
     if (sendSize > 0)
     {
         stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize * DIM * sizeof(float),
-                               cudaMemcpyDeviceToDevice, stream);
+                               cudaMemcpyDeviceToDevice, nonLocalStream_.stream());
         CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
     }
 
@@ -397,13 +395,13 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr,
     // to its stream.
     GpuEventSynchronizer* haloDataTransferRemote;
 
-    haloDataTransferLaunched_->markEvent(stream);
+    haloDataTransferLaunched_->markEvent(nonLocalStream_);
 
     MPI_Sendrecv(&haloDataTransferLaunched_, sizeof(GpuEventSynchronizer*), MPI_BYTE, sendRank, 0,
                  &haloDataTransferRemote, sizeof(GpuEventSynchronizer*), MPI_BYTE, recvRank, 0,
                  mpi_comm_mysim_, MPI_STATUS_IGNORE);
 
-    haloDataTransferRemote->enqueueWaitEvent(stream);
+    haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_);
 #else
     GMX_UNUSED_VALUE(sendRank);
     GMX_UNUSED_VALUE(recvRank);
@@ -419,8 +417,8 @@ GpuEventSynchronizer* GpuHaloExchange::Impl::getForcesReadyOnDeviceEvent()
 GpuHaloExchange::Impl::Impl(gmx_domdec_t*        dd,
                             MPI_Comm             mpi_comm_mysim,
                             const DeviceContext& deviceContext,
-                            void*                localStream,
-                            void*                nonLocalStream,
+                            const DeviceStream&  localStream,
+                            const DeviceStream&  nonLocalStream,
                             int                  pulse) :
     dd_(dd),
     sendRankX_(dd->neighbor[0][1]),
@@ -431,8 +429,8 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t*        dd,
     haloDataTransferLaunched_(new GpuEventSynchronizer()),
     mpi_comm_mysim_(mpi_comm_mysim),
     deviceContext_(deviceContext),
-    localStream_(*static_cast<cudaStream_t*>(localStream)),
-    nonLocalStream_(*static_cast<cudaStream_t*>(nonLocalStream)),
+    localStream_(localStream),
+    nonLocalStream_(nonLocalStream),
     pulse_(pulse)
 {
 
@@ -466,8 +464,8 @@ GpuHaloExchange::Impl::~Impl()
 GpuHaloExchange::GpuHaloExchange(gmx_domdec_t*        dd,
                                  MPI_Comm             mpi_comm_mysim,
                                  const DeviceContext& deviceContext,
-                                 void*                localStream,
-                                 void*                nonLocalStream,
+                                 const DeviceStream&  localStream,
+                                 const DeviceStream&  nonLocalStream,
                                  int                  pulse) :
     impl_(new Impl(dd, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse))
 {
index ba22bc5262abcc5551dd66e1978e9b59c11a3d16..9a033ed54f96d88e38f06c0f2864f8a0a9b7029f 100644 (file)
@@ -79,8 +79,8 @@ public:
     Impl(gmx_domdec_t*        dd,
          MPI_Comm             mpi_comm_mysim,
          const DeviceContext& deviceContext,
-         void*                localStream,
-         void*                nonLocalStream,
+         const DeviceStream&  localStream,
+         const DeviceStream&  nonLocalStream,
          int                  pulse);
     ~Impl();
 
@@ -185,9 +185,9 @@ private:
     //! GPU context object
     const DeviceContext& deviceContext_;
     //! CUDA stream for local non-bonded calculations
-    cudaStream_t localStream_ = nullptr;
+    const DeviceStream& localStream_;
     //! CUDA stream for non-local non-bonded calculations
-    cudaStream_t nonLocalStream_ = nullptr;
+    const DeviceStream& nonLocalStream_;
     //! full coordinates buffer in GPU memory
     float3* d_x_ = nullptr;
     //! full forces buffer in GPU memory
index 1c3cb9b77460c2e2803914c93b1e0b6e7465e66d..8aa2c079b130b2d9047cbb6e21025f0dab37fe0c 100644 (file)
@@ -72,6 +72,7 @@ struct gmx_wallcycle;
 struct NumPmeDomains;
 
 class DeviceContext;
+class DeviceStream;
 enum class GpuTaskCompletion;
 class PmeGpuProgram;
 class GpuEventSynchronizer;
@@ -433,7 +434,7 @@ GPU_FUNC_QUALIFIER void* pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT
  *  \param[in] pme            The PME data structure.
  *  \returns                  Pointer to GPU stream object.
  */
-GPU_FUNC_QUALIFIER void* pme_gpu_get_device_stream(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
+GPU_FUNC_QUALIFIER const DeviceStream* pme_gpu_get_device_stream(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
         GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
 /*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion
index b5d02a719e2f813ae3f228ff3dc659d256a89667..144bd27fddcf042366ffcfd609277ceb948c859a 100644 (file)
@@ -47,6 +47,7 @@
 #include "gromacs/utility/classhelpers.h"
 #include "gromacs/utility/gmxmpi.h"
 
+class DeviceStream;
 struct PpRanks;
 
 namespace gmx
@@ -64,7 +65,7 @@ public:
      * \param[in] comm            Communicator used for simulation
      * \param[in] ppRanks         List of PP ranks
      */
-    PmeCoordinateReceiverGpu(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+    PmeCoordinateReceiverGpu(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
     ~PmeCoordinateReceiverGpu();
 
     /*! \brief
index 0cb848e6c06ff8b557c6a2cd05b285e641d30159..b0da71cc479e543792da77de056f87a81f90ebb1 100644 (file)
@@ -62,7 +62,7 @@ class PmeCoordinateReceiverGpu::Impl
 };
 
 /*!\brief Constructor stub. */
-PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const void* /* pmeStream */,
+PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const DeviceStream& /* pmeStream */,
                                                    MPI_Comm /* comm */,
                                                    gmx::ArrayRef<PpRanks> /* ppRanks */) :
     impl_(nullptr)
index b2e7fa009d70b73536e702f916b3dfb6e5890f10..db81fb7b0aad4d80f401cf4a2a4c2ce4cf5124d7 100644 (file)
 namespace gmx
 {
 
-PmeCoordinateReceiverGpu::Impl::Impl(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks) :
-    pmeStream_(*static_cast<const cudaStream_t*>(pmeStream)),
+PmeCoordinateReceiverGpu::Impl::Impl(const DeviceStream&    pmeStream,
+                                     MPI_Comm               comm,
+                                     gmx::ArrayRef<PpRanks> ppRanks) :
+    pmeStream_(pmeStream),
     comm_(comm),
     ppRanks_(ppRanks)
 {
@@ -122,7 +124,7 @@ void PmeCoordinateReceiverGpu::Impl::enqueueWaitReceiveCoordinatesFromPpCudaDire
     }
 }
 
-PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const void*            pmeStream,
+PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const DeviceStream&    pmeStream,
                                                    MPI_Comm               comm,
                                                    gmx::ArrayRef<PpRanks> ppRanks) :
     impl_(new Impl(pmeStream, comm, ppRanks))
index 4f3bbe2e4e4478f29f27e9c3597c422a36f172f8..e1186a2f3abf711aa1feea710b2ffda0c1dc1325 100644 (file)
@@ -62,7 +62,7 @@ public:
      * \param[in] comm            Communicator used for simulation
      * \param[in] ppRanks         List of PP ranks
      */
-    Impl(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+    Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
     ~Impl();
 
     /*! \brief
@@ -84,7 +84,7 @@ public:
 
 private:
     //! CUDA stream for PME operations
-    cudaStream_t pmeStream_ = nullptr;
+    const DeviceStream& pmeStream_;
     //! communicator for simulation
     MPI_Comm comm_;
     //! list of PP ranks
index c774994824bba46fb8e22833a2242eeb98d420f6..df8e1873f8ecb4e9b36df823778be1fea4c404e3 100644 (file)
@@ -46,6 +46,8 @@
 #include "gromacs/utility/classhelpers.h"
 #include "gromacs/utility/gmxmpi.h"
 
+class DeviceStream;
+
 /*! \libinternal
  * \brief Contains information about the PP ranks that partner this PME rank. */
 struct PpRanks
@@ -73,7 +75,7 @@ public:
      * \param[in] comm            Communicator used for simulation
      * \param[in] ppRanks         List of PP ranks
      */
-    PmeForceSenderGpu(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+    PmeForceSenderGpu(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
     ~PmeForceSenderGpu();
 
     /*! \brief
index b4a531fed95429c3e43b091c358a8e2818728758..3ae502ccd8373589a330ddf9c801d5490a61e221 100644 (file)
@@ -61,7 +61,7 @@ class PmeForceSenderGpu::Impl
 };
 
 /*!\brief Constructor stub. */
-PmeForceSenderGpu::PmeForceSenderGpu(const void* /*pmeStream */,
+PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream& /*pmeStream */,
                                      MPI_Comm /* comm     */,
                                      gmx::ArrayRef<PpRanks> /* ppRanks */) :
     impl_(nullptr)
index 0ad8fbf12357a84ec2bf3a005a1f23a32d38417d..6e6d21eaf2ee7b98f338e51b6877de2bbb4cebfc 100644 (file)
@@ -55,8 +55,8 @@ namespace gmx
 {
 
 /*! \brief Create PME-PP GPU communication object */
-PmeForceSenderGpu::Impl::Impl(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks) :
-    pmeStream_(*static_cast<const cudaStream_t*>(pmeStream)),
+PmeForceSenderGpu::Impl::Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks) :
+    pmeStream_(pmeStream),
     comm_(comm),
     ppRanks_(ppRanks)
 {
@@ -106,7 +106,9 @@ void PmeForceSenderGpu::Impl::sendFToPpCudaDirect(int ppRank)
 #endif
 }
 
-PmeForceSenderGpu::PmeForceSenderGpu(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks) :
+PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream&    pmeStream,
+                                     MPI_Comm               comm,
+                                     gmx::ArrayRef<PpRanks> ppRanks) :
     impl_(new Impl(pmeStream, comm, ppRanks))
 {
 }
index 5c1271b2dd6338b02af14eb33c71c1144d110698..91fe1c1140d271651bd9f9bf40c248e9f7385b70 100644 (file)
@@ -61,7 +61,7 @@ public:
      * \param[in] comm            Communicator used for simulation
      * \param[in] ppRanks         List of PP ranks
      */
-    Impl(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+    Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
     ~Impl();
 
     /*! \brief
@@ -78,7 +78,7 @@ public:
 
 private:
     //! CUDA stream for PME operations
-    cudaStream_t pmeStream_ = nullptr;
+    const DeviceStream& pmeStream_;
     //! Event triggered to allow the remote PP stream to synchronize with the PME stream
     GpuEventSynchronizer pmeSync_;
     //! communicator for simulation
index 4c4ed4851d8cea883a6b1531a6fdce4296276c45..cbcab23b1a16dce5c7b5a0e9ab6e62d1009df7f1 100644 (file)
@@ -433,7 +433,7 @@ void pme_gpu_set_device_x(const gmx_pme_t* pme, DeviceBuffer<gmx::RVec> d_x)
     pme_gpu_set_kernelparam_coordinates(pme->gpu, d_x);
 }
 
-void* pme_gpu_get_device_stream(const gmx_pme_t* pme)
+const DeviceStream* pme_gpu_get_device_stream(const gmx_pme_t* pme)
 {
     if (!pme || !pme_gpu_active(pme))
     {
index 2b30dcdef14899569b81aead9c14501665068cfc..9f9578962e6d59d506bb9e5233adc338ca6aa111 100644 (file)
@@ -104,7 +104,7 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu)
                            realGridSizePaddedTotal, CUFFT_C2R, batch);
     handleCufftError(result, "cufftPlanMany C2R plan failure");
 
-    cudaStream_t stream = pmeGpu->archSpecific->pmeStream;
+    cudaStream_t stream = pmeGpu->archSpecific->pmeStream_.stream();
     GMX_RELEASE_ASSERT(stream, "Using the default CUDA stream for PME cuFFT");
 
     result = cufftSetStream(planR2C_, stream);
index 07d3b1af5712227e44354f4b759d0b90af5bc752..fc6d67a935e33dac4e315a39c8008b46006c4aef 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -93,7 +93,7 @@ private:
 #elif GMX_GPU == GMX_GPU_OPENCL
     clfftPlanHandle               planR2C_;
     clfftPlanHandle               planC2R_;
-    std::vector<cl_command_queue> commandStreams_;
+    std::vector<cl_command_queue> deviceStreams_;
     cl_mem                        realGrid_;
     cl_mem                        complexGrid_;
 #endif
index c6e1b6448cc6093985b6e664f07fc6609f57a37a..b341a27b829f166be3791edca6489494521d8363 100644 (file)
@@ -81,7 +81,7 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu)
                    "Complex padding not implemented");
     }
     cl_context context = pmeGpu->archSpecific->deviceContext_.context();
-    commandStreams_.push_back(pmeGpu->archSpecific->pmeStream);
+    deviceStreams_.push_back(pmeGpu->archSpecific->pmeStream_.stream());
     realGrid_                       = kernelParamsPtr->grid.d_realGrid;
     complexGrid_                    = kernelParamsPtr->grid.d_fourierGrid;
     const bool performOutOfPlaceFFT = pmeGpu->archSpecific->performOutOfPlaceFFT;
@@ -124,9 +124,9 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu)
     handleClfftError(clfftSetPlanOutStride(planC2R_, dims, realGridStrides.data()),
                      "clFFT stride setting failure");
 
-    handleClfftError(clfftBakePlan(planR2C_, commandStreams_.size(), commandStreams_.data(), nullptr, nullptr),
+    handleClfftError(clfftBakePlan(planR2C_, deviceStreams_.size(), deviceStreams_.data(), nullptr, nullptr),
                      "clFFT precompiling failure");
-    handleClfftError(clfftBakePlan(planC2R_, commandStreams_.size(), commandStreams_.data(), nullptr, nullptr),
+    handleClfftError(clfftBakePlan(planC2R_, deviceStreams_.size(), deviceStreams_.data(), nullptr, nullptr),
                      "clFFT precompiling failure");
 
     // TODO: implement solve kernel as R2C FFT callback
@@ -166,8 +166,8 @@ void GpuParallel3dFft::perform3dFft(gmx_fft_direction dir, CommandEvent* timingE
             GMX_THROW(
                     gmx::NotImplementedError("The chosen 3D-FFT case is not implemented on GPUs"));
     }
-    handleClfftError(clfftEnqueueTransform(plan, direction, commandStreams_.size(),
-                                           commandStreams_.data(), waitEvents.size(), waitEvents.data(),
+    handleClfftError(clfftEnqueueTransform(plan, direction, deviceStreams_.size(),
+                                           deviceStreams_.data(), waitEvents.size(), waitEvents.data(),
                                            timingEvent, inputGrids, outputGrids, tempBuffer),
                      "clFFT execution failure");
 }
index dd62e8c4cdfe4306527066936af4042a2edc4744..822109de4c28b96b25e36d4b36d5a3722599d4aa 100644 (file)
@@ -135,7 +135,7 @@ int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu)
 
 void pme_gpu_synchronize(const PmeGpu* pmeGpu)
 {
-    gpuStreamSynchronize(pmeGpu->archSpecific->pmeStream);
+    pmeGpu->archSpecific->pmeStream_.synchronize();
 }
 
 void pme_gpu_alloc_energy_virial(PmeGpu* pmeGpu)
@@ -156,7 +156,7 @@ void pme_gpu_free_energy_virial(PmeGpu* pmeGpu)
 void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu)
 {
     clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy, 0,
-                           c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream);
+                           c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_);
 }
 
 void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu)
@@ -188,7 +188,7 @@ void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu)
     }
     /* TODO: pin original buffer instead! */
     copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, pmeGpu->staging.h_splineModuli,
-                       0, newSplineValuesSize, pmeGpu->archSpecific->pmeStream,
+                       0, newSplineValuesSize, pmeGpu->archSpecific->pmeStream_,
                        pmeGpu->settings.transferKind, nullptr);
 }
 
@@ -219,7 +219,7 @@ void pme_gpu_copy_input_forces(PmeGpu* pmeGpu)
     GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
     float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
     copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, h_forcesFloat, 0,
-                       DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream,
+                       DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_,
                        pmeGpu->settings.transferKind, nullptr);
 }
 
@@ -228,7 +228,7 @@ void pme_gpu_copy_output_forces(PmeGpu* pmeGpu)
     GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
     float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
     copyFromDeviceBuffer(h_forcesFloat, &pmeGpu->kernelParams->atoms.d_forces, 0,
-                         DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream,
+                         DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_,
                          pmeGpu->settings.transferKind, nullptr);
 }
 
@@ -243,7 +243,7 @@ void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_
                            pmeGpu->archSpecific->deviceContext_);
     copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients,
                        const_cast<float*>(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms,
-                       pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
+                       pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
     if (c_usePadding)
     {
         const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms;
@@ -251,7 +251,7 @@ void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_
         if (paddingCount > 0)
         {
             clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex,
-                                   paddingCount, pmeGpu->archSpecific->pmeStream);
+                                   paddingCount, pmeGpu->archSpecific->pmeStream_);
         }
     }
 }
@@ -360,7 +360,7 @@ void pme_gpu_free_grids(const PmeGpu* pmeGpu)
 void pme_gpu_clear_grids(const PmeGpu* pmeGpu)
 {
     clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid, 0,
-                           pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream);
+                           pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream_);
 }
 
 void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu)
@@ -393,10 +393,10 @@ void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu)
     allocateDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, newFractShiftsSize,
                          pmeGpu->archSpecific->deviceContext_);
     copyToDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, pmeGpu->common->fsh.data(), 0,
-                       newFractShiftsSize, pmeGpu->archSpecific->pmeStream,
+                       newFractShiftsSize, pmeGpu->archSpecific->pmeStream_,
                        GpuApiCallBehavior::Async, nullptr);
     copyToDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, pmeGpu->common->nn.data(), 0,
-                       newFractShiftsSize, pmeGpu->archSpecific->pmeStream,
+                       newFractShiftsSize, pmeGpu->archSpecific->pmeStream_,
                        GpuApiCallBehavior::Async, nullptr);
 #endif
 }
@@ -417,21 +417,21 @@ void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu)
 
 bool pme_gpu_stream_query(const PmeGpu* pmeGpu)
 {
-    return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream);
+    return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream_);
 }
 
 void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, float* h_grid)
 {
     copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid, h_grid, 0, pmeGpu->archSpecific->realGridSize,
-                       pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
+                       pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
 }
 
 void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid)
 {
     copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid, 0,
-                         pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream,
+                         pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream_,
                          pmeGpu->settings.transferKind, nullptr);
-    pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream);
+    pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream_);
 }
 
 void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu)
@@ -441,11 +441,11 @@ void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu)
     const size_t splinesCount    = DIM * nAtomsPadded * pmeGpu->common->pme_order;
     auto*        kernelParamsPtr = pmeGpu->kernelParams.get();
     copyFromDeviceBuffer(pmeGpu->staging.h_dtheta, &kernelParamsPtr->atoms.d_dtheta, 0, splinesCount,
-                         pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
+                         pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
     copyFromDeviceBuffer(pmeGpu->staging.h_theta, &kernelParamsPtr->atoms.d_theta, 0, splinesCount,
-                         pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
+                         pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
     copyFromDeviceBuffer(pmeGpu->staging.h_gridlineIndices, &kernelParamsPtr->atoms.d_gridlineIndices,
-                         0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream,
+                         0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream_,
                          pmeGpu->settings.transferKind, nullptr);
 }
 
@@ -459,20 +459,20 @@ void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu)
     {
         // TODO: could clear only the padding and not the whole thing, but this is test-only code anyway
         clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0,
-                               pmeGpu->nAtomsAlloc * DIM, pmeGpu->archSpecific->pmeStream);
+                               pmeGpu->nAtomsAlloc * DIM, pmeGpu->archSpecific->pmeStream_);
         clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0,
                                pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
-                               pmeGpu->archSpecific->pmeStream);
+                               pmeGpu->archSpecific->pmeStream_);
         clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0,
                                pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
-                               pmeGpu->archSpecific->pmeStream);
+                               pmeGpu->archSpecific->pmeStream_);
     }
     copyToDeviceBuffer(&kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, 0, splinesCount,
-                       pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
+                       pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
     copyToDeviceBuffer(&kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, 0, splinesCount,
-                       pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
+                       pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
     copyToDeviceBuffer(&kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
-                       0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream,
+                       0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream_,
                        pmeGpu->settings.transferKind, nullptr);
 }
 
@@ -529,35 +529,25 @@ void pme_gpu_init_internal(PmeGpu* pmeGpu)
     int         highest_priority, lowest_priority;
     stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, &highest_priority);
     CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed");
-    stat = cudaStreamCreateWithPriority(&pmeGpu->archSpecific->pmeStream,
+    cudaStream_t stream;
+    stat = cudaStreamCreateWithPriority(&stream,
                                         cudaStreamDefault, // cudaStreamNonBlocking,
                                         highest_priority);
     CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed");
+    pmeGpu->archSpecific->pmeStream_.setStream(stream);
 #elif GMX_GPU == GMX_GPU_OPENCL
     cl_command_queue_properties queueProperties =
             pmeGpu->archSpecific->useTiming ? CL_QUEUE_PROFILING_ENABLE : 0;
     cl_device_id device_id = pmeGpu->deviceInfo->oclDeviceId;
     cl_int       clError;
-    pmeGpu->archSpecific->pmeStream = clCreateCommandQueue(
-            pmeGpu->archSpecific->deviceContext_.context(), device_id, queueProperties, &clError);
-    if (clError != CL_SUCCESS)
-    {
-        GMX_THROW(gmx::InternalError("Failed to create PME command queue"));
-    }
-#endif
-}
+    cl_command_queue queue = clCreateCommandQueue(
+            pmeGpu->archSpecific->deviceContext_.context(), device_id, queueProperties, &clError);
+
 
-void pme_gpu_destroy_specific(const PmeGpu* pmeGpu)
-{
-#if GMX_GPU == GMX_GPU_CUDA
-    /* Destroy the CUDA stream */
-    cudaError_t stat = cudaStreamDestroy(pmeGpu->archSpecific->pmeStream);
-    CU_RET_ERR(stat, "PME cudaStreamDestroy error");
-#elif GMX_GPU == GMX_GPU_OPENCL
-    cl_int clError = clReleaseCommandQueue(pmeGpu->archSpecific->pmeStream);
     if (clError != CL_SUCCESS)
     {
-        gmx_warning("Failed to destroy PME command queue");
+        GMX_THROW(gmx::InternalError(
+                gmx::formatString("Failed to create PME command queue (OpenCL error %d)", clError).c_str()));
     }
+    pmeGpu->archSpecific->pmeStream_.setStream(queue);
 #endif
 }
@@ -979,9 +969,6 @@ void pme_gpu_destroy(PmeGpu* pmeGpu)
 
     pme_gpu_destroy_3dfft(pmeGpu);
 
-    /* Free the GPU-framework specific data last */
-    pme_gpu_destroy_specific(pmeGpu);
-
     delete pmeGpu;
 }
 
@@ -1205,7 +1192,7 @@ void pme_gpu_spread(const PmeGpu*         pmeGpu,
                "Need a valid coordinate synchronizer on PP+PME ranks with CUDA.");
     if (xReadyOnDevice)
     {
-        xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream);
+        xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_);
     }
 
     const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
@@ -1217,7 +1204,7 @@ void pme_gpu_spread(const PmeGpu*         pmeGpu,
     config.blockSize[2] = atomsPerBlock;
     config.gridSize[0]  = dimGrid.first;
     config.gridSize[1]  = dimGrid.second;
-    config.stream       = pmeGpu->archSpecific->pmeStream;
+    config.stream       = pmeGpu->archSpecific->pmeStream_.stream();
 
     int                                timingId;
     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
@@ -1285,7 +1272,7 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd
     if (copyInputAndOutputGrid)
     {
         copyToDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, h_gridFloat, 0,
-                           pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream,
+                           pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream_,
                            pmeGpu->settings.transferKind, nullptr);
     }
 
@@ -1335,7 +1322,7 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd
     config.gridSize[1] = (pmeGpu->kernelParams->grid.complexGridSize[middleDim] + gridLinesPerBlock - 1)
                          / gridLinesPerBlock;
     config.gridSize[2] = pmeGpu->kernelParams->grid.complexGridSize[majorDim];
-    config.stream      = pmeGpu->archSpecific->pmeStream;
+    config.stream      = pmeGpu->archSpecific->pmeStream_.stream();
 
     int                                timingId  = gtPME_SOLVE;
     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
@@ -1366,13 +1353,13 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd
     {
         copyFromDeviceBuffer(pmeGpu->staging.h_virialAndEnergy,
                              &kernelParamsPtr->constants.d_virialAndEnergy, 0, c_virialAndEnergyCount,
-                             pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
+                             pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
     }
 
     if (copyInputAndOutputGrid)
     {
         copyFromDeviceBuffer(h_gridFloat, &kernelParamsPtr->grid.d_fourierGrid, 0,
-                             pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream,
+                             pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream_,
                              pmeGpu->settings.transferKind, nullptr);
     }
 }
@@ -1457,7 +1444,7 @@ void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid)
     config.blockSize[2] = atomsPerBlock;
     config.gridSize[0]  = dimGrid.first;
     config.gridSize[1]  = dimGrid.second;
-    config.stream       = pmeGpu->archSpecific->pmeStream;
+    config.stream       = pmeGpu->archSpecific->pmeStream_.stream();
 
     // TODO test different cache configs
 
@@ -1483,7 +1470,7 @@ void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid)
 
     if (pmeGpu->settings.useGpuForceReduction)
     {
-        pmeGpu->archSpecific->pmeForcesReady.markEvent(pmeGpu->archSpecific->pmeStream);
+        pmeGpu->archSpecific->pmeForcesReady.markEvent(pmeGpu->archSpecific->pmeStream_);
     }
     else
     {
@@ -1515,11 +1502,11 @@ void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<gmx:
     pmeGpu->kernelParams->atoms.d_coordinates = d_x;
 }
 
-void* pme_gpu_get_stream(const PmeGpu* pmeGpu)
+const DeviceStream* pme_gpu_get_stream(const PmeGpu* pmeGpu)
 {
     if (pmeGpu)
     {
-        return static_cast<void*>(&pmeGpu->archSpecific->pmeStream);
+        return &pmeGpu->archSpecific->pmeStream_;
     }
     else
     {
index 67a1bc3d1c2a53f267ea722f9fe752f87833176d..b515e3b222d5e2520c31280bf6fb341166480465 100644 (file)
@@ -313,14 +313,6 @@ void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu);
  */
 void pme_gpu_init_internal(PmeGpu* pmeGpu);
 
-/*! \libinternal \brief
- * Destroys the PME GPU-framework specific data.
- * Should be called last in the PME GPU destructor.
- *
- * \param[in] pmeGpu  The PME GPU structure.
- */
-void pme_gpu_destroy_specific(const PmeGpu* pmeGpu);
-
 /*! \libinternal \brief
  * Initializes the CUDA FFT structures.
  *
@@ -405,7 +397,7 @@ GPU_FUNC_QUALIFIER void* pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_A
  * \param[in] pmeGpu         The PME GPU structure.
  * \returns                  Pointer to stream object.
  */
-GPU_FUNC_QUALIFIER void* pme_gpu_get_stream(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
+GPU_FUNC_QUALIFIER const DeviceStream* pme_gpu_get_stream(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
         GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
 /*! \brief Return pointer to the sync object triggered after the PME force calculation completion
index 3680631c5731b905809b5b24ad689e6db5ba994b..3a1f45746857d78d5a099b83507c38b47b8c08e7 100644 (file)
@@ -61,7 +61,7 @@ void pme_gpu_start_timing(const PmeGpu* pmeGpu, size_t PMEStageId)
     {
         GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(),
                    "Wrong PME GPU timing event index");
-        pmeGpu->archSpecific->timingEvents[PMEStageId].openTimingRegion(pmeGpu->archSpecific->pmeStream);
+        pmeGpu->archSpecific->timingEvents[PMEStageId].openTimingRegion(pmeGpu->archSpecific->pmeStream_);
     }
 }
 
@@ -71,7 +71,7 @@ void pme_gpu_stop_timing(const PmeGpu* pmeGpu, size_t PMEStageId)
     {
         GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(),
                    "Wrong PME GPU timing event index");
-        pmeGpu->archSpecific->timingEvents[PMEStageId].closeTimingRegion(pmeGpu->archSpecific->pmeStream);
+        pmeGpu->archSpecific->timingEvents[PMEStageId].closeTimingRegion(pmeGpu->archSpecific->pmeStream_);
     }
 }
 
index 44ca3fd3c3561c64b467e13527f59b0f8c78c627..a019a7c0310b599ba6dda7c2e4fac04b65355f43 100644 (file)
@@ -74,8 +74,6 @@ struct PmeGpuSpecific
      * \param[in] deviceContext GPU device context.
      */
     PmeGpuSpecific(const DeviceContext& deviceContext) : deviceContext_(deviceContext) {}
-    /*! \brief The GPU stream where everything related to the PME happens. */
-    CommandStream pmeStream;
 
     /*! \brief
      * A handle to the GPU context.
@@ -85,6 +83,9 @@ struct PmeGpuSpecific
      */
     const DeviceContext& deviceContext_;
 
+    /*! \brief The GPU stream where everything related to the PME happens. */
+    DeviceStream pmeStream_;
+
     /* Synchronization events */
     /*! \brief Triggered after the PME Force Calculations have been completed */
     GpuEventSynchronizer pmeForcesReady;
index 845b1a33ecf45b99b78d846f0107533647e4f485..fe51deb5fc4dc322b51db151006f6ad4e1eeadc9 100644 (file)
@@ -629,16 +629,16 @@ int gmx_pmeonly(struct gmx_pme_t*         pme,
     const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed);
     if (useGpuForPme)
     {
-        const void* commandStream = pme_gpu_get_device_stream(pme);
+        const DeviceStream& deviceStream = *pme_gpu_get_device_stream(pme);
 
         changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy());
         changePinningPolicy(&pme_pp->x, pme_get_pinning_policy());
         if (c_enableGpuPmePpComms)
         {
             pme_pp->pmeCoordinateReceiverGpu = std::make_unique<gmx::PmeCoordinateReceiverGpu>(
-                    commandStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
+                    deviceStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
             pme_pp->pmeForceSenderGpu = std::make_unique<gmx::PmeForceSenderGpu>(
-                    commandStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
+                    deviceStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
         }
         GMX_RELEASE_ASSERT(
                 deviceContext != nullptr,
@@ -646,7 +646,7 @@ int gmx_pmeonly(struct gmx_pme_t*         pme,
         // TODO: Special PME-only constructor is used here. There is no mechanism to prevent use of the other constructor here.
         //       This should be made safer.
         stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
-                commandStream, *deviceContext, GpuApiCallBehavior::Async,
+                &deviceStream, *deviceContext, GpuApiCallBehavior::Async,
                 pme_gpu_get_padding_size(pme), wcycle);
     }
 
index 91962ff947d5b96c813ae2ed1bdf29c2bb165eae..f5aac2981ab844ddeb097c425d8c8b63b568a796 100644 (file)
@@ -64,7 +64,9 @@ PmePpCommGpu::Impl::Impl(MPI_Comm comm, int pmeRank, const DeviceContext& device
     GMX_RELEASE_ASSERT(
             GMX_THREAD_MPI,
             "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
-    cudaStreamCreate(&pmePpCommStream_);
+    cudaStream_t stream;
+    cudaStreamCreate(&stream);
+    pmePpCommStream_.setStream(stream);
 }
 
 PmePpCommGpu::Impl::~Impl() = default;
@@ -98,7 +100,7 @@ void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSi
     // Pull force data from remote GPU
     void*       pmeForcePtr = receivePmeForceToGpu ? static_cast<void*>(d_pmeForces_) : recvPtr;
     cudaError_t stat = cudaMemcpyAsync(pmeForcePtr, remotePmeFBuffer_, recvSize * DIM * sizeof(float),
-                                       cudaMemcpyDefault, pmePpCommStream_);
+                                       cudaMemcpyDefault, pmePpCommStream_.stream());
     CU_RET_ERR(stat, "cudaMemcpyAsync on Recv from PME CUDA direct data transfer failed");
 
     if (receivePmeForceToGpu)
@@ -112,7 +114,7 @@ void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSi
     {
         // Ensure CPU waits for PME forces to be copied before reducing
         // them with other forces on the CPU
-        cudaStreamSynchronize(pmePpCommStream_);
+        cudaStreamSynchronize(pmePpCommStream_.stream());
     }
 #else
     GMX_UNUSED_VALUE(recvPtr);
@@ -131,7 +133,7 @@ void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr,
     coordinatesReadyOnDeviceEvent->enqueueWaitEvent(pmePpCommStream_);
 
     cudaError_t stat = cudaMemcpyAsync(remotePmeXBuffer_, sendPtr, sendSize * DIM * sizeof(float),
-                                       cudaMemcpyDefault, pmePpCommStream_);
+                                       cudaMemcpyDefault, pmePpCommStream_.stream());
     CU_RET_ERR(stat, "cudaMemcpyAsync on Send to PME CUDA direct data transfer failed");
 
     // Record and send event to allow PME task to sync to above transfer before commencing force calculations
index c791ea5b4011ace1b20c5ace0a127fbaea0b92fb..934b7c40c62ed1a1daa3eb58f607cb494554d3dd 100644 (file)
@@ -119,7 +119,7 @@ private:
     //! Device context object
     const DeviceContext& deviceContext_;
     //! CUDA stream used for the communication operations in this class
-    cudaStream_t pmePpCommStream_ = nullptr;
+    DeviceStream pmePpCommStream_;
     //! Remote location of PME coordinate data buffer
     void* remotePmeXBuffer_ = nullptr;
     //! Remote location of PME force data buffer
index 8672e450ca175fe46608e8672013980c89808139..ce70d0b049adbb5cc8dc099d144ecfc1c5bb0e64 100644 (file)
@@ -45,6 +45,7 @@ gmx_add_libgromacs_sources(
 if(GMX_USE_OPENCL)
     gmx_add_libgromacs_sources(
         device_context_ocl.cpp
+        device_stream_ocl.cpp
         gpu_utils_ocl.cpp
         ocl_compiler.cpp
         ocl_caching.cpp
@@ -53,10 +54,15 @@ if(GMX_USE_OPENCL)
 elseif(GMX_USE_CUDA)
     gmx_add_libgromacs_sources(
         cudautils.cu
+        device_stream.cu
         gpu_utils.cu
         pinning.cu
         pmalloc_cuda.cu
         )
+else()
+    gmx_add_libgromacs_sources(
+        device_stream.cpp
+    )
 endif()
 
 if (BUILD_TESTING)
index 71d9b7dac45dac38ac76b96357cdec6b91e0acde..48212bf3bc1293d9d5beedd876e53202e6a4c207 100644 (file)
@@ -216,25 +216,15 @@ static inline void rvec_inc(rvec a, const float3 b)
     rvec_inc(a, tmp);
 }
 
-/*! \brief Wait for all taks in stream \p s to complete.
- *
- * \param[in] s stream to synchronize with
- */
-static inline void gpuStreamSynchronize(cudaStream_t s)
-{
-    cudaError_t stat = cudaStreamSynchronize(s);
-    CU_RET_ERR(stat, "cudaStreamSynchronize failed");
-}
-
 /*! \brief  Returns true if all tasks in \p s have completed.
  *
- * \param[in] s stream to check
+ *  \param[in] deviceStream CUDA stream to check.
  *
- *  \returns     True if all tasks enqueued in the stream \p s (at the time of this call) have completed.
+ *  \returns True if all tasks enqueued in the stream \p deviceStream (at the time of this call) have completed.
  */
-static inline bool haveStreamTasksCompleted(cudaStream_t s)
+static inline bool haveStreamTasksCompleted(const DeviceStream& deviceStream)
 {
-    cudaError_t stat = cudaStreamQuery(s);
+    cudaError_t stat = cudaStreamQuery(deviceStream.stream());
 
     if (stat == cudaErrorNotReady)
     {
diff --git a/src/gromacs/gpu_utils/device_stream.cpp b/src/gromacs/gpu_utils/device_stream.cpp
new file mode 100644 (file)
index 0000000..1b5b016
--- /dev/null
@@ -0,0 +1,58 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the DeviceStream stub for CPU-only builds.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ */
+#include "gmxpre.h"
+
+#include "device_stream.h"
+
+DeviceStream::DeviceStream() = default;
+
+void DeviceStream::init(const DeviceInformation& /* deviceInfo */,
+                        const DeviceContext& /* deviceContext */,
+                        DeviceStreamPriority /* priority */,
+                        const bool /* useTiming */)
+{
+}
+
+DeviceStream::~DeviceStream() = default;
+
+void DeviceStream::synchronize() const {}
\ No newline at end of file
diff --git a/src/gromacs/gpu_utils/device_stream.cu b/src/gromacs/gpu_utils/device_stream.cu
new file mode 100644 (file)
index 0000000..8d0b484
--- /dev/null
@@ -0,0 +1,122 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the DeviceStream for CUDA.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ */
+#include "gmxpre.h"
+
+#include "device_stream.h"
+
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/stringutil.h"
+
+DeviceStream::DeviceStream()
+{
+    stream_ = nullptr;
+}
+
+void DeviceStream::init(const DeviceInformation& /* deviceInfo */,
+                        const DeviceContext& /* deviceContext */,
+                        DeviceStreamPriority priority,
+                        const bool /* useTiming */)
+{
+    cudaError_t stat;
+
+    if (priority == DeviceStreamPriority::Normal)
+    {
+        stat = cudaStreamCreate(&stream_);
+        if (stat != cudaSuccess)
+        {
+            GMX_THROW(gmx::InternalError(gmx::formatString(
+                    "Could not create CUDA stream (CUDA error %d: %s).", stat, cudaGetErrorString(stat))));
+        }
+    }
+    else if (priority == DeviceStreamPriority::High)
+    {
+        // Note that the device we're running on does not have to
+        // support priorities, because we are querying the priority
+        // range, which in that case will be a single value.
+        int highestPriority;
+        stat = cudaDeviceGetStreamPriorityRange(nullptr, &highestPriority);
+        if (stat != cudaSuccess)
+        {
+            GMX_THROW(gmx::InternalError(gmx::formatString(
+                    "Could not query CUDA stream priority range (CUDA error %d: %s).", stat,
+                    cudaGetErrorString(stat))));
+        }
+
+        stat = cudaStreamCreateWithPriority(&stream_, cudaStreamDefault, highestPriority);
+        if (stat != cudaSuccess)
+        {
+            GMX_THROW(gmx::InternalError(gmx::formatString(
+                    "Could not create CUDA stream with high priority (CUDA error %d: %s).", stat,
+                    cudaGetErrorString(stat))));
+        }
+    }
+}
+
+DeviceStream::~DeviceStream()
+{
+    if (stream_)
+    {
+        cudaError_t stat = cudaStreamDestroy(stream_);
+        GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                           gmx::formatString("Failed to release CUDA stream (CUDA error %d: %s).",
+                                             stat, cudaGetErrorString(stat))
+                                   .c_str());
+        stream_ = nullptr;
+    }
+}
+
+cudaStream_t DeviceStream::stream() const
+{
+    return stream_;
+}
+
+void DeviceStream::synchronize() const
+{
+    cudaError_t stat = cudaStreamSynchronize(stream_);
+    GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                       gmx::formatString("cudaStreamSynchronize failed (CUDA error %d: %s).", stat,
+                                         cudaGetErrorString(stat))
+                               .c_str());
+}
\ No newline at end of file
diff --git a/src/gromacs/gpu_utils/device_stream.h b/src/gromacs/gpu_utils/device_stream.h
new file mode 100644 (file)
index 0000000..2e654e5
--- /dev/null
@@ -0,0 +1,134 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_GPU_UTILS_DEVICE_STREAM_H
+#define GMX_GPU_UTILS_DEVICE_STREAM_H
+
+/*! \libinternal \file
+ *
+ * \brief Declarations for DeviceStream class.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ * \inlibraryapi
+ */
+
+#include "config.h"
+
+#if GMX_GPU == GMX_GPU_OPENCL
+#    include "gromacs/gpu_utils/gmxopencl.h"
+#endif
+#include "gromacs/utility/classhelpers.h"
+
+struct DeviceInformation;
+class DeviceContext;
+
+//! Enumeration describing the priority with which a stream operates.
+enum class DeviceStreamPriority : int
+{
+    //! High-priority stream
+    High,
+    //! Normal-priority stream
+    Normal,
+    //! Conventional termination of the enumeration
+    Count
+};
+
+//! \brief Wrapper around the platform-specific GPU stream (CUDA stream or OpenCL command queue)
+class DeviceStream
+{
+public:
+    //! Default constructor
+    DeviceStream();
+    //! Destructor
+    ~DeviceStream();
+
+    /*! \brief Initialize
+     *
+     * \param[in] deviceInfo     Platform-specific device information (only used in OpenCL).
+     * \param[in] deviceContext  Device context (only used in OpenCL).
+     * \param[in] priority       Stream priority: high or normal (only used in CUDA).
+     * \param[in] useTiming      If the timing should be enabled (only used in OpenCL).
+     */
+    void init(const DeviceInformation& deviceInfo,
+              const DeviceContext&     deviceContext,
+              DeviceStreamPriority     priority,
+              const bool               useTiming);
+
+    /*! \brief Construct and init.
+     *
+     * \param[in] deviceInfo     Platform-specific device information (only used in OpenCL).
+     * \param[in] deviceContext  Device context (only used in OpenCL).
+     * \param[in] priority       Stream priority: high or normal (only used in CUDA).
+     * \param[in] useTiming      If the timing should be enabled (only used in OpenCL).
+     */
+    DeviceStream(const DeviceInformation& deviceInfo,
+                 const DeviceContext&     deviceContext,
+                 DeviceStreamPriority     priority,
+                 const bool               useTiming)
+    {
+        init(deviceInfo, deviceContext, priority, useTiming);
+    }
+
+    //! Synchronize the stream (blocks the host thread until all enqueued work completes)
+    void synchronize() const;
+
+#if GMX_GPU == GMX_GPU_CUDA
+
+    //! Getter for the raw CUDA stream
+    cudaStream_t stream() const;
+    //! Setter (temporary, will be removed in the follow-up)
+    void setStream(cudaStream_t stream) { stream_ = stream; }
+
+private:
+    cudaStream_t stream_ = nullptr;
+
+#elif GMX_GPU == GMX_GPU_OPENCL
+
+    //! Getter for the raw OpenCL command queue
+    cl_command_queue stream() const;
+    //! Setter (temporary, will be removed in the follow-up)
+    void setStream(cl_command_queue stream) { stream_ = stream; }
+
+private:
+    cl_command_queue stream_ = nullptr;
+
+#endif
+
+    GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceStream);
+};
+
+#endif // GMX_GPU_UTILS_DEVICE_STREAM_H
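
Both construction patterns declared above work; a sketch of the two-phase variant, assuming `deviceInfo` and `deviceContext` are already available from device detection:

    DeviceStream stream;  // default-constructed; underlying handle is nullptr
    stream.init(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
    stream.synchronize(); // returns immediately while no work is pending
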
diff --git a/src/gromacs/gpu_utils/device_stream_ocl.cpp b/src/gromacs/gpu_utils/device_stream_ocl.cpp
new file mode 100644 (file)
index 0000000..013480a
--- /dev/null
@@ -0,0 +1,97 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the DeviceStream for OpenCL.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ */
+#include "gmxpre.h"
+
+#include "gromacs/gpu_utils/device_context_ocl.h"
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/gpu_utils/gputraits_ocl.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/stringutil.h"
+
+DeviceStream::DeviceStream()
+{
+    stream_ = nullptr;
+}
+
+void DeviceStream::init(const DeviceInformation& deviceInfo,
+                        const DeviceContext&     deviceContext,
+                        DeviceStreamPriority /* priority */,
+                        const bool useTiming)
+{
+    cl_command_queue_properties queueProperties = useTiming ? CL_QUEUE_PROFILING_ENABLE : 0;
+    cl_device_id                deviceId        = deviceInfo.oclDeviceId;
+    cl_int                      clError;
+    stream_ = clCreateCommandQueue(deviceContext.context(), deviceId, queueProperties, &clError);
+    if (clError != CL_SUCCESS)
+    {
+        GMX_THROW(gmx::InternalError(gmx::formatString(
+                "Failed to create OpenCL command queue on GPU %s (OpenCL error ID %d).",
+                deviceInfo.device_name, clError)));
+    }
+}
+
+DeviceStream::~DeviceStream()
+{
+    if (stream_)
+    {
+        cl_int clError = clReleaseCommandQueue(stream_);
+        GMX_RELEASE_ASSERT(
+                clError == CL_SUCCESS,
+                gmx::formatString("Failed to release OpenCL stream (OpenCL error ID %d).", clError).c_str());
+        stream_ = nullptr;
+    }
+}
+
+cl_command_queue DeviceStream::stream() const
+{
+    return stream_;
+}
+
+void DeviceStream::synchronize() const
+{
+    cl_int clError = clFinish(stream_);
+    GMX_RELEASE_ASSERT(
+            CL_SUCCESS == clError,
+            gmx::formatString("Error caught during clFinish (OpenCL error ID %d).", clError).c_str());
+}
\ No newline at end of file
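
On the OpenCL side the same interface produces a profiling-enabled command queue when timing is requested; a sketch under the same assumptions about `deviceInfo` and `deviceContext` as above:

    const bool       useTiming = true;
    DeviceStream     timedStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, useTiming);
    cl_command_queue queue = timedStream.stream(); // raw queue for clEnqueue* calls
    (void)queue;
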
index 59255bfa9332bdfd769f0739094a6452a8bc3688..d4bfe8c35b5b47fc6fd7c3f6190489b99fcc6abf 100644 (file)
@@ -96,7 +96,7 @@ void freeDeviceBuffer(DeviceBuffer* buffer)
  * \param[in]     hostBuffer           Pointer to the raw host-side memory, also typed \p ValueType
  * \param[in]     startingOffset       Offset (in values) at the device-side buffer to copy into.
  * \param[in]     numValues            Number of values to copy.
- * \param[in]     stream               GPU stream to perform asynchronous copy in.
+ * \param[in]     deviceStream         GPU stream to perform asynchronous copy in.
  * \param[in]     transferKind         Copy type: synchronous or asynchronous.
  * \param[out]    timingEvent          A dummy pointer to the H2D copy timing event to be filled in.
  *                                     Not used in CUDA implementation.
@@ -106,7 +106,7 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
                         const ValueType*         hostBuffer,
                         size_t                   startingOffset,
                         size_t                   numValues,
-                        CommandStream            stream,
+                        const DeviceStream&      deviceStream,
                         GpuApiCallBehavior       transferKind,
                         CommandEvent* /*timingEvent*/)
 {
@@ -125,7 +125,7 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
             GMX_ASSERT(isHostMemoryPinned(hostBuffer),
                        "Source host buffer was not pinned for CUDA");
             stat = cudaMemcpyAsync(*((ValueType**)buffer) + startingOffset, hostBuffer, bytes,
-                                   cudaMemcpyHostToDevice, stream);
+                                   cudaMemcpyHostToDevice, deviceStream.stream());
             GMX_RELEASE_ASSERT(stat == cudaSuccess, "Asynchronous H2D copy failed");
             break;
 
@@ -150,7 +150,7 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
  * \param[in]     buffer               Pointer to the device-side buffer
  * \param[in]     startingOffset       Offset (in values) at the device-side buffer to copy from.
  * \param[in]     numValues            Number of values to copy.
- * \param[in]     stream               GPU stream to perform asynchronous copy in.
+ * \param[in]     deviceStream         GPU stream to perform asynchronous copy in.
  * \param[in]     transferKind         Copy type: synchronous or asynchronous.
 * \param[out]    timingEvent          A dummy pointer to the D2H copy timing event to be filled in.
  *                                     Not used in CUDA implementation.
@@ -160,7 +160,7 @@ void copyFromDeviceBuffer(ValueType*               hostBuffer,
                           DeviceBuffer<ValueType>* buffer,
                           size_t                   startingOffset,
                           size_t                   numValues,
-                          CommandStream            stream,
+                          const DeviceStream&      deviceStream,
                           GpuApiCallBehavior       transferKind,
                           CommandEvent* /*timingEvent*/)
 {
@@ -175,7 +175,7 @@ void copyFromDeviceBuffer(ValueType*               hostBuffer,
             GMX_ASSERT(isHostMemoryPinned(hostBuffer),
                        "Destination host buffer was not pinned for CUDA");
             stat = cudaMemcpyAsync(hostBuffer, *((ValueType**)buffer) + startingOffset, bytes,
-                                   cudaMemcpyDeviceToHost, stream);
+                                   cudaMemcpyDeviceToHost, deviceStream.stream());
             GMX_RELEASE_ASSERT(stat == cudaSuccess, "Asynchronous D2H copy failed");
             break;
 
@@ -196,16 +196,20 @@ void copyFromDeviceBuffer(ValueType*               hostBuffer,
  * \param[in,out] buffer          Pointer to the device-side buffer
  * \param[in]     startingOffset  Offset (in values) at the device-side buffer to start clearing at.
  * \param[in]     numValues       Number of values to clear.
- * \param[in]     stream          GPU stream.
+ * \param[in]     deviceStream    GPU stream.
  */
 template<typename ValueType>
-void clearDeviceBufferAsync(DeviceBuffer<ValueType>* buffer, size_t startingOffset, size_t numValues, CommandStream stream)
+void clearDeviceBufferAsync(DeviceBuffer<ValueType>* buffer,
+                            size_t                   startingOffset,
+                            size_t                   numValues,
+                            const DeviceStream&      deviceStream)
 {
     GMX_ASSERT(buffer, "needs a buffer pointer");
     const size_t bytes   = numValues * sizeof(ValueType);
     const char   pattern = 0;
 
-    cudaError_t stat = cudaMemsetAsync(*((ValueType**)buffer) + startingOffset, pattern, bytes, stream);
+    cudaError_t stat = cudaMemsetAsync(*((ValueType**)buffer) + startingOffset, pattern, bytes,
+                                       deviceStream.stream());
     GMX_RELEASE_ASSERT(stat == cudaSuccess, "Couldn't clear the device buffer");
 }
 
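
With the new signatures, every copy and clear takes a DeviceStream, including synchronous transfers; a round-trip sketch (sizes are illustrative, and `deviceContext`/`deviceStream` are assumed to be initialized):

    #include <vector>

    void sketchRoundTrip(const DeviceContext& deviceContext, const DeviceStream& deviceStream)
    {
        const int           numValues = 128;
        std::vector<float>  h_data(numValues, 1.0F);
        DeviceBuffer<float> d_data;
        allocateDeviceBuffer(&d_data, numValues, deviceContext);
        copyToDeviceBuffer(&d_data, h_data.data(), 0, numValues, deviceStream,
                           GpuApiCallBehavior::Sync, nullptr);
        // Work in the same stream executes in order, so the synchronous copy
        // below observes the cleared (zeroed) buffer.
        clearDeviceBufferAsync(&d_data, 0, numValues, deviceStream);
        copyFromDeviceBuffer(h_data.data(), &d_data, 0, numValues, deviceStream,
                             GpuApiCallBehavior::Sync, nullptr);
        freeDeviceBuffer(&d_data);
    }
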
index ee1adc1cce89e48958309aa55d4d01c0436a9899..05be260b891c58f1e6ff8b820ce7deb7026d303f 100644 (file)
@@ -108,7 +108,7 @@ void freeDeviceBuffer(DeviceBuffer* buffer)
  * \param[in]     hostBuffer           Pointer to the raw host-side memory, also typed \p ValueType
  * \param[in]     startingOffset       Offset (in values) at the device-side buffer to copy into.
  * \param[in]     numValues            Number of values to copy.
- * \param[in]     stream               GPU stream to perform asynchronous copy in.
+ * \param[in]     deviceStream         GPU stream to perform asynchronous copy in.
  * \param[in]     transferKind         Copy type: synchronous or asynchronous.
  * \param[out]    timingEvent          A pointer to the H2D copy timing event to be filled in.
  *                                     If the pointer is not null, the event can further be used
@@ -119,7 +119,7 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
                         const ValueType*         hostBuffer,
                         size_t                   startingOffset,
                         size_t                   numValues,
-                        CommandStream            stream,
+                        const DeviceStream&      deviceStream,
                         GpuApiCallBehavior       transferKind,
                         CommandEvent*            timingEvent)
 {
@@ -135,8 +135,8 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
     switch (transferKind)
     {
         case GpuApiCallBehavior::Async:
-            clError = clEnqueueWriteBuffer(stream, *buffer, CL_FALSE, offset, bytes, hostBuffer, 0,
-                                           nullptr, timingEvent);
+            clError = clEnqueueWriteBuffer(deviceStream.stream(), *buffer, CL_FALSE, offset, bytes,
+                                           hostBuffer, 0, nullptr, timingEvent);
             GMX_RELEASE_ASSERT(
                     clError == CL_SUCCESS,
                     gmx::formatString("Asynchronous H2D copy failed (OpenCL error %d: %s)", clError,
@@ -145,8 +145,8 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
             break;
 
         case GpuApiCallBehavior::Sync:
-            clError = clEnqueueWriteBuffer(stream, *buffer, CL_TRUE, offset, bytes, hostBuffer, 0,
-                                           nullptr, timingEvent);
+            clError = clEnqueueWriteBuffer(deviceStream.stream(), *buffer, CL_TRUE, offset, bytes,
+                                           hostBuffer, 0, nullptr, timingEvent);
             GMX_RELEASE_ASSERT(
                     clError == CL_SUCCESS,
                     gmx::formatString("Synchronous H2D copy failed (OpenCL error %d: %s)", clError,
@@ -168,7 +168,7 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
  * \param[in]     buffer               Pointer to the device-side buffer
  * \param[in]     startingOffset       Offset (in values) at the device-side buffer to copy from.
  * \param[in]     numValues            Number of values to copy.
- * \param[in]     stream               GPU stream to perform asynchronous copy in.
+ * \param[in]     deviceStream         GPU stream to perform asynchronous copy in.
  * \param[in]     transferKind         Copy type: synchronous or asynchronous.
 * \param[out]    timingEvent          A pointer to the D2H copy timing event to be filled in.
  *                                     If the pointer is not null, the event can further be used
@@ -179,7 +179,7 @@ void copyFromDeviceBuffer(ValueType*               hostBuffer,
                           DeviceBuffer<ValueType>* buffer,
                           size_t                   startingOffset,
                           size_t                   numValues,
-                          CommandStream            stream,
+                          const DeviceStream&      deviceStream,
                           GpuApiCallBehavior       transferKind,
                           CommandEvent*            timingEvent)
 {
@@ -191,8 +191,8 @@ void copyFromDeviceBuffer(ValueType*               hostBuffer,
     switch (transferKind)
     {
         case GpuApiCallBehavior::Async:
-            clError = clEnqueueReadBuffer(stream, *buffer, CL_FALSE, offset, bytes, hostBuffer, 0,
-                                          nullptr, timingEvent);
+            clError = clEnqueueReadBuffer(deviceStream.stream(), *buffer, CL_FALSE, offset, bytes,
+                                          hostBuffer, 0, nullptr, timingEvent);
             GMX_RELEASE_ASSERT(
                     clError == CL_SUCCESS,
                     gmx::formatString("Asynchronous D2H copy failed (OpenCL error %d: %s)", clError,
@@ -201,8 +201,8 @@ void copyFromDeviceBuffer(ValueType*               hostBuffer,
             break;
 
         case GpuApiCallBehavior::Sync:
-            clError = clEnqueueReadBuffer(stream, *buffer, CL_TRUE, offset, bytes, hostBuffer, 0,
-                                          nullptr, timingEvent);
+            clError = clEnqueueReadBuffer(deviceStream.stream(), *buffer, CL_TRUE, offset, bytes,
+                                          hostBuffer, 0, nullptr, timingEvent);
             GMX_RELEASE_ASSERT(
                     clError == CL_SUCCESS,
                     gmx::formatString("Synchronous D2H copy failed (OpenCL error %d: %s)", clError,
@@ -221,10 +221,13 @@ void copyFromDeviceBuffer(ValueType*               hostBuffer,
  * \param[in,out] buffer          Pointer to the device-side buffer
  * \param[in]     startingOffset  Offset (in values) at the device-side buffer to start clearing at.
  * \param[in]     numValues       Number of values to clear.
- * \param[in]     stream          GPU stream.
+ * \param[in]     deviceStream    GPU stream.
  */
 template<typename ValueType>
-void clearDeviceBufferAsync(DeviceBuffer<ValueType>* buffer, size_t startingOffset, size_t numValues, CommandStream stream)
+void clearDeviceBufferAsync(DeviceBuffer<ValueType>* buffer,
+                            size_t                   startingOffset,
+                            size_t                   numValues,
+                            const DeviceStream&      deviceStream)
 {
     GMX_ASSERT(buffer, "needs a buffer pointer");
     const size_t    offset        = startingOffset * sizeof(ValueType);
@@ -233,8 +236,8 @@ void clearDeviceBufferAsync(DeviceBuffer<ValueType>* buffer, size_t startingOffs
     const cl_uint   numWaitEvents = 0;
     const cl_event* waitEvents    = nullptr;
     cl_event        commandEvent;
-    cl_int clError = clEnqueueFillBuffer(stream, *buffer, &pattern, sizeof(pattern), offset, bytes,
-                                         numWaitEvents, waitEvents, &commandEvent);
+    cl_int clError = clEnqueueFillBuffer(deviceStream.stream(), *buffer, &pattern, sizeof(pattern),
+                                         offset, bytes, numWaitEvents, waitEvents, &commandEvent);
     GMX_RELEASE_ASSERT(clError == CL_SUCCESS,
                        gmx::formatString("Couldn't clear the device buffer (OpenCL error %d: %s)",
                                          clError, ocl_get_error_string(clError).c_str())
index e2e6ac87044ecf5e838874214d49b0dd4e76f23b..69487de58652a4e6cb28c96fc0ac9c1e8b34c0a1 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -80,9 +80,9 @@ public:
     /*! \brief Marks the synchronization point in \p deviceStream.
      * Should be followed by waitForEvent().
      */
-    inline void markEvent(CommandStream stream)
+    inline void markEvent(const DeviceStream& deviceStream)
     {
-        cudaError_t gmx_used_in_debug stat = cudaEventRecord(event_, stream);
+        cudaError_t gmx_used_in_debug stat = cudaEventRecord(event_, deviceStream.stream());
         GMX_ASSERT(stat == cudaSuccess, "cudaEventRecord failed");
     }
     /*! \brief Synchronizes the host thread on the marked event. */
@@ -92,9 +92,9 @@ public:
         GMX_ASSERT(stat == cudaSuccess, "cudaEventSynchronize failed");
     }
     /*! \brief Enqueues a wait for the recorded event in \p deviceStream */
-    inline void enqueueWaitEvent(CommandStream stream)
+    inline void enqueueWaitEvent(const DeviceStream& deviceStream)
     {
-        cudaError_t gmx_used_in_debug stat = cudaStreamWaitEvent(stream, event_, 0);
+        cudaError_t gmx_used_in_debug stat = cudaStreamWaitEvent(deviceStream.stream(), event_, 0);
         GMX_ASSERT(stat == cudaSuccess, "cudaStreamWaitEvent failed");
     }
 
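
A producer/consumer sketch of the updated event interface, assuming `producerStream` and `consumerStream` are initialized DeviceStream objects:

    GpuEventSynchronizer readyForConsumer;
    // ... enqueue producer work in producerStream ...
    readyForConsumer.markEvent(producerStream);
    // Work enqueued in consumerStream after this call waits on the device for
    // the event, without blocking the host thread.
    readyForConsumer.enqueueWaitEvent(consumerStream);
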
index b9298f385fcc82ad5173927a8585d23e521660e6..9a62b5b6d47efdab66a56cdfa91377d93ae58501 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -85,10 +85,10 @@ public:
     /*! \brief Marks the synchronization point in \p deviceStream.
      * Should be called first and then followed by waitForEvent().
      */
-    inline void markEvent(CommandStream stream)
+    inline void markEvent(const DeviceStream& deviceStream)
     {
         GMX_ASSERT(nullptr == event_, "Do not call markEvent more than once!");
-        cl_int clError = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &event_);
+        cl_int clError = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &event_);
         if (CL_SUCCESS != clError)
         {
             GMX_THROW(gmx::InternalError("Failed to enqueue the GPU synchronization event: "
@@ -112,9 +112,9 @@ public:
      *  After enqueue, the associated event is released, so this method should
      *  be only called once per markEvent() call.
      */
-    inline void enqueueWaitEvent(CommandStream stream)
+    inline void enqueueWaitEvent(const DeviceStream& deviceStream)
     {
-        cl_int clError = clEnqueueBarrierWithWaitList(stream, 1, &event_, nullptr);
+        cl_int clError = clEnqueueBarrierWithWaitList(deviceStream.stream(), 1, &event_, nullptr);
         if (CL_SUCCESS != clError)
         {
             GMX_THROW(gmx::InternalError("Failed to enqueue device barrier for the GPU event: "
index 52e36c038ae999a4c01d1fb2739346041376b16c..c56d60da61dc17f00e29cff4ee1f993dca01dbbd 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -79,15 +79,17 @@ public:
     GpuRegionTimerImpl(GpuRegionTimerImpl&&) = delete;
 
     /*! \brief Will be called before the region start. */
-    inline void openTimingRegion(CommandStream s)
+    inline void openTimingRegion(const DeviceStream& deviceStream)
     {
-        CU_RET_ERR(cudaEventRecord(eventStart_, s), "GPU timing recording failure");
+        CU_RET_ERR(cudaEventRecord(eventStart_, deviceStream.stream()),
+                   "GPU timing recording failure");
     }
 
     /*! \brief Will be called after the region end. */
-    inline void closeTimingRegion(CommandStream s)
+    inline void closeTimingRegion(const DeviceStream& deviceStream)
     {
-        CU_RET_ERR(cudaEventRecord(eventStop_, s), "GPU timing recording failure");
+        CU_RET_ERR(cudaEventRecord(eventStop_, deviceStream.stream()),
+                   "GPU timing recording failure");
     }
 
     /*! \brief Returns the last measured region timespan (in milliseconds) and calls reset() */
index 6798159ac9af5ddfdb717cbb778fa63ccc9acc30..f0860b164d9cbdbe61aade8bf3d5e127c5ed08cc 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -84,9 +84,9 @@ public:
     /*! \brief
      * To be called before the region start.
      *
-     * \param[in] s   The GPU command stream where the event being measured takes place.
+     * \param[in] deviceStream   The GPU command stream where the event being measured takes place.
      */
-    void openTimingRegion(CommandStream s)
+    void openTimingRegion(const DeviceStream& deviceStream)
     {
         if (c_debugTimerState)
         {
@@ -96,14 +96,14 @@ public:
             GMX_ASSERT(debugState_ == TimerState::Idle, error.c_str());
             debugState_ = TimerState::Recording;
         }
-        impl_.openTimingRegion(s);
+        impl_.openTimingRegion(deviceStream);
     }
     /*! \brief
      * To be called after the region end.
      *
-     * \param[in] s   The GPU command stream where the event being measured takes place.
+     * \param[in] deviceStream   The GPU command stream where the event being measured takes place.
      */
-    void closeTimingRegion(CommandStream s)
+    void closeTimingRegion(const DeviceStream& deviceStream)
     {
         if (c_debugTimerState)
         {
@@ -114,7 +114,7 @@ public:
             debugState_ = TimerState::Stopped;
         }
         callCount_++;
-        impl_.closeTimingRegion(s);
+        impl_.closeTimingRegion(deviceStream);
     }
     /*! \brief
      * Accumulates the last timespan of all the events used into the total duration,
index 3c1d9b2b84b12fd6970b1d32234e1df227f77d1c..788e41de5d4b97a829c0fc5e4a51293d0dcb1cb8 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -82,9 +82,9 @@ public:
     GpuRegionTimerImpl(GpuRegionTimerImpl&&) = delete;
 
     /*! \brief Should be called before the region start. */
-    inline void openTimingRegion(CommandStream /*unused*/) {}
+    inline void openTimingRegion(const DeviceStream& /*unused*/) {}
     /*! \brief Should be called after the region end. */
-    inline void closeTimingRegion(CommandStream /*unused*/) {}
+    inline void closeTimingRegion(const DeviceStream& /*unused*/) {}
     /*! \brief Returns the last measured region timespan (in milliseconds) and calls reset(). */
     inline double getLastRangeTime()
     {
index b477cdcb4ce4b5b5bbbf6a20efaeb8df28bf7f25..ec3424a8f448537cece8e866ad3614a75157fc57 100644 (file)
@@ -45,6 +45,8 @@
  * \ingroup module_gpu_utils
  */
 
+#include "gromacs/gpu_utils/device_stream.h"
+
 /*! \brief CUDA device information.
  *
  * The CUDA device information is queried and set at detection and contains
@@ -61,8 +63,6 @@ struct DeviceInformation
     int stat;
 };
 
-//! \brief GPU command stream
-using CommandStream = cudaStream_t;
 //! \brief Single GPU call timing event - meaningless in CUDA
 using CommandEvent = void;
 
@@ -73,10 +73,10 @@ using CommandEvent = void;
  */
 struct KernelLaunchConfig
 {
-    size_t        gridSize[3]      = { 1, 1, 1 }; //!< Block counts
-    size_t        blockSize[3]     = { 1, 1, 1 }; //!< Per-block thread counts
-    size_t        sharedMemorySize = 0;           //!< Shared memory size in bytes
-    CommandStream stream           = nullptr;     //!< Stream to launch kernel in
+    size_t       gridSize[3]      = { 1, 1, 1 }; //!< Block counts
+    size_t       blockSize[3]     = { 1, 1, 1 }; //!< Per-block thread counts
+    size_t       sharedMemorySize = 0;           //!< Shared memory size in bytes
+    cudaStream_t stream           = nullptr;     //!< Stream to launch kernel in
 };
 
 //! Sets whether device code can use arrays that are embedded in structs.
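
A sketch of filling the launch configuration under the new convention, where the raw cudaStream_t is extracted from the wrapper at the call site (the block and grid sizes here are illustrative):

    void sketchLaunchConfig(const DeviceStream& deviceStream, size_t numBlocks)
    {
        KernelLaunchConfig config;
        config.gridSize[0]      = numBlocks;
        config.blockSize[0]     = 64;
        config.sharedMemorySize = 0;
        config.stream           = deviceStream.stream(); // raw stream, not the wrapper
    }
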
index a36a5cc3bcf5b8ad3db267a42f873c4547712147..5fec00303adc59588557726d188efe5d6d9e2f84 100644 (file)
@@ -63,8 +63,6 @@ struct DeviceInformation
     // No member needed
 };
 
-//! \brief GPU command stream
-using CommandStream = void*;
 //! \brief Single GPU call timing event
 using CommandEvent = void*;
 
index caf837552a453222ede9f11df25f7bc3a9987353..0438c084d1a5348522bca08c6fecdc864e23c846 100644 (file)
@@ -79,8 +79,6 @@ struct DeviceInformation
     size_t maxWorkGroupSize; //!< Workgroup total size limit (CL_DEVICE_MAX_WORK_GROUP_SIZE).
 };
 
-//! \brief GPU command stream
-using CommandStream = cl_command_queue;
 //! \brief Single GPU call timing event
 using CommandEvent = cl_event;
 
@@ -91,10 +89,10 @@ using CommandEvent = cl_event;
  */
 struct KernelLaunchConfig
 {
-    size_t        gridSize[3]      = { 1, 1, 1 }; //!< Work groups (CUDA blocks) counts
-    size_t        blockSize[3]     = { 1, 1, 1 }; //!< Per work group (CUDA block) thread counts
-    size_t        sharedMemorySize = 0;           //!< Shared memory size in bytes
-    CommandStream stream           = nullptr;     //!< Stream to launch kernel in
+    size_t           gridSize[3]      = { 1, 1, 1 }; //!< Work-group (CUDA block) counts
+    size_t           blockSize[3]     = { 1, 1, 1 }; //!< Per-work-group (CUDA block) thread counts
+    size_t           sharedMemorySize = 0;           //!< Shared memory size in bytes
+    cl_command_queue stream           = nullptr;     //!< Stream to launch kernel in
 };
 
 /*! \brief Sets whether device code can use arrays that are embedded in structs.
index 90f5b04bfe88888664627eb33f7b30ed35421aa2..5e36d91ffd8309fa93f81945f42d54c225b4cdab 100644 (file)
@@ -45,6 +45,7 @@
 #include <string>
 
 #include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/gmxopencl.h"
 #include "gromacs/gpu_utils/gputraits_ocl.h"
 #include "gromacs/utility/exceptions.h"
@@ -130,17 +131,6 @@ void pfree(void* h_ptr);
 /*! \brief Convert error code to diagnostic string */
 std::string ocl_get_error_string(cl_int error);
 
-/*! \brief Calls clFinish() in the stream \p s.
- *
- * \param[in] s stream to synchronize with
- */
-static inline void gpuStreamSynchronize(cl_command_queue s)
-{
-    cl_int cl_error = clFinish(s);
-    GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error,
-                       ("Error caught during clFinish:" + ocl_get_error_string(cl_error)).c_str());
-}
-
 //! A debug checker to track cl_events being released correctly
 inline void ensureReferenceCount(const cl_event& event, unsigned int refCount)
 {
@@ -156,11 +146,9 @@ inline void ensureReferenceCount(const cl_event& event, unsigned int refCount)
 
 /*! \brief Check whether all enqueued tasks in an OpenCL stream have completed (dummy implementation).
  *
- * \param[in] s queue to check
- *
- *  \returns     True if all tasks enqueued in the stream \p s (at the time of this call) have completed.
+ *  \returns  Nothing meaningful; this check is not implemented for OpenCL and always asserts.
  */
-static inline bool haveStreamTasksCompleted(cl_command_queue gmx_unused s)
+static inline bool haveStreamTasksCompleted(const DeviceStream& /* deviceStream */)
 {
     GMX_RELEASE_ASSERT(false, "haveStreamTasksCompleted is not implemented for OpenCL");
     return false;
index e16dd8ebf546bd201fc40ab3fe8ad532a3b2b47a..682035bb5f460ab9610eefd17dc4d4458c3aaa24 100644 (file)
@@ -112,12 +112,13 @@ void convertRVecToFloat3OnDevice(std::vector<gmx::RVec>& h_rVecOutput, const std
 {
     DeviceInformation   deviceInfo;
     const DeviceContext deviceContext(deviceInfo);
+    const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
 
     const int numElements = h_rVecInput.size();
 
     DeviceBuffer<RVec> d_rVecInput;
     allocateDeviceBuffer(&d_rVecInput, numElements, deviceContext);
-    copyToDeviceBuffer(&d_rVecInput, h_rVecInput.data(), 0, numElements, nullptr,
+    copyToDeviceBuffer(&d_rVecInput, h_rVecInput.data(), 0, numElements, deviceStream,
                        GpuApiCallBehavior::Sync, nullptr);
 
     DeviceBuffer<float3> d_float3Output;
@@ -131,14 +132,14 @@ void convertRVecToFloat3OnDevice(std::vector<gmx::RVec>& h_rVecOutput, const std
     kernelLaunchConfig.blockSize[1]     = 1;
     kernelLaunchConfig.blockSize[2]     = 1;
     kernelLaunchConfig.sharedMemorySize = 0;
-    kernelLaunchConfig.stream           = nullptr;
+    kernelLaunchConfig.stream           = deviceStream.stream();
 
     auto       kernelPtr  = convertRVecToFloat3OnDevice_kernel;
     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, kernelLaunchConfig,
                                                       &d_float3Output, &d_rVecInput, &numElements);
     launchGpuKernel(kernelPtr, kernelLaunchConfig, nullptr, "convertRVecToFloat3OnDevice_kernel", kernelArgs);
 
-    copyFromDeviceBuffer(h_float3Output.data(), &d_float3Output, 0, numElements, nullptr,
+    copyFromDeviceBuffer(h_float3Output.data(), &d_float3Output, 0, numElements, deviceStream,
                          GpuApiCallBehavior::Sync, nullptr);
 
     saveFloat3InRVecFormat(h_rVecOutput, h_float3Output.data(), numElements);
index b1c69d45721fd00ec7aa8b67e45578e93a479fb9..e2c114ce4bf65b98fbde7e702c36f9777127fc8b 100644 (file)
@@ -56,6 +56,8 @@
 #include "gromacs/utility/classhelpers.h"
 
 class DeviceContext;
+class DeviceStream;
+
 struct gmx_enerdata_t;
 struct gmx_ffparams_t;
 struct gmx_mtop_t;
@@ -109,7 +111,7 @@ public:
     //! Construct the manager with constant data and the stream to use.
     GpuBonded(const gmx_ffparams_t& ffparams,
               const DeviceContext&  deviceContext,
-              void*                 streamPtr,
+              const DeviceStream&   deviceStream,
               gmx_wallcycle*        wcycle);
     //! Destructor
     ~GpuBonded();
index f24103229be93815e61855a81288dfa7c8572cd4..ec8e43323eac98f93617808da2bb0fd307451f4b 100644 (file)
@@ -162,7 +162,7 @@ class GpuBonded::Impl
 
 GpuBonded::GpuBonded(const gmx_ffparams_t& /* ffparams */,
                      const DeviceContext& /* deviceContext */,
-                     void* /*streamPtr */,
+                     const DeviceStream& /* deviceStream */,
                      gmx_wallcycle* /* wcycle */) :
     impl_(nullptr)
 {
index 763550c5c9e52d891cf574a92edb09c1bfc92b59..0d5367f698f54e46a8e13a112dc852130f1fa2a2 100644 (file)
@@ -66,22 +66,22 @@ namespace gmx
 
 GpuBonded::Impl::Impl(const gmx_ffparams_t& ffparams,
                       const DeviceContext&  deviceContext,
-                      void*                 streamPtr,
+                      const DeviceStream&   deviceStream,
                       gmx_wallcycle*        wcycle) :
-    deviceContext_(deviceContext)
+    deviceContext_(deviceContext),
+    deviceStream_(deviceStream)
 {
-    stream_ = *static_cast<CommandStream*>(streamPtr);
     wcycle_ = wcycle;
 
     allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), deviceContext_);
     // This could be an async transfer (if the source is pinned), so
     // long as it uses the same stream as the kernels and we are happy
     // to consume additional pinned pages.
-    copyToDeviceBuffer(&d_forceParams_, ffparams.iparams.data(), 0, ffparams.numTypes(), stream_,
-                       GpuApiCallBehavior::Sync, nullptr);
+    copyToDeviceBuffer(&d_forceParams_, ffparams.iparams.data(), 0, ffparams.numTypes(),
+                       deviceStream_, GpuApiCallBehavior::Sync, nullptr);
     vTot_.resize(F_NRE);
     allocateDeviceBuffer(&d_vTot_, F_NRE, deviceContext_);
-    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_);
+    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream_);
 
     kernelParams_.d_forceParams = d_forceParams_;
     kernelParams_.d_xq          = d_xq_;
@@ -212,7 +212,7 @@ void GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef<const int>
             reallocateDeviceBuffer(&d_iList.iatoms, iList.size(), &d_iList.nr, &d_iList.nalloc,
                                    deviceContext_);
 
-            copyToDeviceBuffer(&d_iList.iatoms, iList.iatoms.data(), 0, iList.size(), stream_,
+            copyToDeviceBuffer(&d_iList.iatoms, iList.iatoms.data(), 0, iList.size(), deviceStream_,
                                GpuApiCallBehavior::Async, nullptr);
         }
         kernelParams_.fTypesOnGpu[fTypesCounter]    = fType;
@@ -270,7 +270,7 @@ void GpuBonded::Impl::launchEnergyTransfer()
     wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_BONDED);
     // TODO add conditional on whether there has been any compute (and make sure host buffer doesn't contain garbage)
     float* h_vTot = vTot_.data();
-    copyFromDeviceBuffer(h_vTot, &d_vTot_, 0, F_NRE, stream_, GpuApiCallBehavior::Async, nullptr);
+    copyFromDeviceBuffer(h_vTot, &d_vTot_, 0, F_NRE, deviceStream_, GpuApiCallBehavior::Async, nullptr);
     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED);
 }
 
@@ -281,7 +281,7 @@ void GpuBonded::Impl::waitAccumulateEnergyTerms(gmx_enerdata_t* enerd)
                "accumulation should not occur");
 
     wallcycle_start(wcycle_, ewcWAIT_GPU_BONDED);
-    cudaError_t stat = cudaStreamSynchronize(stream_);
+    cudaError_t stat = cudaStreamSynchronize(deviceStream_.stream());
     CU_RET_ERR(stat, "D2H transfer of bonded energies failed");
     wallcycle_stop(wcycle_, ewcWAIT_GPU_BONDED);
 
@@ -304,7 +304,7 @@ void GpuBonded::Impl::clearEnergies()
 {
     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
     wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_BONDED);
-    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_);
+    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream_);
     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED);
     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
 }
@@ -313,9 +313,9 @@ void GpuBonded::Impl::clearEnergies()
 
 GpuBonded::GpuBonded(const gmx_ffparams_t& ffparams,
                      const DeviceContext&  deviceContext,
-                     void*                 streamPtr,
+                     const DeviceStream&   deviceStream,
                      gmx_wallcycle*        wcycle) :
-    impl_(new Impl(ffparams, deviceContext, streamPtr, wcycle))
+    impl_(new Impl(ffparams, deviceContext, deviceStream, wcycle))
 {
 }
 
index a0da918893fa0340fc2b78bc81967c80e528b20f..32cce2599aae46e2d2ce21f05403d9966cad262d 100644 (file)
@@ -126,7 +126,10 @@ class GpuBonded::Impl
 {
 public:
     //! Constructor
-    Impl(const gmx_ffparams_t& ffparams, const DeviceContext& deviceContext, void* streamPtr, gmx_wallcycle* wcycle);
+    Impl(const gmx_ffparams_t& ffparams,
+         const DeviceContext&  deviceContext,
+         const DeviceStream&   deviceStream,
+         gmx_wallcycle*        wcycle);
     /*! \brief Destructor, non-default needed for freeing
      * device-side buffers */
     ~Impl();
@@ -183,7 +186,7 @@ private:
     //! GPU context object
     const DeviceContext& deviceContext_;
     //! \brief Bonded GPU stream, not owned by this module
-    CommandStream stream_;
+    const DeviceStream& deviceStream_;
 
     //! Parameters and pointers, passed to the CUDA kernel
     BondedCudaKernelParameters kernelParams_;
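
The member change above shows the ownership pattern this commit applies throughout: modules keep a const reference to a stream they do not own, bound at construction. A generic sketch (the class name is hypothetical):

    class SomeGpuModule
    {
    public:
        SomeGpuModule(const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
            deviceContext_(deviceContext),
            deviceStream_(deviceStream)
        {
        }

    private:
        const DeviceContext& deviceContext_; // not owned
        const DeviceStream&  deviceStream_;  // not owned; must outlive this module
    };
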
index 166e5bb87d12da0d56727dc03ac023a5b8fc88c9..e03a3f1fa1b3683a170ccb0943a13423f9684ff3 100644 (file)
@@ -850,7 +850,7 @@ void GpuBonded::Impl::launchKernel(const t_forcerec* fr, const matrix box)
     config.gridSize[0]  = (fTypeRangeEnd + TPB_BONDED) / TPB_BONDED;
     config.gridSize[1]  = 1;
     config.gridSize[2]  = 1;
-    config.stream       = stream_;
+    config.stream       = deviceStream_.stream();
 
     auto kernelPtr            = exec_kernel_gpu<calcVir, calcEner>;
     kernelParams_.scaleFactor = fr->ic->epsfac * fr->fudgeQQ;
index b77162c1af47f363499338346efc02826f8057af..a7e19c922d40b65ce4f8dc44a376ecb1031c870c 100644 (file)
@@ -287,7 +287,7 @@ void LeapFrogGpu::integrate(const float3*                     d_x,
                 h_lambdas_[i] = tcstat[i].lambda;
             }
             copyToDeviceBuffer(&d_lambdas_, h_lambdas_.data(), 0, numTempScaleValues_,
-                               commandStream_, GpuApiCallBehavior::Async, nullptr);
+                               deviceStream_, GpuApiCallBehavior::Async, nullptr);
         }
         VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
         if (doParrinelloRahman)
@@ -316,9 +316,9 @@ void LeapFrogGpu::integrate(const float3*                     d_x,
     return;
 }
 
-LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext, CommandStream commandStream) :
+LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
     deviceContext_(deviceContext),
-    commandStream_(commandStream)
+    deviceStream_(deviceStream)
 {
     numAtoms_ = 0;
 
@@ -328,7 +328,7 @@ LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext, CommandStream comma
     kernelLaunchConfig_.blockSize[1]     = 1;
     kernelLaunchConfig_.blockSize[2]     = 1;
     kernelLaunchConfig_.sharedMemorySize = 0;
-    kernelLaunchConfig_.stream           = commandStream_;
+    kernelLaunchConfig_.stream           = deviceStream_.stream();
 }
 
 LeapFrogGpu::~LeapFrogGpu()
@@ -345,7 +345,7 @@ void LeapFrogGpu::set(const t_mdatoms& md, const int numTempScaleValues, const u
 
     reallocateDeviceBuffer(&d_inverseMasses_, numAtoms_, &numInverseMasses_,
                            &numInverseMassesAlloc_, deviceContext_);
-    copyToDeviceBuffer(&d_inverseMasses_, (float*)md.invmass, 0, numAtoms_, commandStream_,
+    copyToDeviceBuffer(&d_inverseMasses_, (float*)md.invmass, 0, numAtoms_, deviceStream_,
                        GpuApiCallBehavior::Sync, nullptr);
 
     // Temperature scale group map only used if there are more then one group
@@ -353,7 +353,7 @@ void LeapFrogGpu::set(const t_mdatoms& md, const int numTempScaleValues, const u
     {
         reallocateDeviceBuffer(&d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_,
                                &numTempScaleGroupsAlloc_, deviceContext_);
-        copyToDeviceBuffer(&d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, commandStream_,
+        copyToDeviceBuffer(&d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, deviceStream_,
                            GpuApiCallBehavior::Sync, nullptr);
     }
 
index 26a6fc7399a4c373dcacbcad524f2198d8e91150..6097a9a87e0abd6244248c8d87a202cb2d08a4d6 100644 (file)
@@ -64,9 +64,9 @@ public:
     /*! \brief Constructor.
      *
      * \param[in] deviceContext  Device context (dummy in CUDA).
-     * \param[in] commandStream  Device command stream to use.
+     * \param[in] deviceStream   Device stream to use.
      */
-    LeapFrogGpu(const DeviceContext& deviceContext, CommandStream commandStream);
+    LeapFrogGpu(const DeviceContext& deviceContext, const DeviceStream& deviceStream);
     ~LeapFrogGpu();
 
     /*! \brief Integrate
@@ -115,7 +115,7 @@ private:
     //! GPU context object
     const DeviceContext& deviceContext_;
     //! GPU stream
-    CommandStream commandStream_;
+    const DeviceStream& deviceStream_;
     //! GPU kernel launch config
     KernelLaunchConfig kernelLaunchConfig_;
     //! Number of atoms
index 03c1bd1d15ed4f5dfcab5d91ae2c3e3b5ee11239..61bc717852cdc2677fdf463148e9e68c50df53b0 100644 (file)
@@ -447,7 +447,7 @@ void LincsGpu::apply(const float3* d_x,
     {
         // Fill with zeros so the values can be reduced to it
         // Only 6 values are needed because virial is symmetrical
-        clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, commandStream_);
+        clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, deviceStream_);
     }
 
     auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial);
@@ -475,7 +475,7 @@ void LincsGpu::apply(const float3* d_x,
     {
         config.sharedMemorySize = c_threadsPerBlock * 3 * sizeof(float);
     }
-    config.stream = commandStream_;
+    config.stream = deviceStream_.stream();
 
     kernelParams_.pbcAiuc = pbcAiuc;
 
@@ -488,7 +488,7 @@ void LincsGpu::apply(const float3* d_x,
     {
         // Copy LINCS virial data and add it to the common virial
         copyFromDeviceBuffer(h_virialScaled_.data(), &kernelParams_.d_virialScaled, 0, 6,
-                             commandStream_, GpuApiCallBehavior::Sync, nullptr);
+                             deviceStream_, GpuApiCallBehavior::Sync, nullptr);
 
         // Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object
         virialScaled[XX][XX] += h_virialScaled_[0];
@@ -510,9 +510,9 @@ void LincsGpu::apply(const float3* d_x,
 LincsGpu::LincsGpu(int                  numIterations,
                    int                  expansionOrder,
                    const DeviceContext& deviceContext,
-                   CommandStream        commandStream) :
+                   const DeviceStream&  deviceStream) :
     deviceContext_(deviceContext),
-    commandStream_(commandStream)
+    deviceStream_(deviceStream)
 {
     kernelParams_.numIterations  = numIterations;
     kernelParams_.expansionOrder = expansionOrder;
@@ -943,23 +943,23 @@ void LincsGpu::set(const InteractionDefinitions& idef, const t_mdatoms& md)
 
     // Copy data to GPU.
     copyToDeviceBuffer(&kernelParams_.d_constraints, constraintsHost.data(), 0,
-                       kernelParams_.numConstraintsThreads, commandStream_,
-                       GpuApiCallBehavior::Sync, nullptr);
+                       kernelParams_.numConstraintsThreads, deviceStream_, GpuApiCallBehavior::Sync,
+                       nullptr);
     copyToDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
                        constraintsTargetLengthsHost.data(), 0, kernelParams_.numConstraintsThreads,
-                       commandStream_, GpuApiCallBehavior::Sync, nullptr);
+                       deviceStream_, GpuApiCallBehavior::Sync, nullptr);
     copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
                        coupledConstraintsCountsHost.data(), 0, kernelParams_.numConstraintsThreads,
-                       commandStream_, GpuApiCallBehavior::Sync, nullptr);
+                       deviceStream_, GpuApiCallBehavior::Sync, nullptr);
     copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices, coupledConstraintsIndicesHost.data(),
                        0, maxCoupledConstraints * kernelParams_.numConstraintsThreads,
-                       commandStream_, GpuApiCallBehavior::Sync, nullptr);
+                       deviceStream_, GpuApiCallBehavior::Sync, nullptr);
     copyToDeviceBuffer(&kernelParams_.d_massFactors, massFactorsHost.data(), 0,
-                       maxCoupledConstraints * kernelParams_.numConstraintsThreads, commandStream_,
+                       maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceStream_,
                        GpuApiCallBehavior::Sync, nullptr);
 
     GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of atoms should be specified.\n");
-    copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass, 0, numAtoms, commandStream_,
+    copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass, 0, numAtoms, deviceStream_,
                        GpuApiCallBehavior::Sync, nullptr);
 }
 
index 4817573b8098806b24e8a1216ef441aaa8f01c81..ef035164318a50bb6397f0daecef1c14e7aaca06 100644 (file)
@@ -105,9 +105,12 @@ public:
      * \param[in] numIterations    Number of iteration for the correction of the projection.
      * \param[in] expansionOrder   Order of the matrix inversion algorithm.
      * \param[in] deviceContext    Device context (dummy in CUDA).
-     * \param[in] commandStream    Device command stream.
+     * \param[in] deviceStream     Device command stream.
      */
-    LincsGpu(int numIterations, int expansionOrder, const DeviceContext& deviceContext, CommandStream commandStream);
+    LincsGpu(int                  numIterations,
+             int                  expansionOrder,
+             const DeviceContext& deviceContext,
+             const DeviceStream&  deviceStream);
     /*! \brief Destructor.*/
     ~LincsGpu();
 
@@ -172,7 +175,7 @@ private:
     //! GPU context object
     const DeviceContext& deviceContext_;
     //! GPU stream
-    CommandStream commandStream_;
+    const DeviceStream& deviceStream_;
 
     //! Parameters and pointers, passed to the GPU kernel
     LincsGpuKernelParameters kernelParams_;
index 20933baf965604f7d72e04f5dd02a12ae3bbeeca..76daf34c1acf718f5942dc52812471ed137d9683 100644 (file)
@@ -434,7 +434,7 @@ void SettleGpu::apply(const float3* d_x,
     {
         // Fill with zeros so the values can be reduced to it
         // Only 6 values are needed because virial is symmetrical
-        clearDeviceBufferAsync(&d_virialScaled_, 0, 6, commandStream_);
+        clearDeviceBufferAsync(&d_virialScaled_, 0, 6, deviceStream_);
     }
 
     auto kernelPtr = getSettleKernelPtr(updateVelocities, computeVirial);
@@ -455,7 +455,7 @@ void SettleGpu::apply(const float3* d_x,
     {
         config.sharedMemorySize = 0;
     }
-    config.stream = commandStream_;
+    config.stream = deviceStream_.stream();
 
     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, &numSettles_, &d_atomIds_,
                                                       &settleParameters_, &d_x, &d_xp, &invdt, &d_v,
@@ -465,7 +465,7 @@ void SettleGpu::apply(const float3* d_x,
 
     if (computeVirial)
     {
-        copyFromDeviceBuffer(h_virialScaled_.data(), &d_virialScaled_, 0, 6, commandStream_,
+        copyFromDeviceBuffer(h_virialScaled_.data(), &d_virialScaled_, 0, 6, deviceStream_,
                              GpuApiCallBehavior::Sync, nullptr);
 
         // Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object
@@ -485,9 +485,9 @@ void SettleGpu::apply(const float3* d_x,
     return;
 }
 
-SettleGpu::SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, CommandStream commandStream) :
+SettleGpu::SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
     deviceContext_(deviceContext),
-    commandStream_(commandStream)
+    deviceStream_(deviceStream)
 {
     static_assert(sizeof(real) == sizeof(float),
                   "Real numbers should be in single precision in GPU code.");
@@ -622,7 +622,7 @@ void SettleGpu::set(const InteractionDefinitions& idef, const t_mdatoms gmx_unus
         settler.z        = iatoms[i * nral1 + 3]; // Second hydrogen index
         h_atomIds_.at(i) = settler;
     }
-    copyToDeviceBuffer(&d_atomIds_, h_atomIds_.data(), 0, numSettles_, commandStream_,
+    copyToDeviceBuffer(&d_atomIds_, h_atomIds_.data(), 0, numSettles_, deviceStream_,
                        GpuApiCallBehavior::Sync, nullptr);
 }
 
index da8bafd8dfbf45ae0dcbcb13874525fa483da6a4..24584f7a4bb0065f2de54756cad0a347e9da73e8 100644 (file)
@@ -202,9 +202,9 @@ public:
      *                           target O-H and H-H distances. These values are also checked for
      *                           consistency.
      * \param[in] deviceContext  Device context (dummy in CUDA).
-     * \param[in] commandStream  Device stream to use.
+     * \param[in] deviceStream   Device stream to use.
      */
-    SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, CommandStream commandStream);
+    SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, const DeviceStream& deviceStream);
 
     ~SettleGpu();
 
@@ -255,7 +255,7 @@ private:
     //! GPU context object
     const DeviceContext& deviceContext_;
     //! GPU stream
-    CommandStream commandStream_;
+    const DeviceStream& deviceStream_;
 
     //! Scaled virial tensor (9 reals, GPU)
     std::vector<float> h_virialScaled_;
index 5c0a007ee4461f96ca4b9298c29391be3041d38f..00672af606fa99150ec9d3e3f5b1303e536d2769 100644 (file)
@@ -72,9 +72,10 @@ void applyLincsGpu(ConstraintsTestData* testData, t_pbc pbc)
 {
     DeviceInformation   deviceInfo;
     const DeviceContext deviceContext(deviceInfo);
+    const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
 
     auto lincsGpu = std::make_unique<LincsGpu>(testData->ir_.nLincsIter, testData->ir_.nProjOrder,
-                                               deviceContext, nullptr);
+                                               deviceContext, deviceStream);
 
     bool    updateVelocities = true;
     int     numAtoms         = testData->numAtoms_;
@@ -88,23 +89,23 @@ void applyLincsGpu(ConstraintsTestData* testData, t_pbc pbc)
     allocateDeviceBuffer(&d_xp, numAtoms, deviceContext);
     allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
 
-    copyToDeviceBuffer(&d_x, (float3*)(testData->x_.data()), 0, numAtoms, nullptr,
+    copyToDeviceBuffer(&d_x, (float3*)(testData->x_.data()), 0, numAtoms, deviceStream,
                        GpuApiCallBehavior::Sync, nullptr);
-    copyToDeviceBuffer(&d_xp, (float3*)(testData->xPrime_.data()), 0, numAtoms, nullptr,
+    copyToDeviceBuffer(&d_xp, (float3*)(testData->xPrime_.data()), 0, numAtoms, deviceStream,
                        GpuApiCallBehavior::Sync, nullptr);
     if (updateVelocities)
     {
-        copyToDeviceBuffer(&d_v, (float3*)(testData->v_.data()), 0, numAtoms, nullptr,
+        copyToDeviceBuffer(&d_v, (float3*)(testData->v_.data()), 0, numAtoms, deviceStream,
                            GpuApiCallBehavior::Sync, nullptr);
     }
     lincsGpu->apply(d_x, d_xp, updateVelocities, d_v, testData->invdt_, testData->computeVirial_,
                     testData->virialScaled_, pbcAiuc);
 
-    copyFromDeviceBuffer((float3*)(testData->xPrime_.data()), &d_xp, 0, numAtoms, nullptr,
+    copyFromDeviceBuffer((float3*)(testData->xPrime_.data()), &d_xp, 0, numAtoms, deviceStream,
                          GpuApiCallBehavior::Sync, nullptr);
     if (updateVelocities)
     {
-        copyFromDeviceBuffer((float3*)(testData->v_.data()), &d_v, 0, numAtoms, nullptr,
+        copyFromDeviceBuffer((float3*)(testData->v_.data()), &d_v, 0, numAtoms, deviceStream,
                              GpuApiCallBehavior::Sync, nullptr);
     }
 
index b794149ddb8a34fb51ebaf103be82f0428b32ffa..7b2e22aac2b8e96262eb30613d447e528aff22a1 100644 (file)
@@ -68,6 +68,7 @@ void integrateLeapFrogGpu(LeapFrogTestData* testData, int numSteps)
 {
     DeviceInformation   deviceInfo;
     const DeviceContext deviceContext(deviceInfo);
+    const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
 
     int numAtoms = testData->numAtoms_;
 
@@ -83,12 +84,12 @@ void integrateLeapFrogGpu(LeapFrogTestData* testData, int numSteps)
     allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
     allocateDeviceBuffer(&d_f, numAtoms, deviceContext);
 
-    copyToDeviceBuffer(&d_x, h_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
-    copyToDeviceBuffer(&d_xp, h_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
-    copyToDeviceBuffer(&d_v, h_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
-    copyToDeviceBuffer(&d_f, h_f, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
+    copyToDeviceBuffer(&d_x, h_x, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+    copyToDeviceBuffer(&d_xp, h_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+    copyToDeviceBuffer(&d_v, h_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+    copyToDeviceBuffer(&d_f, h_f, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
 
-    auto integrator = std::make_unique<LeapFrogGpu>(deviceContext, nullptr);
+    auto integrator = std::make_unique<LeapFrogGpu>(deviceContext, deviceStream);
 
     integrator->set(testData->mdAtoms_, testData->numTCoupleGroups_, testData->mdAtoms_.cTC);
 
@@ -104,8 +105,8 @@ void integrateLeapFrogGpu(LeapFrogTestData* testData, int numSteps)
                               testData->dtPressureCouple_, testData->velocityScalingMatrix_);
     }
 
-    copyFromDeviceBuffer(h_xp, &d_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
-    copyFromDeviceBuffer(h_v, &d_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
+    copyFromDeviceBuffer(h_xp, &d_x, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+    copyFromDeviceBuffer(h_v, &d_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
 
     freeDeviceBuffer(&d_x);
     freeDeviceBuffer(&d_xp);
index 6ebc6688da98aa422cdfb67e9686bec66c7ca788..741d2951aa51451f4c6565a62561e0f2c41fd922 100644 (file)
@@ -88,8 +88,9 @@ void applySettleGpu(SettleTestData*  testData,
 
     DeviceInformation   deviceInfo;
     const DeviceContext deviceContext(deviceInfo);
+    const DeviceStream  deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
 
-    auto settleGpu = std::make_unique<SettleGpu>(testData->mtop_, deviceContext, nullptr);
+    auto settleGpu = std::make_unique<SettleGpu>(testData->mtop_, deviceContext, deviceStream);
 
     settleGpu->set(*testData->idef_, testData->mdatoms_);
     PbcAiuc pbcAiuc;
@@ -107,19 +108,20 @@ void applySettleGpu(SettleTestData*  testData,
     allocateDeviceBuffer(&d_xp, numAtoms, deviceContext);
     allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
 
-    copyToDeviceBuffer(&d_x, (float3*)h_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
-    copyToDeviceBuffer(&d_xp, (float3*)h_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
+    copyToDeviceBuffer(&d_x, (float3*)h_x, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+    copyToDeviceBuffer(&d_xp, (float3*)h_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
     if (updateVelocities)
     {
-        copyToDeviceBuffer(&d_v, (float3*)h_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
+        copyToDeviceBuffer(&d_v, (float3*)h_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
     }
     settleGpu->apply(d_x, d_xp, updateVelocities, d_v, testData->reciprocalTimeStep_, calcVirial,
                      testData->virial_, pbcAiuc);
 
-    copyFromDeviceBuffer((float3*)h_xp, &d_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
+    copyFromDeviceBuffer((float3*)h_xp, &d_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
     if (updateVelocities)
     {
-        copyFromDeviceBuffer((float3*)h_v, &d_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
+        copyFromDeviceBuffer((float3*)h_v, &d_v, 0, numAtoms, deviceStream,
+                             GpuApiCallBehavior::Sync, nullptr);
     }
 
     freeDeviceBuffer(&d_x);
index 61f8537efa24bc052851f016b86f8583bfb0a087..c0a5e9b21aa4c607361d1d4fabe081ef365b67cb 100644 (file)
@@ -50,8 +50,8 @@
 #include "gromacs/utility/classhelpers.h"
 
 class DeviceContext;
+class DeviceStream;
 class GpuEventSynchronizer;
-
 struct gmx_mtop_t;
 enum class PbcType : int;
 class InteractionDefinitions;
@@ -68,7 +68,7 @@ class UpdateConstrainGpu
 public:
     /*! \brief Create Update-Constrain object.
      *
-     * The constructor is given a non-nullptr \p commandStream, in which all the update and constrain
+     * The constructor is given a \p deviceStream, in which all the update and constrain
      * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify
      * coordinates. The event is maintained outside this class and also passed to all (if any) consumers
      * of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the
@@ -79,13 +79,13 @@ public:
      * \param[in] mtop              Topology of the system: SETTLE gets the masses for O and H atoms
      *                              and target O-H and H-H distances from this object.
      * \param[in] deviceContext     GPU device context.
-     * \param[in] commandStream     GPU stream to use. Can be nullptr.
+     * \param[in] deviceStream      GPU stream to use.
      * \param[in] xUpdatedOnDevice  The event synchronizer to use to mark that update is done on the GPU.
      */
     UpdateConstrainGpu(const t_inputrec&     ir,
                        const gmx_mtop_t&     mtop,
                        const DeviceContext&  deviceContext,
-                       const void*           commandStream,
+                       const DeviceStream&   deviceStream,
                        GpuEventSynchronizer* xUpdatedOnDevice);
 
     ~UpdateConstrainGpu();
index 45a0743384e36e00108f9b4ef4fe5fe7106789df..76f3a0eedf6b295c0a7878cddf48fb86848bf611 100644 (file)
@@ -58,7 +58,7 @@ class UpdateConstrainGpu::Impl
 UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec& /* ir   */,
                                        const gmx_mtop_t& /* mtop */,
                                        const DeviceContext& /* deviceContext */,
-                                       const void* /* commandStream */,
+                                       const DeviceStream& /* deviceStream */,
                                        GpuEventSynchronizer* /* xUpdatedOnDevice */) :
     impl_(nullptr)
 {
index 41f75723324aca24e314dade2f70eae39756a43c..eed9e44d6334bf34833655d03f2508d5a9276022 100644 (file)
@@ -119,7 +119,7 @@ void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer*             fRead
     clear_mat(virial);
 
     // Make sure that the forces are ready on device before proceeding with the update.
-    fReadyOnDevice->enqueueWaitEvent(commandStream_);
+    fReadyOnDevice->enqueueWaitEvent(deviceStream_);
 
     // The integrate should save a copy of the current coordinates in d_xp_ and write updated once
     // into d_x_. The d_xp_ is only needed by constraints.
@@ -141,7 +141,7 @@ void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer*             fRead
         }
     }
 
-    coordinatesReady_->markEvent(commandStream_);
+    coordinatesReady_->markEvent(deviceStream_);
 
     return;
 }
@@ -162,31 +162,30 @@ void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix)
                     "scaleCoordinates_kernel", kernelArgs);
     // TODO: Although this only happens on the pressure coupling steps, this synchronization
    //       can affect the performance if nstpcouple is small.
-    gpuStreamSynchronize(commandStream_);
+    deviceStream_.synchronize();
 }
 
 UpdateConstrainGpu::Impl::Impl(const t_inputrec&     ir,
                                const gmx_mtop_t&     mtop,
                                const DeviceContext&  deviceContext,
-                               const void*           commandStream,
+                               const DeviceStream&   deviceStream,
                                GpuEventSynchronizer* xUpdatedOnDevice) :
     deviceContext_(deviceContext),
+    deviceStream_(deviceStream),
     coordinatesReady_(xUpdatedOnDevice)
 {
     GMX_ASSERT(xUpdatedOnDevice != nullptr, "The event synchronizer can not be nullptr.");
-    commandStream != nullptr ? commandStream_ = *static_cast<const CommandStream*>(commandStream)
-                             : commandStream_ = nullptr;
 
 
-    integrator_ = std::make_unique<LeapFrogGpu>(deviceContext_, commandStream_);
-    lincsGpu_ = std::make_unique<LincsGpu>(ir.nLincsIter, ir.nProjOrder, deviceContext_, commandStream_);
-    settleGpu_ = std::make_unique<SettleGpu>(mtop, deviceContext_, commandStream_);
+    integrator_ = std::make_unique<LeapFrogGpu>(deviceContext_, deviceStream_);
+    lincsGpu_ = std::make_unique<LincsGpu>(ir.nLincsIter, ir.nProjOrder, deviceContext_, deviceStream_);
+    settleGpu_ = std::make_unique<SettleGpu>(mtop, deviceContext_, deviceStream_);
 
     coordinateScalingKernelLaunchConfig_.blockSize[0]     = c_threadsPerBlock;
     coordinateScalingKernelLaunchConfig_.blockSize[1]     = 1;
     coordinateScalingKernelLaunchConfig_.blockSize[2]     = 1;
     coordinateScalingKernelLaunchConfig_.sharedMemorySize = 0;
-    coordinateScalingKernelLaunchConfig_.stream           = commandStream_;
+    coordinateScalingKernelLaunchConfig_.stream           = deviceStream_.stream();
 }
 
 UpdateConstrainGpu::Impl::~Impl() {}
@@ -235,9 +234,9 @@ GpuEventSynchronizer* UpdateConstrainGpu::Impl::getCoordinatesReadySync()
 UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec&     ir,
                                        const gmx_mtop_t&     mtop,
                                        const DeviceContext&  deviceContext,
-                                       const void*           commandStream,
+                                       const DeviceStream&   deviceStream,
                                        GpuEventSynchronizer* xUpdatedOnDevice) :
-    impl_(new Impl(ir, mtop, deviceContext, commandStream, xUpdatedOnDevice))
+    impl_(new Impl(ir, mtop, deviceContext, deviceStream, xUpdatedOnDevice))
 {
 }
 
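A note on the reference member introduced above: unlike the old CommandStream value, which could be reassigned and null-checked at run time, a const DeviceStream& must be bound in the constructor's member-initializer list, and the caller must guarantee that the stream outlives the Impl object. A minimal illustration of that contract, with names shortened (not the actual GROMACS declaration):

    class Impl
    {
    public:
        Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
            deviceContext_(deviceContext),
            deviceStream_(deviceStream) // must be bound here; references cannot be reseated
        {
        }

    private:
        const DeviceContext& deviceContext_; // non-owning
        const DeviceStream&  deviceStream_;  // non-owning; the caller guarantees lifetime
    };
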
index dd46010e93015920529d7100fce04579ddcca594..8aacc28a610d5d84fa1203578749519b875cf359 100644 (file)
@@ -65,7 +65,7 @@ class UpdateConstrainGpu::Impl
 public:
     /*! \brief Create Update-Constrain object.
      *
-     * The constructor is given a non-nullptr \p commandStream, in which all the update and constrain
+     * The constructor is given a \p deviceStream, in which all the update and constrain
      * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify
      * coordinates. The event is maintained outside this class and also passed to all (if any) consumers
      * of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the
@@ -76,13 +76,13 @@ public:
      * \param[in] mtop              Topology of the system: SETTLE gets the masses for O and H atoms
      *                              and target O-H and H-H distances from this object.
      * \param[in] deviceContext     GPU device context.
-     * \param[in] commandStream     GPU stream to use. Can be nullptr.
+     * \param[in] deviceStream      GPU stream to use.
      * \param[in] xUpdatedOnDevice  The event synchronizer to use to mark that update is done on the GPU.
      */
     Impl(const t_inputrec&     ir,
          const gmx_mtop_t&     mtop,
          const DeviceContext&  deviceContext,
-         const void*           commandStream,
+         const DeviceStream&   deviceStream,
          GpuEventSynchronizer* xUpdatedOnDevice);
 
     ~Impl();
@@ -171,7 +171,7 @@ private:
     //! GPU context object
     const DeviceContext& deviceContext_;
     //! GPU stream
-    CommandStream commandStream_ = nullptr;
+    const DeviceStream& deviceStream_;
     //! GPU kernel launch config
     KernelLaunchConfig coordinateScalingKernelLaunchConfig_;
 
index 941a7030c9eca738ea4dee3201bbefd4c0628db7..91360ccd28cb96e0b8e9f22100e652d4b60a18a3 100644 (file)
@@ -403,9 +403,10 @@ void gmx::LegacySimulator::do_md()
 
         GMX_RELEASE_ASSERT(fr->deviceContext != nullptr,
                            "GPU device context should be initialized to use GPU update.");
-
+        GMX_RELEASE_ASSERT(stateGpu->getUpdateStream() != nullptr,
+                           "Update stream can not be nullptr when update is on a GPU.");
         integrator = std::make_unique<UpdateConstrainGpu>(*ir, *top_global, *fr->deviceContext,
-                                                          stateGpu->getUpdateStream(),
+                                                          *stateGpu->getUpdateStream(),
                                                           stateGpu->xUpdatedOnDevice());
 
         integrator->setPbc(PbcType::Xyz, state->box);
@@ -867,14 +868,20 @@ void gmx::LegacySimulator::do_md()
                     && useGpuForNonbonded && is1D(*cr->dd))
                 {
                     // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
-                    void* streamLocal =
+                    const DeviceStream* localStream =
                             Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
-                    void* streamNonLocal = Nbnxm::gpu_get_command_stream(
+                    const DeviceStream* nonLocalStream = Nbnxm::gpu_get_command_stream(
                             fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
                     GMX_RELEASE_ASSERT(
                             fr->deviceContext != nullptr,
                             "GPU device context should be initialized to use GPU halo exchange.");
-                    constructGpuHaloExchange(mdlog, *cr, *fr->deviceContext, streamLocal, streamNonLocal);
+                    GMX_RELEASE_ASSERT(localStream != nullptr,
+                                       "Local non-bonded stream can't be nullptr when using GPU "
+                                       "halo exchange.");
+                    GMX_RELEASE_ASSERT(nonLocalStream != nullptr,
+                                       "Non-local non-bonded stream can't be nullptr when using "
+                                       "GPU halo exchange.");
+                    constructGpuHaloExchange(mdlog, *cr, *fr->deviceContext, *localStream, *nonLocalStream);
                 }
             }
         }
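The same pattern recurs wherever a nullable getter feeds an API that now takes a reference: assert, then dereference. In generic form (illustrative only; passStreamByReference and consumeStream are hypothetical stand-ins for the call sites above):

    // Hypothetical helper illustrating the pointer-to-reference boundary;
    // consumeStream() stands in for any of the reference-taking constructors above.
    void passStreamByReference(const DeviceStream* maybeStream)
    {
        GMX_RELEASE_ASSERT(maybeStream != nullptr,
                           "Stream must be valid before it is passed by reference.");
        consumeStream(*maybeStream); // the reference parameter now encodes non-null
    }
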
index 96e157ca07d0c06f863de0ec782778e6e8750ba3..753a43ab3553aea38a798f3502d496da4fcdc006 100644 (file)
@@ -1373,7 +1373,9 @@ int Mdrunner::mdrunner()
             GMX_RELEASE_ASSERT(
                     fr->deviceContext != nullptr,
                     "Device context can not be nullptr when computing bonded interactions on GPU.");
-            gpuBonded = std::make_unique<GpuBonded>(mtop.ffparams, *fr->deviceContext, stream, wcycle);
+            GMX_RELEASE_ASSERT(stream != nullptr,
+                               "Can'r run GPU version of bonded forces in nullptr stream.");
+            gpuBonded = std::make_unique<GpuBonded>(mtop.ffparams, *fr->deviceContext, *stream, wcycle);
             fr->gpuBonded = gpuBonded.get();
         }
 
@@ -1584,12 +1586,12 @@ int Mdrunner::mdrunner()
             && ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME))
                 || runScheduleWork.simulationWork.useGpuBufferOps))
         {
-            const void* pmeStream = pme_gpu_get_device_stream(fr->pmedata);
-            const void* localStream =
+            const DeviceStream* pmeStream = pme_gpu_get_device_stream(fr->pmedata);
+            const DeviceStream* localStream =
                     fr->nbv->gpu_nbv != nullptr
                             ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local)
                             : nullptr;
-            const void* nonLocalStream =
+            const DeviceStream* nonLocalStream =
                     fr->nbv->gpu_nbv != nullptr
                             ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal)
                             : nullptr;
index 034e7eb604d1e0c6348b8dd8a45c015b30527315..678fa3368151ab335fd984215071ea72f874757b 100644 (file)
@@ -60,6 +60,7 @@
 #include "locality.h"
 
 class DeviceContext;
+class DeviceStream;
 class GpuEventSynchronizer;
 struct gmx_wallcycle;
 
@@ -99,9 +100,6 @@ public:
      *       \p pmeStream argument needs to be a valid OpenCL queue object
      *       which must have been created in \p deviceContext.
      *
-     * \todo Make a \p CommandStream visible in the CPU parts of the code so we
-     *       will not have to pass a void*.
-     *
      *  \param[in] pmeStream       Device PME stream, nullptr allowed.
      *  \param[in] localStream     Device NBNXM local stream, nullptr allowed.
      *  \param[in] nonLocalStream  Device NBNXM non-local stream, nullptr allowed.
@@ -110,9 +108,9 @@ public:
      *  \param[in] paddingSize     Padding size for coordinates buffer.
      *  \param[in] wcycle          Wall cycle counter data.
      */
-    StatePropagatorDataGpu(const void*          pmeStream,
-                           const void*          localStream,
-                           const void*          nonLocalStream,
+    StatePropagatorDataGpu(const DeviceStream*  pmeStream,
+                           const DeviceStream*  localStream,
+                           const DeviceStream*  nonLocalStream,
                            const DeviceContext& deviceContext,
                            GpuApiCallBehavior   transferKind,
                            int                  paddingSize,
@@ -134,7 +132,7 @@ public:
      *  \param[in] paddingSize     Padding size for coordinates buffer.
      *  \param[in] wcycle          Wall cycle counter data.
      */
-    StatePropagatorDataGpu(const void*          pmeStream,
+    StatePropagatorDataGpu(const DeviceStream*  pmeStream,
                            const DeviceContext& deviceContext,
                            GpuApiCallBehavior   transferKind,
                            int                  paddingSize,
@@ -329,7 +327,7 @@ public:
      *
      *  \returns The device command stream to use in update-constraints.
      */
-    void* getUpdateStream();
+    const DeviceStream* getUpdateStream();
 
     /*! \brief Getter for the number of local atoms.
      *
index 1029dd220fa12c72480c338d6cb28d14864d2f54..78b1fd3a4ad13962756121af39078114ad7186ee 100644 (file)
@@ -54,9 +54,9 @@ class StatePropagatorDataGpu::Impl
 {
 };
 
-StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream       */,
-                                               const void* /* localStream     */,
-                                               const void* /* nonLocalStream  */,
+StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream       */,
+                                               const DeviceStream* /* localStream     */,
+                                               const DeviceStream* /* nonLocalStream  */,
                                                const DeviceContext& /* deviceContext   */,
                                                GpuApiCallBehavior /* transferKind    */,
                                                int /* paddingSize     */,
@@ -65,7 +65,7 @@ StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream       */
 {
 }
 
-StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream       */,
+StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream       */,
                                                const DeviceContext& /* deviceContext   */,
                                                GpuApiCallBehavior /* transferKind    */,
                                                int /* paddingSize     */,
@@ -242,7 +242,7 @@ void StatePropagatorDataGpu::waitForcesReadyOnHost(AtomLocality /* atomLocality
 }
 
 
-void* StatePropagatorDataGpu::getUpdateStream()
+const DeviceStream* StatePropagatorDataGpu::getUpdateStream()
 {
     GMX_ASSERT(false,
                "A CPU stub method from GPU state propagator data was called instead of one from "
index 679bf2544a81e082f124e568272da78cdb3e9cd7..1b2c91d2e2bf5ead86a0fca68d9e1095dc92a47e 100644 (file)
@@ -99,9 +99,6 @@ public:
      *       \p pmeStream argument needs to be a valid OpenCL queue object
      *       which must have been created in \p deviceContext.
      *
-     * \todo Make a \p CommandStream visible in the CPU parts of the code so we
-     *       will not have to pass a void*.
-     *
      *  \param[in] pmeStream       Device PME stream, nullptr allowed.
      *  \param[in] localStream     Device NBNXM local stream, nullptr allowed.
      *  \param[in] nonLocalStream  Device NBNXM non-local stream, nullptr allowed.
@@ -110,9 +107,9 @@ public:
      *  \param[in] paddingSize     Padding size for coordinates buffer.
      *  \param[in] wcycle          Wall cycle counter data.
      */
-    Impl(const void*          pmeStream,
-         const void*          localStream,
-         const void*          nonLocalStream,
+    Impl(const DeviceStream*  pmeStream,
+         const DeviceStream*  localStream,
+         const DeviceStream*  nonLocalStream,
          const DeviceContext& deviceContext,
          GpuApiCallBehavior   transferKind,
          int                  paddingSize,
@@ -134,7 +131,7 @@ public:
      *  \param[in] paddingSize     Padding size for coordinates buffer.
      *  \param[in] wcycle          Wall cycle counter data.
      */
-    Impl(const void*          pmeStream,
+    Impl(const DeviceStream*  pmeStream,
          const DeviceContext& deviceContext,
          GpuApiCallBehavior   transferKind,
          int                  paddingSize,
@@ -325,7 +322,7 @@ public:
      *
      *  \returns The device command stream to use in update-constraints.
      */
-    void* getUpdateStream();
+    const DeviceStream* getUpdateStream();
 
     /*! \brief Getter for the number of local atoms.
      *
@@ -341,20 +338,23 @@ public:
 
 private:
     //! GPU PME stream.
-    CommandStream pmeStream_ = nullptr;
+    const DeviceStream* pmeStream_;
     //! GPU NBNXM local stream.
-    CommandStream localStream_ = nullptr;
-    //! GPU NBNXM non-local stream
-    CommandStream nonLocalStream_ = nullptr;
+    const DeviceStream* localStream_;
+    //! GPU NBNXM non-local stream.
+    const DeviceStream* nonLocalStream_;
    //! GPU Update-constraints stream.
-    CommandStream updateStream_ = nullptr;
+    const DeviceStream* updateStream_;
+
+    //! An update stream owned by this object, used while its lifetime is still managed here. Temporary.
+    DeviceStream updateStreamOwn_;
 
    // Streams to use for coordinates H2D and D2H copies (one stream for each atom locality)
-    EnumerationArray<AtomLocality, CommandStream> xCopyStreams_ = { { nullptr } };
+    EnumerationArray<AtomLocality, const DeviceStream*> xCopyStreams_ = { { nullptr } };
    // Streams to use for velocities H2D and D2H copies (one stream for each atom locality)
-    EnumerationArray<AtomLocality, CommandStream> vCopyStreams_ = { { nullptr } };
+    EnumerationArray<AtomLocality, const DeviceStream*> vCopyStreams_ = { { nullptr } };
    // Streams to use for forces H2D and D2H copies (one stream for each atom locality)
-    EnumerationArray<AtomLocality, CommandStream> fCopyStreams_ = { { nullptr } };
+    EnumerationArray<AtomLocality, const DeviceStream*> fCopyStreams_ = { { nullptr } };
 
     /*! \brief An array of events that indicate H2D copy is complete (one event for each atom locality)
      *
@@ -422,13 +422,13 @@ private:
      *  \param[in]  h_data         Host-side buffer.
      *  \param[in]  dataSize       Device-side data allocation size.
      *  \param[in]  atomLocality   If all, local or non-local ranges should be copied.
-     *  \param[in]  commandStream  GPU stream to execute copy in.
+     *  \param[in]  deviceStream   GPU stream to execute copy in.
      */
     void copyToDevice(DeviceBuffer<RVec>             d_data,
                       gmx::ArrayRef<const gmx::RVec> h_data,
                       int                            dataSize,
                       AtomLocality                   atomLocality,
-                      CommandStream                  commandStream);
+                      const DeviceStream&            deviceStream);
 
     /*! \brief Performs the copy of data from device to host buffer.
      *
@@ -436,13 +436,13 @@ private:
      *  \param[in]  d_data         Device-side buffer.
      *  \param[in]  dataSize       Device-side data allocation size.
      *  \param[in]  atomLocality   If all, local or non-local ranges should be copied.
-     *  \param[in]  commandStream  GPU stream to execute copy in.
+     *  \param[in]  deviceStream   GPU stream to execute copy in.
      */
     void copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
                         DeviceBuffer<RVec>       d_data,
                         int                      dataSize,
                         AtomLocality             atomLocality,
-                        CommandStream            commandStream);
+                        const DeviceStream&      deviceStream);
 };
 
 } // namespace gmx
index d88f469711dcc5343560df94729a8455e7363d23..b1fefd34a5809ec56a05281a9669afb02cdb79b0 100644 (file)
@@ -65,9 +65,9 @@
 namespace gmx
 {
 
-StatePropagatorDataGpu::Impl::Impl(const void*          pmeStream,
-                                   const void*          localStream,
-                                   const void*          nonLocalStream,
+StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
+                                   const DeviceStream*  localStream,
+                                   const DeviceStream*  nonLocalStream,
                                    const DeviceContext& deviceContext,
                                    GpuApiCallBehavior   transferKind,
                                    int                  paddingSize,
@@ -86,8 +86,8 @@ StatePropagatorDataGpu::Impl::Impl(const void*          pmeStream,
         GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set in OpenCL builds.");
 
         // The update stream is set to the PME stream in OpenCL, since PME stream is the only stream created in the PME context.
-        pmeStream_    = *static_cast<const CommandStream*>(pmeStream);
-        updateStream_ = *static_cast<const CommandStream*>(pmeStream);
+        pmeStream_    = pmeStream;
+        updateStream_ = pmeStream;
         GMX_UNUSED_VALUE(localStream);
         GMX_UNUSED_VALUE(nonLocalStream);
     }
@@ -96,21 +96,24 @@ StatePropagatorDataGpu::Impl::Impl(const void*          pmeStream,
     {
         if (pmeStream != nullptr)
         {
-            pmeStream_ = *static_cast<const CommandStream*>(pmeStream);
+            pmeStream_ = pmeStream;
         }
         if (localStream != nullptr)
         {
-            localStream_ = *static_cast<const CommandStream*>(localStream);
+            localStream_ = localStream;
         }
         if (nonLocalStream != nullptr)
         {
-            nonLocalStream_ = *static_cast<const CommandStream*>(nonLocalStream);
+            nonLocalStream_ = nonLocalStream;
         }
 
         // TODO: The update stream should be created only when it is needed.
 #    if (GMX_GPU == GMX_GPU_CUDA)
-        cudaError_t stat;
-        stat = cudaStreamCreate(&updateStream_);
+        cudaError_t  stat;
+        cudaStream_t stream;
+        stat = cudaStreamCreate(&stream);
         CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu");
+        updateStreamOwn_.setStream(stream);
+        updateStream_ = &updateStreamOwn_;
 #    endif
     }
@@ -131,7 +134,7 @@ StatePropagatorDataGpu::Impl::Impl(const void*          pmeStream,
     fCopyStreams_[AtomLocality::All]      = updateStream_;
 }
 
-StatePropagatorDataGpu::Impl::Impl(const void*          pmeStream,
+StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
                                    const DeviceContext& deviceContext,
                                    GpuApiCallBehavior   transferKind,
                                    int                  paddingSize,
@@ -145,9 +148,8 @@ StatePropagatorDataGpu::Impl::Impl(const void*          pmeStream,
                   "This object should only be constructed on the GPU code-paths.");
 
     GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set.");
-    pmeStream_ = *static_cast<const CommandStream*>(pmeStream);
-
-    localStream_    = nullptr;
+    pmeStream_      = pmeStream;
+    localStream_    = pmeStream; // For clearing the force buffer
     nonLocalStream_ = nullptr;
     updateStream_   = nullptr;
 
@@ -193,7 +195,7 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
     if (paddingAllocationSize > 0)
     {
         // The PME stream is used here because the padding region of d_x_ is only in the PME task.
-        clearDeviceBufferAsync(&d_x_, numAtomsAll_, paddingAllocationSize, pmeStream_);
+        clearDeviceBufferAsync(&d_x_, numAtomsAll_, paddingAllocationSize, *pmeStream_);
     }
 
     reallocateDeviceBuffer(&d_v_, numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
@@ -204,7 +206,7 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
     // since the force buffer ops are not implemented in OpenCL.
     if (GMX_GPU == GMX_GPU_CUDA && d_fCapacity_ != d_fOldCapacity)
     {
-        clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, localStream_);
+        clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, *localStream_);
     }
 
     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
@@ -247,7 +249,7 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<RVec>
                                                 const gmx::ArrayRef<const gmx::RVec> h_data,
                                                 int                                  dataSize,
                                                 AtomLocality                         atomLocality,
-                                                CommandStream                        commandStream)
+                                                const DeviceStream&                  deviceStream)
 {
     GMX_UNUSED_VALUE(dataSize);
 
@@ -255,7 +257,7 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<RVec>
 
     GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");
 
-    GMX_ASSERT(commandStream != nullptr,
+    GMX_ASSERT(deviceStream.stream() != nullptr,
                "No stream is valid for copying with given atom locality.");
     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
@@ -271,7 +273,7 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<RVec>
                    "The host buffer is smaller than the requested copy range.");
 
         copyToDeviceBuffer(&d_data, reinterpret_cast<const RVec*>(&h_data.data()[atomsStartAt]),
-                           atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr);
+                           atomsStartAt, numAtomsToCopy, deviceStream, transferKind_, nullptr);
     }
 
     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
@@ -282,7 +284,7 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_dat
                                                   DeviceBuffer<RVec>       d_data,
                                                   int                      dataSize,
                                                   AtomLocality             atomLocality,
-                                                  CommandStream            commandStream)
+                                                  const DeviceStream&      deviceStream)
 {
     GMX_UNUSED_VALUE(dataSize);
 
@@ -290,7 +292,7 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_dat
 
     GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");
 
-    GMX_ASSERT(commandStream != nullptr,
+    GMX_ASSERT(deviceStream.stream() != nullptr,
                "No stream is valid for copying with given atom locality.");
     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
@@ -306,7 +308,7 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_dat
                    "The host buffer is smaller than the requested copy range.");
 
         copyFromDeviceBuffer(reinterpret_cast<RVec*>(&h_data.data()[atomsStartAt]), &d_data,
-                             atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr);
+                             atomsStartAt, numAtomsToCopy, deviceStream, transferKind_, nullptr);
     }
 
     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
@@ -322,14 +324,14 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<cons
                                                         AtomLocality atomLocality)
 {
     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
-    CommandStream commandStream = xCopyStreams_[atomLocality];
-    GMX_ASSERT(commandStream != nullptr,
+    const DeviceStream* deviceStream = xCopyStreams_[atomLocality];
+    GMX_ASSERT(deviceStream != nullptr,
                "No stream is valid for copying positions with given atom locality.");
 
     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
 
-    copyToDevice(d_x_, h_x, d_xSize_, atomLocality, commandStream);
+    copyToDevice(d_x_, h_x, d_xSize_, atomLocality, *deviceStream);
 
     // markEvent is skipped in OpenCL as:
     //   - it's not needed, copy is done in the same stream as the only consumer task (PME)
@@ -337,7 +339,7 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<cons
     // TODO: remove this by adding an event-mark free flavor of this function
     if (GMX_GPU == GMX_GPU_CUDA)
     {
-        xReadyOnDevice_[atomLocality].markEvent(xCopyStreams_[atomLocality]);
+        xReadyOnDevice_[atomLocality].markEvent(*deviceStream);
     }
 
     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
@@ -387,16 +389,16 @@ GpuEventSynchronizer* StatePropagatorDataGpu::Impl::xUpdatedOnDevice()
 void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality)
 {
     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
-    CommandStream commandStream = xCopyStreams_[atomLocality];
-    GMX_ASSERT(commandStream != nullptr,
+    const DeviceStream* deviceStream = xCopyStreams_[atomLocality];
+    GMX_ASSERT(deviceStream != nullptr,
                "No stream is valid for copying positions with given atom locality.");
 
     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
 
-    copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, commandStream);
+    copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, *deviceStream);
     // Note: unlike copyCoordinatesToGpu this is not used in OpenCL, and the conditional is not needed.
-    xReadyOnHost_[atomLocality].markEvent(commandStream);
+    xReadyOnHost_[atomLocality].markEvent(*deviceStream);
 
     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
@@ -419,15 +421,15 @@ void StatePropagatorDataGpu::Impl::copyVelocitiesToGpu(const gmx::ArrayRef<const
                                                        AtomLocality atomLocality)
 {
     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
-    CommandStream commandStream = vCopyStreams_[atomLocality];
-    GMX_ASSERT(commandStream != nullptr,
+    const DeviceStream* deviceStream = vCopyStreams_[atomLocality];
+    GMX_ASSERT(deviceStream != nullptr,
                "No stream is valid for copying velocities with given atom locality.");
 
     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
 
-    copyToDevice(d_v_, h_v, d_vSize_, atomLocality, commandStream);
-    vReadyOnDevice_[atomLocality].markEvent(commandStream);
+    copyToDevice(d_v_, h_v, d_vSize_, atomLocality, *deviceStream);
+    vReadyOnDevice_[atomLocality].markEvent(*deviceStream);
 
     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
@@ -442,15 +444,15 @@ GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getVelocitiesReadyOnDeviceEv
 void StatePropagatorDataGpu::Impl::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec> h_v, AtomLocality atomLocality)
 {
     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
-    CommandStream commandStream = vCopyStreams_[atomLocality];
-    GMX_ASSERT(commandStream != nullptr,
+    const DeviceStream* deviceStream = vCopyStreams_[atomLocality];
+    GMX_ASSERT(deviceStream != nullptr,
                "No stream is valid for copying velocities with given atom locality.");
 
     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
 
-    copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, commandStream);
-    vReadyOnHost_[atomLocality].markEvent(commandStream);
+    copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, *deviceStream);
+    vReadyOnHost_[atomLocality].markEvent(*deviceStream);
 
     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
@@ -473,15 +475,15 @@ void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx
                                                    AtomLocality atomLocality)
 {
     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
-    CommandStream commandStream = fCopyStreams_[atomLocality];
-    GMX_ASSERT(commandStream != nullptr,
+    const DeviceStream* deviceStream = fCopyStreams_[atomLocality];
+    GMX_ASSERT(deviceStream != nullptr,
                "No stream is valid for copying forces with given atom locality.");
 
     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
 
-    copyToDevice(d_f_, h_f, d_fSize_, atomLocality, commandStream);
-    fReadyOnDevice_[atomLocality].markEvent(commandStream);
+    copyToDevice(d_f_, h_f, d_fSize_, atomLocality, *deviceStream);
+    fReadyOnDevice_[atomLocality].markEvent(*deviceStream);
 
     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
@@ -508,15 +510,15 @@ GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReducedOnDevice()
 void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f, AtomLocality atomLocality)
 {
     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
-    CommandStream commandStream = fCopyStreams_[atomLocality];
-    GMX_ASSERT(commandStream != nullptr,
+    const DeviceStream* deviceStream = fCopyStreams_[atomLocality];
+    GMX_ASSERT(deviceStream != nullptr,
                "No stream is valid for copying forces with given atom locality.");
 
     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
 
-    copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, commandStream);
-    fReadyOnHost_[atomLocality].markEvent(commandStream);
+    copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, *deviceStream);
+    fReadyOnHost_[atomLocality].markEvent(*deviceStream);
 
     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
@@ -529,9 +531,9 @@ void StatePropagatorDataGpu::Impl::waitForcesReadyOnHost(AtomLocality atomLocali
     wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
 }
 
-void* StatePropagatorDataGpu::Impl::getUpdateStream()
+const DeviceStream* StatePropagatorDataGpu::Impl::getUpdateStream()
 {
-    return &updateStream_;
+    return updateStream_;
 }
 
 int StatePropagatorDataGpu::Impl::numAtomsLocal()
@@ -545,9 +547,9 @@ int StatePropagatorDataGpu::Impl::numAtomsAll()
 }
 
 
-StatePropagatorDataGpu::StatePropagatorDataGpu(const void*          pmeStream,
-                                               const void*          localStream,
-                                               const void*          nonLocalStream,
+StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream*  pmeStream,
+                                               const DeviceStream*  localStream,
+                                               const DeviceStream*  nonLocalStream,
                                                const DeviceContext& deviceContext,
                                                GpuApiCallBehavior   transferKind,
                                                int                  paddingSize,
@@ -556,7 +558,7 @@ StatePropagatorDataGpu::StatePropagatorDataGpu(const void*          pmeStream,
 {
 }
 
-StatePropagatorDataGpu::StatePropagatorDataGpu(const void*          pmeStream,
+StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream*  pmeStream,
                                                const DeviceContext& deviceContext,
                                                GpuApiCallBehavior   transferKind,
                                                int                  paddingSize,
@@ -682,7 +684,7 @@ void StatePropagatorDataGpu::waitForcesReadyOnHost(AtomLocality atomLocality)
 }
 
 
-void* StatePropagatorDataGpu::getUpdateStream()
+const DeviceStream* StatePropagatorDataGpu::getUpdateStream()
 {
     return impl_->getUpdateStream();
 }
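With the changes above, StatePropagatorDataGpu::Impl temporarily owns the update stream through the updateStreamOwn_ value member, while every other consumer holds a non-owning const DeviceStream*. A condensed, hypothetical illustration of that split (StreamHolder is not GROMACS code, and whether DeviceStream's destructor releases the raw stream is an assumption, per the "Temporary" comment in the header):

    // Hypothetical StreamHolder condensing the transitional ownership pattern.
    class StreamHolder
    {
    public:
        StreamHolder()
        {
            cudaStream_t stream;
            cudaError_t  stat = cudaStreamCreate(&stream);
            CU_RET_ERR(stat, "CUDA stream creation failed");
            owned_.setStream(stream); // the owning wrapper lives in this object
            borrowed_ = &owned_;      // everything else sees a borrowed pointer
        }

    private:
        DeviceStream        owned_;              // owning value; lifetime tied to the holder
        const DeviceStream* borrowed_ = nullptr; // non-owning view handed out to consumers
    };
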
index e47834cb9ae8890a76536be3b569faaa0e213846..54ce9f331bd4e8670dd9cea2a5951f79e3d3e226 100644 (file)
@@ -366,7 +366,7 @@ static inline int calc_shmem_required_nonbonded(const int               num_thre
  */
 void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu* nb, const InteractionLocality interactionLocality)
 {
-    cudaStream_t stream = nb->stream[interactionLocality];
+    const DeviceStream& deviceStream = nb->deviceStreams[interactionLocality];
 
     /* When we get here all misc operations issued in the local stream as well as
        the local xq H2D are done,
@@ -378,12 +378,13 @@ void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu* nb, const InteractionLocal
     {
         if (interactionLocality == InteractionLocality::Local)
         {
-            cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream);
+            cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, deviceStream.stream());
             CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
         }
         else
         {
-            cudaError_t stat = cudaStreamWaitEvent(stream, nb->misc_ops_and_local_H2D_done, 0);
+            cudaError_t stat =
+                    cudaStreamWaitEvent(deviceStream.stream(), nb->misc_ops_and_local_H2D_done, 0);
             CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
         }
     }
@@ -401,10 +402,10 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
 
     int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 
-    cu_atomdata_t* adat   = nb->atdat;
-    cu_plist_t*    plist  = nb->plist[iloc];
-    cu_timers_t*   t      = nb->timers;
-    cudaStream_t   stream = nb->stream[iloc];
+    cu_atomdata_t*      adat         = nb->atdat;
+    cu_plist_t*         plist        = nb->plist[iloc];
+    cu_timers_t*        t            = nb->timers;
+    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
 
@@ -440,16 +441,16 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
     /* beginning of timed HtoD section */
     if (bDoTime)
     {
-        t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
+        t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
     }
 
     cu_copy_H2D_async(adat->xq + adat_begin,
                       static_cast<const void*>(nbatom->x().data() + adat_begin * 4),
-                      adat_len * sizeof(*adat->xq), stream);
+                      adat_len * sizeof(*adat->xq), deviceStream.stream());
 
     if (bDoTime)
     {
-        t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
+        t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
     }
 
     /* When we get here all misc operations issued in the local stream as well as
@@ -480,11 +481,11 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
  */
 void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc)
 {
-    cu_atomdata_t* adat   = nb->atdat;
-    cu_nbparam_t*  nbp    = nb->nbparam;
-    cu_plist_t*    plist  = nb->plist[iloc];
-    cu_timers_t*   t      = nb->timers;
-    cudaStream_t   stream = nb->stream[iloc];
+    cu_atomdata_t*      adat         = nb->atdat;
+    cu_nbparam_t*       nbp          = nb->nbparam;
+    cu_plist_t*         plist        = nb->plist[iloc];
+    cu_timers_t*        t            = nb->timers;
+    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
 
@@ -522,7 +523,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In
     /* beginning of timed nonbonded calculation section */
     if (bDoTime)
     {
-        t->interaction[iloc].nb_k.openTimingRegion(stream);
+        t->interaction[iloc].nb_k.openTimingRegion(deviceStream);
     }
 
     /* Kernel launch config:
@@ -544,7 +545,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In
     config.blockSize[2]     = num_threads_z;
     config.gridSize[0]      = nblock;
     config.sharedMemorySize = calc_shmem_required_nonbonded(num_threads_z, nb->deviceInfo, nbp);
-    config.stream           = stream;
+    config.stream           = deviceStream.stream();
 
     if (debug)
     {
@@ -567,13 +568,13 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In
 
     if (bDoTime)
     {
-        t->interaction[iloc].nb_k.closeTimingRegion(stream);
+        t->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
     }
 
     if (GMX_NATIVE_WINDOWS)
     {
         /* Windows: force flushing WDDM queue */
-        cudaStreamQuery(stream);
+        cudaStreamQuery(deviceStream.stream());
     }
 }
 
@@ -592,11 +593,11 @@ static inline int calc_shmem_required_prune(const int num_threads_z)
 
 void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
 {
-    cu_atomdata_t* adat   = nb->atdat;
-    cu_nbparam_t*  nbp    = nb->nbparam;
-    cu_plist_t*    plist  = nb->plist[iloc];
-    cu_timers_t*   t      = nb->timers;
-    cudaStream_t   stream = nb->stream[iloc];
+    cu_atomdata_t*      adat         = nb->atdat;
+    cu_nbparam_t*       nbp          = nb->nbparam;
+    cu_plist_t*         plist        = nb->plist[iloc];
+    cu_timers_t*        t            = nb->timers;
+    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
 
@@ -652,7 +653,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     /* beginning of timed prune calculation section */
     if (bDoTime)
     {
-        timer->openTimingRegion(stream);
+        timer->openTimingRegion(deviceStream);
     }
 
     /* Kernel launch config:
@@ -668,7 +669,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     config.blockSize[2]     = num_threads_z;
     config.gridSize[0]      = nblock;
     config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
-    config.stream           = stream;
+    config.stream           = deviceStream.stream();
 
     if (debug)
     {
@@ -704,13 +705,13 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
 
     if (bDoTime)
     {
-        timer->closeTimingRegion(stream);
+        timer->closeTimingRegion(deviceStream);
     }
 
     if (GMX_NATIVE_WINDOWS)
     {
         /* Windows: force flushing WDDM queue */
-        cudaStreamQuery(stream);
+        cudaStreamQuery(deviceStream.stream());
     }
 }
 
@@ -728,10 +729,10 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
 
     /* extract the data */
-    cu_atomdata_t* adat    = nb->atdat;
-    cu_timers_t*   t       = nb->timers;
-    bool           bDoTime = nb->bDoTime;
-    cudaStream_t   stream  = nb->stream[iloc];
+    cu_atomdata_t*      adat         = nb->atdat;
+    cu_timers_t*        t            = nb->timers;
+    bool                bDoTime      = nb->bDoTime;
+    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
 
     /* don't launch non-local copy-back if there was no non-local work to do */
     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
@@ -744,14 +745,14 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     /* beginning of timed D2H section */
     if (bDoTime)
     {
-        t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
+        t->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
     }
 
     /* With DD the local D2H transfer can only start after the non-local
        kernel has finished. */
     if (iloc == InteractionLocality::Local && nb->bUseTwoStreams)
     {
-        stat = cudaStreamWaitEvent(stream, nb->nonlocal_done, 0);
+        stat = cudaStreamWaitEvent(deviceStream.stream(), nb->nonlocal_done, 0);
         CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
     }
 
@@ -761,7 +762,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     if (!stepWork.useGpuFBufferOps)
     {
         cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
-                          (adat_len) * sizeof(*adat->f), stream);
+                          (adat_len) * sizeof(*adat->f), deviceStream.stream());
     }
 
     /* After the non-local D2H is launched the nonlocal_done event can be
@@ -770,7 +771,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
        back first. */
     if (iloc == InteractionLocality::NonLocal)
     {
-        stat = cudaEventRecord(nb->nonlocal_done, stream);
+        stat = cudaEventRecord(nb->nonlocal_done, deviceStream.stream());
         CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
     }
 
@@ -780,20 +781,21 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
         /* DtoH fshift when virial is needed */
         if (stepWork.computeVirial)
         {
-            cu_copy_D2H_async(nb->nbst.fshift, adat->fshift, SHIFTS * sizeof(*nb->nbst.fshift), stream);
+            cu_copy_D2H_async(nb->nbst.fshift, adat->fshift, SHIFTS * sizeof(*nb->nbst.fshift),
+                              deviceStream.stream());
         }
 
         /* DtoH energies */
         if (stepWork.computeEnergy)
         {
-            cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, sizeof(*nb->nbst.e_lj), stream);
-            cu_copy_D2H_async(nb->nbst.e_el, adat->e_el, sizeof(*nb->nbst.e_el), stream);
+            cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, sizeof(*nb->nbst.e_lj), deviceStream.stream());
+            cu_copy_D2H_async(nb->nbst.e_el, adat->e_el, sizeof(*nb->nbst.e_el), deviceStream.stream());
         }
     }
 
     if (bDoTime)
     {
-        t->xf[atomLocality].nb_d2h.closeTimingRegion(stream);
+        t->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
     }
 }
 
@@ -834,7 +836,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
     const int                  numAtomsPerCell = grid.numAtomsPerCell();
     Nbnxm::InteractionLocality interactionLoc  = gpuAtomToInteractionLocality(locality);
 
-    cudaStream_t stream = nb->stream[interactionLoc];
+    const DeviceStream& deviceStream = nb->deviceStreams[interactionLoc];
 
     int numAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
     // avoid empty kernel launch, skip to inserting stream dependency
@@ -845,7 +847,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
 
         // ensure that coordinates are ready on the device before launching the kernel
         GMX_ASSERT(xReadyOnDevice, "Need a valid GpuEventSynchronizer object");
-        xReadyOnDevice->enqueueWaitEvent(stream);
+        xReadyOnDevice->enqueueWaitEvent(deviceStream);
 
         KernelLaunchConfig config;
         config.blockSize[0] = c_bufOpsThreadsPerBlock;
@@ -858,7 +860,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
         GMX_ASSERT(config.gridSize[0] > 0,
                    "Can not have empty grid, early return above avoids this");
         config.sharedMemorySize = 0;
-        config.stream           = stream;
+        config.stream           = deviceStream.stream();
 
         auto kernelFn = setFillerCoords ? nbnxn_gpu_x_to_nbat_x_kernel<true>
                                         : nbnxn_gpu_x_to_nbat_x_kernel<false>;
@@ -873,7 +875,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
         launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs);
     }
 
     // TODO: note that this is not necessary when there are no local atoms, that is:
     // (numAtoms == 0 && interactionLoc == InteractionLocality::Local)
     // but for now we avoid that optimization
     nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
@@ -899,9 +901,9 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLo
     GMX_ASSERT(numAtoms != 0, "Cannot call function with no atoms");
     GMX_ASSERT(totalForcesDevice, "Need a valid totalForcesDevice pointer");
 
-    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
-    cudaStream_t              stream    = nb->stream[iLocality];
-    cu_atomdata_t*            adat      = nb->atdat;
+    const InteractionLocality iLocality    = gpuAtomToInteractionLocality(atomLocality);
+    const DeviceStream&       deviceStream = nb->deviceStreams[iLocality];
+    cu_atomdata_t*            adat         = nb->atdat;
 
     size_t gmx_used_in_debug numDependency = static_cast<size_t>((useGpuFPmeReduction == true))
                                              + static_cast<size_t>((accumulateForce == true));
@@ -911,7 +913,7 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLo
     // Enqueue wait on all dependencies passed
     for (auto const synchronizer : dependencyList)
     {
-        synchronizer->enqueueWaitEvent(stream);
+        synchronizer->enqueueWaitEvent(deviceStream);
     }
 
     /* launch kernel */
@@ -924,7 +926,7 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLo
     config.gridSize[1]  = 1;
     config.gridSize[2]  = 1;
     config.sharedMemorySize = 0;
-    config.stream           = stream;
+    config.stream           = deviceStream.stream();
 
     auto kernelFn = accumulateForce ? nbnxn_gpu_add_nbat_f_to_f_kernel<true, false>
                                     : nbnxn_gpu_add_nbat_f_to_f_kernel<false, false>;
@@ -950,7 +952,7 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLo
     {
         GMX_ASSERT(nb->localFReductionDone != nullptr,
                    "localFReductionDone has to be a valid pointer");
-        nb->localFReductionDone->markEvent(stream);
+        nb->localFReductionDone->markEvent(deviceStream);
     }
 }
 
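A consequence visible throughout this file: GROMACS-level helpers (region timers, event synchronizers, copyToDeviceBuffer) now accept const DeviceStream&, while raw CUDA API calls unwrap the handle with .stream() at the last moment. A hedged sketch of what a timing-region helper presumably does internally; the real implementation lives in gpuregiontimer.cuh, which is not shown in these hunks:

    // Sketch only: record a start event into the wrapped stream, unwrapping
    // the raw handle at the CUDA API boundary.
    static void openTimingRegion(cudaEvent_t startEvent, const DeviceStream& deviceStream)
    {
        cudaError_t stat = cudaEventRecord(startEvent, deviceStream.stream());
        CU_RET_ERR(stat, "cudaEventRecord on timing region start failed");
    }
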
index 666aefc62910b21babf656313f8bd17af9c25385..cbd5f8ce0197b3b434f06c58846f5d392f5384c6 100644 (file)
@@ -448,7 +448,9 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo,
     nb->deviceInfo = deviceInfo;
 
     /* local/non-local GPU streams */
-    stat = cudaStreamCreate(&nb->stream[InteractionLocality::Local]);
+    cudaStream_t localStream;
+    stat = cudaStreamCreate(&localStream);
     CU_RET_ERR(stat, "cudaStreamCreate on stream[InteractionLocality::Local] failed");
+    nb->deviceStreams[InteractionLocality::Local].setStream(localStream);
     if (nb->bUseTwoStreams)
     {
@@ -462,8 +464,9 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo,
         stat = cudaDeviceGetStreamPriorityRange(nullptr, &highest_priority);
         CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
 
-        stat = cudaStreamCreateWithPriority(&nb->stream[InteractionLocality::NonLocal],
-                                            cudaStreamDefault, highest_priority);
+        cudaStream_t nonLocalStream;
+        stat = cudaStreamCreateWithPriority(&nonLocalStream, cudaStreamDefault, highest_priority);
+        nb->deviceStreams[InteractionLocality::NonLocal].setStream(nonLocalStream);
         CU_RET_ERR(stat,
                    "cudaStreamCreateWithPriority on stream[InteractionLocality::NonLocal] failed");
     }
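
In gpu_init() the raw CUDA calls remain, and the resulting handle is handed to the wrapper via setStream(). A condensed sketch of this create-then-adopt pattern; the sketch checks the API status before adopting the handle, which is a slightly safer ordering than in the hunk above:

    // Create the non-local stream with elevated priority, then let the
    // DeviceStream wrapper adopt the raw handle.
    int highestPriority;
    cudaError_t stat = cudaDeviceGetStreamPriorityRange(nullptr, &highestPriority);
    CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");

    cudaStream_t nonLocalStream;
    stat = cudaStreamCreateWithPriority(&nonLocalStream, cudaStreamDefault, highestPriority);
    CU_RET_ERR(stat, "cudaStreamCreateWithPriority failed");
    nb->deviceStreams[InteractionLocality::NonLocal].setStream(nonLocalStream);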
@@ -512,10 +515,10 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo,
 
 void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
 {
-    char         sbuf[STRLEN];
-    bool         bDoTime = (nb->bDoTime && !h_plist->sci.empty());
-    cudaStream_t stream  = nb->stream[iloc];
-    cu_plist_t*  d_plist = nb->plist[iloc];
+    char                sbuf[STRLEN];
+    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
+    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    cu_plist_t*         d_plist      = nb->plist[iloc];
 
     if (d_plist->na_c < 0)
     {
@@ -535,18 +538,18 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
 
     if (bDoTime)
     {
-        iTimers.pl_h2d.openTimingRegion(stream);
+        iTimers.pl_h2d.openTimingRegion(deviceStream);
         iTimers.didPairlistH2D = true;
     }
 
     reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc,
                            DeviceContext());
-    copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream,
+    copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), deviceStream,
                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 
     reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc,
                            DeviceContext());
-    copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream,
+    copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), deviceStream,
                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 
     reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
@@ -554,12 +557,12 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
 
     reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl,
                            &d_plist->excl_nalloc, DeviceContext());
-    copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream,
+    copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), deviceStream,
                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 
     if (bDoTime)
     {
-        iTimers.pl_h2d.closeTimingRegion(stream);
+        iTimers.pl_h2d.closeTimingRegion(deviceStream);
     }
 
     /* the next use of this list will be the first one, so we need to prune */
@@ -569,7 +572,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
 void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
 {
     cu_atomdata_t* adat = nb->atdat;
-    cudaStream_t   ls   = nb->stream[InteractionLocality::Local];
+    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local].stream();
 
     /* only if we have a dynamic box */
     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
@@ -584,7 +587,7 @@ static void nbnxn_cuda_clear_f(NbnxmGpu* nb, int natoms_clear)
 {
     cudaError_t    stat;
     cu_atomdata_t* adat = nb->atdat;
-    cudaStream_t   ls   = nb->stream[InteractionLocality::Local];
+    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local].stream();
 
     stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
     CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
@@ -595,7 +598,7 @@ static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb)
 {
     cudaError_t    stat;
     cu_atomdata_t* adat = nb->atdat;
-    cudaStream_t   ls   = nb->stream[InteractionLocality::Local];
+    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local].stream();
 
     stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
     CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
@@ -618,13 +621,13 @@ void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
 
 void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
 {
-    cudaError_t    stat;
-    int            nalloc, natoms;
-    bool           realloced;
-    bool           bDoTime = nb->bDoTime;
-    cu_timers_t*   timers  = nb->timers;
-    cu_atomdata_t* d_atdat = nb->atdat;
-    cudaStream_t   ls      = nb->stream[InteractionLocality::Local];
+    cudaError_t         stat;
+    int                 nalloc, natoms;
+    bool                realloced;
+    bool                bDoTime      = nb->bDoTime;
+    cu_timers_t*        timers       = nb->timers;
+    cu_atomdata_t*      d_atdat      = nb->atdat;
+    const DeviceStream& deviceStream = nb->deviceStreams[InteractionLocality::Local];
 
     natoms    = nbat->numAtoms();
     realloced = false;
@@ -632,7 +635,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
     if (bDoTime)
     {
         /* time async copy */
-        timers->atdat.openTimingRegion(ls);
+        timers->atdat.openTimingRegion(deviceStream);
     }
 
     /* need to reallocate if we have to copy more atoms than the amount of space
@@ -681,17 +684,17 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
     if (useLjCombRule(nb->nbparam))
     {
         cu_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(),
-                          natoms * sizeof(*d_atdat->lj_comb), ls);
+                          natoms * sizeof(*d_atdat->lj_comb), deviceStream.stream());
     }
     else
     {
         cu_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(),
-                          natoms * sizeof(*d_atdat->atom_types), ls);
+                          natoms * sizeof(*d_atdat->atom_types), deviceStream.stream());
     }
 
     if (bDoTime)
     {
-        timers->atdat.closeTimingRegion(ls);
+        timers->atdat.closeTimingRegion(deviceStream);
     }
 }
 
@@ -725,15 +728,6 @@ void gpu_free(NbnxmGpu* nb)
     CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done");
 
     delete nb->timers;
-    if (nb->bDoTime)
-    {
-        /* The non-local counters/stream (second in the array) are needed only with DD. */
-        for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++)
-        {
-            stat = cudaStreamDestroy(nb->stream[i]);
-            CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
-        }
-    }
 
     if (!useLjCombRule(nb->nbparam))
     {
@@ -822,11 +816,11 @@ gmx_bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
     return ((nb->nbparam->eeltype == eelCuEWALD_ANA) || (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
 }
 
-void* gpu_get_command_stream(NbnxmGpu* nb, const InteractionLocality iloc)
+const DeviceStream* gpu_get_command_stream(NbnxmGpu* nb, const InteractionLocality iloc)
 {
     assert(nb);
 
-    return static_cast<void*>(&nb->stream[iloc]);
+    return &nb->deviceStreams[iloc];
 }
 
 void* gpu_get_xq(NbnxmGpu* nb)
@@ -854,9 +848,9 @@ DeviceBuffer<gmx::RVec> gpu_get_fshift(NbnxmGpu* nb)
 /* TODO  Remove explicit pinning from host arrays from here and manage in a more natural way*/
 void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
 {
-    cudaStream_t stream        = gpu_nbv->stream[InteractionLocality::Local];
-    bool         bDoTime       = gpu_nbv->bDoTime;
-    const int    maxNumColumns = gridSet.numColumnsMax();
+    const DeviceStream& deviceStream  = gpu_nbv->deviceStreams[InteractionLocality::Local];
+    bool                bDoTime       = gpu_nbv->bDoTime;
+    const int           maxNumColumns = gridSet.numColumnsMax();
 
     reallocateDeviceBuffer(&gpu_nbv->cxy_na, maxNumColumns * gridSet.grids().size(),
                            &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, DeviceContext());
@@ -882,15 +876,15 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(deviceStream);
             }
 
-            copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, stream,
+            copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, deviceStream,
                                GpuApiCallBehavior::Async, nullptr);
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(deviceStream);
             }
         }
 
@@ -898,28 +892,30 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv
         {
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(deviceStream);
             }
 
             int* destPtr = &gpu_nbv->cxy_na[maxNumColumns * g];
-            copyToDeviceBuffer(&destPtr, cxy_na, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+            copyToDeviceBuffer(&destPtr, cxy_na, 0, numColumns, deviceStream,
+                               GpuApiCallBehavior::Async, nullptr);
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(deviceStream);
             }
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(deviceStream);
             }
 
             destPtr = &gpu_nbv->cxy_ind[maxNumColumns * g];
-            copyToDeviceBuffer(&destPtr, cxy_ind, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+            copyToDeviceBuffer(&destPtr, cxy_ind, 0, numColumns, deviceStream,
+                               GpuApiCallBehavior::Async, nullptr);
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(deviceStream);
             }
         }
     }
@@ -943,7 +939,7 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int*                  cell,
                                     GpuEventSynchronizer* const localReductionDone)
 {
 
-    cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local];
+    const DeviceStream& deviceStream = gpu_nbv->deviceStreams[InteractionLocality::Local];
 
     GMX_ASSERT(localReductionDone, "localReductionDone should be a valid pointer");
     gpu_nbv->localFReductionDone = localReductionDone;
@@ -952,7 +948,8 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int*                  cell,
     {
         reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc,
                                DeviceContext());
-        copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, stream, GpuApiCallBehavior::Async, nullptr);
+        copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, deviceStream,
+                           GpuApiCallBehavior::Async, nullptr);
     }
 
     return;
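
The setStream(), stream() and synchronize() members used above belong to the new DeviceStream class introduced by this commit in device_stream.h and its CUDA/OpenCL sources; its definition is not part of these hunks. A plausible shape of the CUDA flavour, for orientation only, so the actual class may differ:

    // Hypothetical sketch of the CUDA side of DeviceStream; the real
    // implementation lives in src/gromacs/gpu_utils/device_stream.cu.
    class DeviceStream
    {
    public:
        //! Adopt an externally created stream, as gpu_init() does above.
        void setStream(cudaStream_t stream) { stream_ = stream; }
        //! Expose the raw handle for CUDA API calls.
        cudaStream_t stream() const { return stream_; }
        //! Block the host until all work enqueued in the stream has completed.
        void synchronize() const
        {
            cudaError_t stat = cudaStreamSynchronize(stream_);
            GMX_RELEASE_ASSERT(stat == cudaSuccess, "cudaStreamSynchronize failed");
        }

    private:
        cudaStream_t stream_ = nullptr;
    };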
index da607e442974f3dc946db8f566cd6c2bd66fd222..d2bbfa6b8ef611b7511d26e8e24db443772d524f 100644 (file)
@@ -303,7 +303,7 @@ struct NbnxmGpu
     /*! \brief staging area where fshift/energies get downloaded */
     nb_staging_t nbst;
     /*! \brief local and non-local GPU streams */
-    gmx::EnumerationArray<Nbnxm::InteractionLocality, cudaStream_t> stream = { { nullptr } };
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, DeviceStream> deviceStreams;
 
     /*! \brief Events used for synchronization */
     /*! \{ */
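
Since DeviceStream is a default-constructible class, the member above no longer needs the explicit nullptr initializer that the raw cudaStream_t array carried; indexing is unchanged. For example, as in the hunks above:

    const DeviceStream& localStream = nb->deviceStreams[Nbnxm::InteractionLocality::Local];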
index 65f38daea2b4902e243ccca4a372418f495cb493..f5b3d813da09e7043e864a497d17994730aaa2cb 100644 (file)
@@ -399,7 +399,7 @@ bool gpu_try_finish_task(NbnxmGpu*                nb,
             // GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
 
-            if (!haveStreamTasksCompleted(nb->stream[iLocality]))
+            if (!haveStreamTasksCompleted(nb->deviceStreams[iLocality]))
             {
                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
 
@@ -412,7 +412,7 @@ bool gpu_try_finish_task(NbnxmGpu*                nb,
         }
         else if (haveResultToWaitFor)
         {
-            gpuStreamSynchronize(nb->stream[iLocality]);
+            nb->deviceStreams[iLocality].synchronize();
         }
 
         // TODO: this needs to be moved later because conditional wait could break timing
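
gpu_try_finish_task() keeps its two completion modes; only the stream type changes. Presumably haveStreamTasksCompleted() remains a non-blocking poll (cudaStreamQuery() underneath on CUDA), while DeviceStream::synchronize() replaces the old gpuStreamSynchronize() as the blocking wait. A sketch of the two modes under that assumption:

    // GpuTaskCompletion::Check, a non-blocking poll (presumably
    // cudaStreamQuery() underneath); lets the caller do other work meanwhile:
    bool done = haveStreamTasksCompleted(nb->deviceStreams[iLocality]);

    // GpuTaskCompletion::Wait, a blocking wait for everything enqueued so far:
    nb->deviceStreams[iLocality].synchronize();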
index 822852786c80e0417c320b84b6b76d41aab46a0c..e242771862b439eca885acad1fda3f77b82b496f 100644 (file)
@@ -51,6 +51,7 @@
 #include "gromacs/mdtypes/locality.h"
 
 class DeviceContext;
+class DeviceStream;
 
 struct NbnxmGpu;
 struct gmx_gpu_info_t;
@@ -126,7 +127,7 @@ gmx_bool gpu_is_kernel_ewald_analytical(const NbnxmGpu gmx_unused* nb) GPU_FUNC_
  *  Note: CUDA only.
  */
 CUDA_FUNC_QUALIFIER
-void* gpu_get_command_stream(NbnxmGpu gmx_unused* nb, gmx::InteractionLocality gmx_unused iloc)
+const DeviceStream* gpu_get_command_stream(NbnxmGpu gmx_unused* nb, gmx::InteractionLocality gmx_unused iloc)
         CUDA_FUNC_TERM_WITH_RETURN(nullptr);
 
 /** Returns an opaque pointer to the GPU coordinate+charge array
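
Returning const DeviceStream* instead of void* spares callers the cast back to the native stream type. An illustrative caller, in which the consuming object and its setter are hypothetical:

    // Hypothetical caller-side usage of the now-typed accessor.
    const DeviceStream* deviceStream = gpu_get_command_stream(nb, gmx::InteractionLocality::Local);
    if (deviceStream != nullptr)
    {
        consumer.setStream(*deviceStream); // hypothetical consumer object
    }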
index 013bd093a01e3aff9f67f7706e199bedcf4aab2c..ba0c2ee93974aecee7408f2e3675400d4e019e6b 100644 (file)
@@ -484,10 +484,10 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
     /* local/nonlocal offset and length used for xq and f */
     int adat_begin, adat_len;
 
-    cl_atomdata_t*   adat   = nb->atdat;
-    cl_plist_t*      plist  = nb->plist[iloc];
-    cl_timers_t*     t      = nb->timers;
-    cl_command_queue stream = nb->stream[iloc];
+    cl_atomdata_t*      adat         = nb->atdat;
+    cl_plist_t*         plist        = nb->plist[iloc];
+    cl_timers_t*        t            = nb->timers;
+    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
 
@@ -522,17 +522,17 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
     /* beginning of timed HtoD section */
     if (bDoTime)
     {
-        t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
+        t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
     }
 
     /* HtoD x, q */
-    ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4,
-                       adat_begin * sizeof(float) * 4, adat_len * sizeof(float) * 4, stream,
+    ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin * sizeof(float) * 4,
+                       adat_len * sizeof(float) * 4, deviceStream.stream(),
                        bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
 
     if (bDoTime)
     {
-        t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
+        t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
     }
 
     /* When we get here all misc operations issued in the local stream as well as
@@ -543,7 +543,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
         if (iloc == InteractionLocality::Local)
         {
             cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(
-                    stream, 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
+                    deviceStream.stream(), 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
             GMX_ASSERT(cl_error == CL_SUCCESS,
                        ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
 
@@ -551,13 +551,13 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
              * in the local stream in order to be able to sync with the above event
              * from the non-local stream.
              */
-            cl_error = clFlush(stream);
+            cl_error = clFlush(deviceStream.stream());
             GMX_ASSERT(cl_error == CL_SUCCESS,
                        ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
         }
         else
         {
-            sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
+            sync_ocl_event(deviceStream.stream(), &(nb->misc_ops_and_local_H2D_done));
         }
     }
 }
@@ -583,11 +583,11 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
  */
 void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc)
 {
-    cl_atomdata_t*   adat   = nb->atdat;
-    cl_nbparam_t*    nbp    = nb->nbparam;
-    cl_plist_t*      plist  = nb->plist[iloc];
-    cl_timers_t*     t      = nb->timers;
-    cl_command_queue stream = nb->stream[iloc];
+    cl_atomdata_t*      adat         = nb->atdat;
+    cl_nbparam_t*       nbp          = nb->nbparam;
+    cl_plist_t*         plist        = nb->plist[iloc];
+    cl_timers_t*        t            = nb->timers;
+    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
 
@@ -628,14 +628,14 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb
     /* beginning of timed nonbonded calculation section */
     if (bDoTime)
     {
-        t->interaction[iloc].nb_k.openTimingRegion(stream);
+        t->interaction[iloc].nb_k.openTimingRegion(deviceStream);
     }
 
     /* kernel launch config */
 
     KernelLaunchConfig config;
     config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam);
-    config.stream           = stream;
+    config.stream           = deviceStream.stream();
     config.blockSize[0]     = c_clSize;
     config.blockSize[1]     = c_clSize;
     config.gridSize[0]      = plist->nsci;
@@ -686,7 +686,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb
 
     if (bDoTime)
     {
-        t->interaction[iloc].nb_k.closeTimingRegion(stream);
+        t->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
     }
 }
 
@@ -722,12 +722,12 @@ static inline int calc_shmem_required_prune(const int num_threads_z)
  */
 void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
 {
-    cl_atomdata_t*   adat    = nb->atdat;
-    cl_nbparam_t*    nbp     = nb->nbparam;
-    cl_plist_t*      plist   = nb->plist[iloc];
-    cl_timers_t*     t       = nb->timers;
-    cl_command_queue stream  = nb->stream[iloc];
-    bool             bDoTime = nb->bDoTime;
+    cl_atomdata_t*      adat         = nb->atdat;
+    cl_nbparam_t*       nbp          = nb->nbparam;
+    cl_plist_t*         plist        = nb->plist[iloc];
+    cl_timers_t*        t            = nb->timers;
+    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    bool                bDoTime      = nb->bDoTime;
 
     if (plist->haveFreshList)
     {
@@ -781,7 +781,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     /* beginning of timed prune calculation section */
     if (bDoTime)
     {
-        timer->openTimingRegion(stream);
+        timer->openTimingRegion(deviceStream);
     }
 
     /* Kernel launch config:
@@ -795,7 +795,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     /* kernel launch config */
     KernelLaunchConfig config;
     config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
-    config.stream           = stream;
+    config.stream           = deviceStream.stream();
     config.blockSize[0]     = c_clSize;
     config.blockSize[1]     = c_clSize;
     config.blockSize[2]     = num_threads_z;
@@ -840,7 +840,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
 
     if (bDoTime)
     {
-        timer->closeTimingRegion(stream);
+        timer->closeTimingRegion(deviceStream);
     }
 }
 
@@ -861,10 +861,10 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     /* determine interaction locality from atom locality */
     const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);
 
-    cl_atomdata_t*   adat    = nb->atdat;
-    cl_timers_t*     t       = nb->timers;
-    bool             bDoTime = nb->bDoTime;
-    cl_command_queue stream  = nb->stream[iloc];
+    cl_atomdata_t*      adat         = nb->atdat;
+    cl_timers_t*        t            = nb->timers;
+    bool                bDoTime      = nb->bDoTime;
+    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
 
     /* don't launch non-local copy-back if there was no non-local work to do */
     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
@@ -886,24 +886,24 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     /* beginning of timed D2H section */
     if (bDoTime)
     {
-        t->xf[aloc].nb_d2h.openTimingRegion(stream);
+        t->xf[aloc].nb_d2h.openTimingRegion(deviceStream);
     }
 
     /* With DD the local D2H transfer can only start after the non-local
        has been launched. */
     if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
     {
-        sync_ocl_event(stream, &(nb->nonlocal_done));
+        sync_ocl_event(deviceStream.stream(), &(nb->nonlocal_done));
     }
 
     /* DtoH f */
     ocl_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * DIM, adat->f,
                        adat_begin * DIM * sizeof(nbatom->out[0].f[0]),
-                       adat_len * DIM * sizeof(nbatom->out[0].f[0]), stream,
+                       adat_len * DIM * sizeof(nbatom->out[0].f[0]), deviceStream.stream(),
                        bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
 
     /* kick off work */
-    cl_error = clFlush(stream);
+    cl_error = clFlush(deviceStream.stream());
     GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
 
     /* After the non-local D2H is launched the nonlocal_done event can be
@@ -912,7 +912,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
        data back first. */
     if (iloc == InteractionLocality::NonLocal)
     {
-        cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->nonlocal_done));
+        cl_error = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &(nb->nonlocal_done));
         GMX_ASSERT(cl_error == CL_SUCCESS,
                    ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
         nb->bNonLocalStreamActive = CL_TRUE;
@@ -924,24 +924,25 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
         /* DtoH fshift when virial is needed */
         if (stepWork.computeVirial)
         {
-            ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0, SHIFTS * sizeof(nb->nbst.fshift[0]),
-                               stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+            ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
+                               SHIFTS * sizeof(nb->nbst.fshift[0]), deviceStream.stream(),
+                               bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
         }
 
         /* DtoH energies */
         if (stepWork.computeEnergy)
         {
-            ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, sizeof(float), stream,
+            ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, sizeof(float), deviceStream.stream(),
                                bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
 
-            ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0, sizeof(float), stream,
+            ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0, sizeof(float), deviceStream.stream(),
                                bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
         }
     }
 
     if (bDoTime)
     {
-        t->xf[aloc].nb_d2h.closeTimingRegion(stream);
+        t->xf[aloc].nb_d2h.closeTimingRegion(deviceStream);
     }
 }
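
On the OpenCL path the same accessor name yields the backend-native handle, here a cl_command_queue, so raw API calls keep their shape and only the variable they unwrap changes. A minimal sketch of that boundary, mirroring the hunks above:

    // stream() returns a cl_command_queue on the OpenCL build.
    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
    cl_int cl_error = clFlush(deviceStream.stream());
    GMX_ASSERT(cl_error == CL_SUCCESS,
               ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());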
 
index eb1234d5122a0ad1abc040d003a7bc9a58c93249..fa37263a5b16f695d83db5dbbb059d5c8e333174 100644 (file)
@@ -487,7 +487,7 @@ static void nbnxn_ocl_clear_e_fshift(NbnxmGpu* nb)
 
     cl_int           cl_error;
     cl_atomdata_t*   adat = nb->atdat;
-    cl_command_queue ls   = nb->stream[InteractionLocality::Local];
+    cl_command_queue ls   = nb->deviceStreams[InteractionLocality::Local].stream();
 
     size_t local_work_size[3]  = { 1, 1, 1 };
     size_t global_work_size[3] = { 1, 1, 1 };
@@ -606,10 +606,12 @@ NbnxmGpu* gpu_init(const DeviceInformation*   deviceInfo,
         queue_properties = 0;
     }
 
-    /* local/non-local GPU streams */
-    nb->stream[InteractionLocality::Local] =
+    cl_command_queue localStream =
             clCreateCommandQueue(nb->dev_rundata->deviceContext_.context(),
                                  nb->deviceInfo->oclDeviceId, queue_properties, &cl_error);
+    /* local/non-local GPU streams */
+    nb->deviceStreams[InteractionLocality::Local].setStream(localStream);
+
     if (CL_SUCCESS != cl_error)
     {
         gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d", rank,
@@ -620,9 +622,11 @@ NbnxmGpu* gpu_init(const DeviceInformation*   deviceInfo,
     {
         init_plist(nb->plist[InteractionLocality::NonLocal]);
 
-        nb->stream[InteractionLocality::NonLocal] =
+        cl_command_queue nonLocalStream =
                 clCreateCommandQueue(nb->dev_rundata->deviceContext_.context(),
                                      nb->deviceInfo->oclDeviceId, queue_properties, &cl_error);
+        nb->deviceStreams[InteractionLocality::NonLocal].setStream(nonLocalStream);
+
         if (CL_SUCCESS != cl_error)
         {
             gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
@@ -675,7 +679,7 @@ static void nbnxn_ocl_clear_f(NbnxmGpu* nb, int natoms_clear)
     cl_int gmx_used_in_debug cl_error;
 
     cl_atomdata_t*   atomData = nb->atdat;
-    cl_command_queue ls       = nb->stream[InteractionLocality::Local];
+    cl_command_queue ls       = nb->deviceStreams[InteractionLocality::Local].stream();
     cl_float         value    = 0.0F;
 
     cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float), 0,
@@ -697,7 +701,7 @@ void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
 
     /* kick off buffer clearing kernel to ensure concurrency with constraints/update */
     cl_int gmx_unused cl_error;
-    cl_error = clFlush(nb->stream[InteractionLocality::Local]);
+    cl_error = clFlush(nb->deviceStreams[InteractionLocality::Local].stream());
     GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
 }
 
@@ -708,9 +712,9 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
     // Timing accumulation should happen only if there was work to do
     // because getLastRangeTime() gets skipped with empty lists later
     // which leads to the counter not being reset.
-    bool             bDoTime = (nb->bDoTime && !h_plist->sci.empty());
-    cl_command_queue stream  = nb->stream[iloc];
-    cl_plist_t*      d_plist = nb->plist[iloc];
+    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
+    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    cl_plist_t*         d_plist      = nb->plist[iloc];
 
     if (d_plist->na_c < 0)
     {
@@ -730,7 +734,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
 
     if (bDoTime)
     {
-        iTimers.pl_h2d.openTimingRegion(stream);
+        iTimers.pl_h2d.openTimingRegion(deviceStream);
         iTimers.didPairlistH2D = true;
     }
 
@@ -739,12 +743,12 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
 
     reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc,
                            deviceContext);
-    copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream,
+    copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), deviceStream,
                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 
     reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc,
                            deviceContext);
-    copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream,
+    copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), deviceStream,
                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 
     reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
@@ -752,12 +756,12 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
 
     reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl,
                            &d_plist->excl_nalloc, deviceContext);
-    copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream,
+    copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), deviceStream,
                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 
     if (bDoTime)
     {
-        iTimers.pl_h2d.closeTimingRegion(stream);
+        iTimers.pl_h2d.closeTimingRegion(deviceStream);
     }
 
     /* need to prune the pair list during the next step */
@@ -768,7 +772,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
 void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
 {
     cl_atomdata_t*   adat = nb->atdat;
-    cl_command_queue ls   = nb->stream[InteractionLocality::Local];
+    cl_command_queue ls   = nb->deviceStreams[InteractionLocality::Local].stream();
 
     /* only if we have a dynamic box */
     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
@@ -782,13 +786,13 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
 //! This function is documented in the header file
 void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
 {
-    cl_int           cl_error;
-    int              nalloc, natoms;
-    bool             realloced;
-    bool             bDoTime = nb->bDoTime;
-    cl_timers_t*     timers  = nb->timers;
-    cl_atomdata_t*   d_atdat = nb->atdat;
-    cl_command_queue ls      = nb->stream[InteractionLocality::Local];
+    cl_int              cl_error;
+    int                 nalloc, natoms;
+    bool                realloced;
+    bool                bDoTime      = nb->bDoTime;
+    cl_timers_t*        timers       = nb->timers;
+    cl_atomdata_t*      d_atdat      = nb->atdat;
+    const DeviceStream& deviceStream = nb->deviceStreams[InteractionLocality::Local];
 
     natoms    = nbat->numAtoms();
     realloced = false;
@@ -796,7 +800,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
     if (bDoTime)
     {
         /* time async copy */
-        timers->atdat.openTimingRegion(ls);
+        timers->atdat.openTimingRegion(deviceStream);
     }
 
     /* need to reallocate if we have to copy more atoms than the amount of space
@@ -859,21 +863,21 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
     if (useLjCombRule(nb->nbparam->vdwtype))
     {
         ocl_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(), 0, natoms * sizeof(cl_float2),
-                           ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
+                           deviceStream.stream(), bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
     }
     else
     {
         ocl_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(), 0, natoms * sizeof(int),
-                           ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
+                           deviceStream.stream(), bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
     }
 
     if (bDoTime)
     {
-        timers->atdat.closeTimingRegion(ls);
+        timers->atdat.closeTimingRegion(deviceStream);
     }
 
     /* kick off the tasks enqueued above to ensure concurrency with the search */
-    cl_error = clFlush(ls);
+    cl_error = clFlush(deviceStream.stream());
     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                        ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
 }
@@ -996,14 +1000,6 @@ void gpu_free(NbnxmGpu* nb)
     pfree(nb->nbst.fshift);
     nb->nbst.fshift = nullptr;
 
-    /* Free command queues */
-    clReleaseCommandQueue(nb->stream[InteractionLocality::Local]);
-    nb->stream[InteractionLocality::Local] = nullptr;
-    if (nb->bUseTwoStreams)
-    {
-        clReleaseCommandQueue(nb->stream[InteractionLocality::NonLocal]);
-        nb->stream[InteractionLocality::NonLocal] = nullptr;
-    }
     /* Free other events */
     if (nb->nonlocal_done)
     {
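
The hand-written clReleaseCommandQueue() calls disappear from gpu_free() above, which only makes sense if DeviceStream now owns its queue and releases it, presumably in its destructor in the new device_stream_ocl.cpp. A plausible sketch, for orientation only:

    // Hypothetical OpenCL-side destructor; the real code is in
    // src/gromacs/gpu_utils/device_stream_ocl.cpp and may differ.
    DeviceStream::~DeviceStream()
    {
        if (stream_ != nullptr)
        {
            clReleaseCommandQueue(stream_);
            stream_ = nullptr;
        }
    }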
index 6e3a4101614cfa98a5cafd125abc346c78dc33fc..a3583761fa170669aa713be311f4aac28bffefc1 100644 (file)
@@ -361,7 +361,7 @@ struct NbnxmGpu
     nb_staging_t nbst;
 
     //! local and non-local GPU queues
-    gmx::EnumerationArray<Nbnxm::InteractionLocality, cl_command_queue> stream;
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, DeviceStream> deviceStreams;
 
     /*! \brief Events used for synchronization */
     /*! \{ */
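
As in the CUDA header, storing DeviceStream by value inside an EnumerationArray requires the complete class definition, so this header needs the full device_stream.h include; the forward declaration added to gpu_data_mgmt.h above suffices only for the pointer-returning interface. In short:

    // Forward declaration: sufficient for pointer and reference interfaces.
    class DeviceStream;
    const DeviceStream* gpu_get_command_stream(NbnxmGpu* nb, gmx::InteractionLocality iloc);

    // Storage by value: requires the complete type, i.e. including
    // "gromacs/gpu_utils/device_stream.h" before this declaration.
    gmx::EnumerationArray<Nbnxm::InteractionLocality, DeviceStream> deviceStreams;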