From 8eadec224efff46cb8d0538884ce7d3c1d11c788 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Tue, 18 Feb 2020 01:36:38 +0100 Subject: [PATCH] Make DeviceStream into a class Refs #3314 Refs #3311 Change-Id: Ic270864f0e82af63f91a91c9951bf678795680fa --- src/gromacs/domdec/domdec.cpp | 4 +- src/gromacs/domdec/domdec.h | 5 +- src/gromacs/domdec/gpuhaloexchange.h | 5 +- src/gromacs/domdec/gpuhaloexchange_impl.cpp | 4 +- src/gromacs/domdec/gpuhaloexchange_impl.cu | 30 ++-- src/gromacs/domdec/gpuhaloexchange_impl.cuh | 8 +- src/gromacs/ewald/pme.h | 3 +- .../ewald/pme_coordinate_receiver_gpu.h | 3 +- .../pme_coordinate_receiver_gpu_impl.cpp | 2 +- .../ewald/pme_coordinate_receiver_gpu_impl.cu | 8 +- .../ewald/pme_coordinate_receiver_gpu_impl.h | 4 +- src/gromacs/ewald/pme_force_sender_gpu.h | 4 +- .../ewald/pme_force_sender_gpu_impl.cpp | 2 +- .../ewald/pme_force_sender_gpu_impl.cu | 8 +- src/gromacs/ewald/pme_force_sender_gpu_impl.h | 4 +- src/gromacs/ewald/pme_gpu.cpp | 2 +- src/gromacs/ewald/pme_gpu_3dfft.cu | 2 +- src/gromacs/ewald/pme_gpu_3dfft.h | 4 +- src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp | 10 +- src/gromacs/ewald/pme_gpu_internal.cpp | 95 ++++++------- src/gromacs/ewald/pme_gpu_internal.h | 10 +- src/gromacs/ewald/pme_gpu_timings.cpp | 4 +- src/gromacs/ewald/pme_gpu_types_host_impl.h | 5 +- src/gromacs/ewald/pme_only.cpp | 8 +- src/gromacs/ewald/pme_pp_comm_gpu_impl.cu | 10 +- src/gromacs/ewald/pme_pp_comm_gpu_impl.h | 2 +- src/gromacs/gpu_utils/CMakeLists.txt | 6 + src/gromacs/gpu_utils/cudautils.cuh | 18 +-- src/gromacs/gpu_utils/device_stream.cpp | 58 ++++++++ src/gromacs/gpu_utils/device_stream.cu | 122 ++++++++++++++++ src/gromacs/gpu_utils/device_stream.h | 134 ++++++++++++++++++ src/gromacs/gpu_utils/device_stream_ocl.cpp | 97 +++++++++++++ src/gromacs/gpu_utils/devicebuffer.cuh | 22 +-- src/gromacs/gpu_utils/devicebuffer_ocl.h | 35 ++--- .../gpu_utils/gpueventsynchronizer.cuh | 10 +- .../gpu_utils/gpueventsynchronizer_ocl.h | 10 +- src/gromacs/gpu_utils/gpuregiontimer.cuh | 12 +- src/gromacs/gpu_utils/gpuregiontimer.h | 14 +- src/gromacs/gpu_utils/gpuregiontimer_ocl.h | 6 +- src/gromacs/gpu_utils/gputraits.cuh | 12 +- src/gromacs/gpu_utils/gputraits.h | 2 - src/gromacs/gpu_utils/gputraits_ocl.h | 10 +- src/gromacs/gpu_utils/oclutils.h | 18 +-- .../gpu_utils/tests/typecasts_runner.cu | 7 +- src/gromacs/listed_forces/gpubonded.h | 4 +- src/gromacs/listed_forces/gpubonded_impl.cpp | 2 +- src/gromacs/listed_forces/gpubonded_impl.cu | 24 ++-- src/gromacs/listed_forces/gpubonded_impl.h | 7 +- src/gromacs/listed_forces/gpubondedkernels.cu | 2 +- src/gromacs/mdlib/leapfrog_gpu.cu | 12 +- src/gromacs/mdlib/leapfrog_gpu.cuh | 6 +- src/gromacs/mdlib/lincs_gpu.cu | 24 ++-- src/gromacs/mdlib/lincs_gpu.cuh | 9 +- src/gromacs/mdlib/settle_gpu.cu | 12 +- src/gromacs/mdlib/settle_gpu.cuh | 6 +- src/gromacs/mdlib/tests/constrtestrunners.cu | 13 +- .../mdlib/tests/leapfrogtestrunners.cu | 15 +- src/gromacs/mdlib/tests/settletestrunners.cu | 14 +- src/gromacs/mdlib/update_constrain_gpu.h | 8 +- .../mdlib/update_constrain_gpu_impl.cpp | 2 +- .../mdlib/update_constrain_gpu_impl.cu | 23 ++- src/gromacs/mdlib/update_constrain_gpu_impl.h | 8 +- src/gromacs/mdrun/md.cpp | 17 ++- src/gromacs/mdrun/runner.cpp | 10 +- .../mdtypes/state_propagator_data_gpu.h | 14 +- .../state_propagator_data_gpu_impl.cpp | 10 +- .../mdtypes/state_propagator_data_gpu_impl.h | 40 +++--- .../state_propagator_data_gpu_impl_gpu.cpp | 108 +++++++------- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 102 ++++++------- 
.../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu | 99 +++++++------ src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h | 2 +- src/gromacs/nbnxm/gpu_common.h | 4 +- src/gromacs/nbnxm/gpu_data_mgmt.h | 3 +- src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp | 85 +++++------ .../nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp | 66 ++++----- src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h | 2 +- 76 files changed, 1000 insertions(+), 592 deletions(-) create mode 100644 src/gromacs/gpu_utils/device_stream.cpp create mode 100644 src/gromacs/gpu_utils/device_stream.cu create mode 100644 src/gromacs/gpu_utils/device_stream.h create mode 100644 src/gromacs/gpu_utils/device_stream_ocl.cpp diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp index e020a1405b..ebcc92bf2e 100644 --- a/src/gromacs/domdec/domdec.cpp +++ b/src/gromacs/domdec/domdec.cpp @@ -3203,8 +3203,8 @@ gmx_bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRefcomm; const gmx_domdec_comm_dim_t& cd = comm.cd[0]; const gmx_domdec_ind_t& ind = cd.ind[pulse_]; @@ -167,7 +166,7 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo GMX_ASSERT(ind.index.size() == h_indexMap_.size(), "Size mismatch"); std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin()); - copyToDeviceBuffer(&d_indexMap_, h_indexMap_.data(), 0, newSize, stream, + copyToDeviceBuffer(&d_indexMap_, h_indexMap_.data(), 0, newSize, nonLocalStream_, GpuApiCallBehavior::Async, nullptr); // This rank will push data to its neighbor, so needs to know @@ -215,7 +214,7 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box config.gridSize[1] = 1; config.gridSize[2] = 1; config.sharedMemorySize = 0; - config.stream = nonLocalStream_; + config.stream = nonLocalStream_.stream(); const float3* sendBuf = d_sendBuf_; const float3* d_x = d_x_; @@ -264,7 +263,7 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) if (!accumulateForces) { // Clear local portion of force array (in local stream) - cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_); + cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_.stream()); } // ensure non-local stream waits for local stream, due to dependence on @@ -286,7 +285,7 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) config.gridSize[1] = 1; config.gridSize[2] = 1; config.sharedMemorySize = 0; - config.stream = nonLocalStream_; + config.stream = nonLocalStream_.stream(); const float3* recvBuf = d_recvBuf_; const int* indexMap = d_indexMap_; @@ -373,8 +372,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr, int recvRank) { - cudaError_t stat; - cudaStream_t stream = nonLocalStream_; + cudaError_t stat; // We asynchronously push data to remote rank. The remote // destination pointer has already been set in the init fn. We @@ -386,7 +384,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr, if (sendSize > 0) { stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize * DIM * sizeof(float), - cudaMemcpyDeviceToDevice, stream); + cudaMemcpyDeviceToDevice, nonLocalStream_.stream()); CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed"); } @@ -397,13 +395,13 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr, // to its stream. 
GpuEventSynchronizer* haloDataTransferRemote; - haloDataTransferLaunched_->markEvent(stream); + haloDataTransferLaunched_->markEvent(nonLocalStream_); MPI_Sendrecv(&haloDataTransferLaunched_, sizeof(GpuEventSynchronizer*), MPI_BYTE, sendRank, 0, &haloDataTransferRemote, sizeof(GpuEventSynchronizer*), MPI_BYTE, recvRank, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE); - haloDataTransferRemote->enqueueWaitEvent(stream); + haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_); #else GMX_UNUSED_VALUE(sendRank); GMX_UNUSED_VALUE(recvRank); @@ -419,8 +417,8 @@ GpuEventSynchronizer* GpuHaloExchange::Impl::getForcesReadyOnDeviceEvent() GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, const DeviceContext& deviceContext, - void* localStream, - void* nonLocalStream, + const DeviceStream& localStream, + const DeviceStream& nonLocalStream, int pulse) : dd_(dd), sendRankX_(dd->neighbor[0][1]), @@ -431,8 +429,8 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, haloDataTransferLaunched_(new GpuEventSynchronizer()), mpi_comm_mysim_(mpi_comm_mysim), deviceContext_(deviceContext), - localStream_(*static_cast(localStream)), - nonLocalStream_(*static_cast(nonLocalStream)), + localStream_(localStream), + nonLocalStream_(nonLocalStream), pulse_(pulse) { @@ -466,8 +464,8 @@ GpuHaloExchange::Impl::~Impl() GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, const DeviceContext& deviceContext, - void* localStream, - void* nonLocalStream, + const DeviceStream& localStream, + const DeviceStream& nonLocalStream, int pulse) : impl_(new Impl(dd, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse)) { diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh index ba22bc5262..9a033ed54f 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh @@ -79,8 +79,8 @@ public: Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, const DeviceContext& deviceContext, - void* localStream, - void* nonLocalStream, + const DeviceStream& localStream, + const DeviceStream& nonLocalStream, int pulse); ~Impl(); @@ -185,9 +185,9 @@ private: //! GPU context object const DeviceContext& deviceContext_; //! CUDA stream for local non-bonded calculations - cudaStream_t localStream_ = nullptr; + const DeviceStream& localStream_; //! CUDA stream for non-local non-bonded calculations - cudaStream_t nonLocalStream_ = nullptr; + const DeviceStream& nonLocalStream_; //! full coordinates buffer in GPU memory float3* d_x_ = nullptr; //! full forces buffer in GPU memory diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index 1c3cb9b774..8aa2c079b1 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -72,6 +72,7 @@ struct gmx_wallcycle; struct NumPmeDomains; class DeviceContext; +class DeviceStream; enum class GpuTaskCompletion; class PmeGpuProgram; class GpuEventSynchronizer; @@ -433,7 +434,7 @@ GPU_FUNC_QUALIFIER void* pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT * \param[in] pme The PME data structure. * \returns Pointer to GPU stream object. */ -GPU_FUNC_QUALIFIER void* pme_gpu_get_device_stream(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme)) +GPU_FUNC_QUALIFIER const DeviceStream* pme_gpu_get_device_stream(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr); /*! 
\brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h index b5d02a719e..144bd27fdd 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h @@ -47,6 +47,7 @@ #include "gromacs/utility/classhelpers.h" #include "gromacs/utility/gmxmpi.h" +class DeviceStream; struct PpRanks; namespace gmx @@ -64,7 +65,7 @@ public: * \param[in] comm Communicator used for simulation * \param[in] ppRanks List of PP ranks */ - PmeCoordinateReceiverGpu(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks); + PmeCoordinateReceiverGpu(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks); ~PmeCoordinateReceiverGpu(); /*! \brief diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp index 0cb848e6c0..b0da71cc47 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp @@ -62,7 +62,7 @@ class PmeCoordinateReceiverGpu::Impl }; /*!\brief Constructor stub. */ -PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const void* /* pmeStream */, +PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const DeviceStream& /* pmeStream */, MPI_Comm /* comm */, gmx::ArrayRef /* ppRanks */) : impl_(nullptr) diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu index b2e7fa009d..db81fb7b0a 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu @@ -55,8 +55,10 @@ namespace gmx { -PmeCoordinateReceiverGpu::Impl::Impl(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks) : - pmeStream_(*static_cast(pmeStream)), +PmeCoordinateReceiverGpu::Impl::Impl(const DeviceStream& pmeStream, + MPI_Comm comm, + gmx::ArrayRef ppRanks) : + pmeStream_(pmeStream), comm_(comm), ppRanks_(ppRanks) { @@ -122,7 +124,7 @@ void PmeCoordinateReceiverGpu::Impl::enqueueWaitReceiveCoordinatesFromPpCudaDire } } -PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const void* pmeStream, +PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks) : impl_(new Impl(pmeStream, comm, ppRanks)) diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h index 4f3bbe2e4e..e1186a2f3a 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h @@ -62,7 +62,7 @@ public: * \param[in] comm Communicator used for simulation * \param[in] ppRanks List of PP ranks */ - Impl(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks); + Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks); ~Impl(); /*! \brief @@ -84,7 +84,7 @@ public: private: //! CUDA stream for PME operations - cudaStream_t pmeStream_ = nullptr; + const DeviceStream& pmeStream_; //! communicator for simulation MPI_Comm comm_; //! list of PP ranks diff --git a/src/gromacs/ewald/pme_force_sender_gpu.h b/src/gromacs/ewald/pme_force_sender_gpu.h index c774994824..df8e1873f8 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu.h +++ b/src/gromacs/ewald/pme_force_sender_gpu.h @@ -46,6 +46,8 @@ #include "gromacs/utility/classhelpers.h" #include "gromacs/utility/gmxmpi.h" +class DeviceStream; + /*! 
\libinternal * \brief Contains information about the PP ranks that partner this PME rank. */ struct PpRanks @@ -73,7 +75,7 @@ public: * \param[in] comm Communicator used for simulation * \param[in] ppRanks List of PP ranks */ - PmeForceSenderGpu(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks); + PmeForceSenderGpu(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks); ~PmeForceSenderGpu(); /*! \brief diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp index b4a531fed9..3ae502ccd8 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp +++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp @@ -61,7 +61,7 @@ class PmeForceSenderGpu::Impl }; /*!\brief Constructor stub. */ -PmeForceSenderGpu::PmeForceSenderGpu(const void* /*pmeStream */, +PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream& /*pmeStream */, MPI_Comm /* comm */, gmx::ArrayRef /* ppRanks */) : impl_(nullptr) diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu index 0ad8fbf123..6e6d21eaf2 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu +++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu @@ -55,8 +55,8 @@ namespace gmx { /*! \brief Create PME-PP GPU communication object */ -PmeForceSenderGpu::Impl::Impl(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks) : - pmeStream_(*static_cast(pmeStream)), +PmeForceSenderGpu::Impl::Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks) : + pmeStream_(pmeStream), comm_(comm), ppRanks_(ppRanks) { @@ -106,7 +106,9 @@ void PmeForceSenderGpu::Impl::sendFToPpCudaDirect(int ppRank) #endif } -PmeForceSenderGpu::PmeForceSenderGpu(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks) : +PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream& pmeStream, + MPI_Comm comm, + gmx::ArrayRef ppRanks) : impl_(new Impl(pmeStream, comm, ppRanks)) { } diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.h b/src/gromacs/ewald/pme_force_sender_gpu_impl.h index 5c1271b2dd..91fe1c1140 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu_impl.h +++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.h @@ -61,7 +61,7 @@ public: * \param[in] comm Communicator used for simulation * \param[in] ppRanks List of PP ranks */ - Impl(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks); + Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks); ~Impl(); /*! \brief @@ -78,7 +78,7 @@ public: private: //! CUDA stream for PME operations - cudaStream_t pmeStream_ = nullptr; + const DeviceStream& pmeStream_; //! Event triggered when to allow remote PP stream to syn with pme stream GpuEventSynchronizer pmeSync_; //! 
communicator for simulation diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp index 4c4ed4851d..cbcab23b1a 100644 --- a/src/gromacs/ewald/pme_gpu.cpp +++ b/src/gromacs/ewald/pme_gpu.cpp @@ -433,7 +433,7 @@ void pme_gpu_set_device_x(const gmx_pme_t* pme, DeviceBuffer d_x) pme_gpu_set_kernelparam_coordinates(pme->gpu, d_x); } -void* pme_gpu_get_device_stream(const gmx_pme_t* pme) +const DeviceStream* pme_gpu_get_device_stream(const gmx_pme_t* pme) { if (!pme || !pme_gpu_active(pme)) { diff --git a/src/gromacs/ewald/pme_gpu_3dfft.cu b/src/gromacs/ewald/pme_gpu_3dfft.cu index 2b30dcdef1..9f9578962e 100644 --- a/src/gromacs/ewald/pme_gpu_3dfft.cu +++ b/src/gromacs/ewald/pme_gpu_3dfft.cu @@ -104,7 +104,7 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu) realGridSizePaddedTotal, CUFFT_C2R, batch); handleCufftError(result, "cufftPlanMany C2R plan failure"); - cudaStream_t stream = pmeGpu->archSpecific->pmeStream; + cudaStream_t stream = pmeGpu->archSpecific->pmeStream_.stream(); GMX_RELEASE_ASSERT(stream, "Using the default CUDA stream for PME cuFFT"); result = cufftSetStream(planR2C_, stream); diff --git a/src/gromacs/ewald/pme_gpu_3dfft.h b/src/gromacs/ewald/pme_gpu_3dfft.h index 07d3b1af57..fc6d67a935 100644 --- a/src/gromacs/ewald/pme_gpu_3dfft.h +++ b/src/gromacs/ewald/pme_gpu_3dfft.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -93,7 +93,7 @@ private: #elif GMX_GPU == GMX_GPU_OPENCL clfftPlanHandle planR2C_; clfftPlanHandle planC2R_; - std::vector commandStreams_; + std::vector deviceStreams_; cl_mem realGrid_; cl_mem complexGrid_; #endif diff --git a/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp b/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp index c6e1b6448c..b341a27b82 100644 --- a/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp +++ b/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp @@ -81,7 +81,7 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu) "Complex padding not implemented"); } cl_context context = pmeGpu->archSpecific->deviceContext_.context(); - commandStreams_.push_back(pmeGpu->archSpecific->pmeStream); + deviceStreams_.push_back(pmeGpu->archSpecific->pmeStream_.stream()); realGrid_ = kernelParamsPtr->grid.d_realGrid; complexGrid_ = kernelParamsPtr->grid.d_fourierGrid; const bool performOutOfPlaceFFT = pmeGpu->archSpecific->performOutOfPlaceFFT; @@ -124,9 +124,9 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu) handleClfftError(clfftSetPlanOutStride(planC2R_, dims, realGridStrides.data()), "clFFT stride setting failure"); - handleClfftError(clfftBakePlan(planR2C_, commandStreams_.size(), commandStreams_.data(), nullptr, nullptr), + handleClfftError(clfftBakePlan(planR2C_, deviceStreams_.size(), deviceStreams_.data(), nullptr, nullptr), "clFFT precompiling failure"); - handleClfftError(clfftBakePlan(planC2R_, commandStreams_.size(), commandStreams_.data(), nullptr, nullptr), + handleClfftError(clfftBakePlan(planC2R_, deviceStreams_.size(), deviceStreams_.data(), nullptr, nullptr), "clFFT precompiling failure"); // TODO: implement solve kernel as R2C FFT callback @@ -166,8 +166,8 @@ void GpuParallel3dFft::perform3dFft(gmx_fft_direction dir, CommandEvent* timingE GMX_THROW( gmx::NotImplementedError("The chosen 3D-FFT case is not implemented on GPUs")); } - handleClfftError(clfftEnqueueTransform(plan, direction, commandStreams_.size(), - commandStreams_.data(), waitEvents.size(), waitEvents.data(), + handleClfftError(clfftEnqueueTransform(plan, direction, deviceStreams_.size(), + deviceStreams_.data(), waitEvents.size(), waitEvents.data(), timingEvent, inputGrids, outputGrids, tempBuffer), "clFFT execution failure"); } diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index dd62e8c4cd..822109de4c 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -135,7 +135,7 @@ int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu) void pme_gpu_synchronize(const PmeGpu* pmeGpu) { - gpuStreamSynchronize(pmeGpu->archSpecific->pmeStream); + pmeGpu->archSpecific->pmeStream_.synchronize(); } void pme_gpu_alloc_energy_virial(PmeGpu* pmeGpu) @@ -156,7 +156,7 @@ void pme_gpu_free_energy_virial(PmeGpu* pmeGpu) void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu) { clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy, 0, - c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream); + c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_); } void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu) @@ -188,7 +188,7 @@ void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu) } /* TODO: pin original buffer instead! 
*/ copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, pmeGpu->staging.h_splineModuli, - 0, newSplineValuesSize, pmeGpu->archSpecific->pmeStream, + 0, newSplineValuesSize, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } @@ -219,7 +219,7 @@ void pme_gpu_copy_input_forces(PmeGpu* pmeGpu) GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU"); float* h_forcesFloat = reinterpret_cast(pmeGpu->staging.h_forces.data()); copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, h_forcesFloat, 0, - DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream, + DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } @@ -228,7 +228,7 @@ void pme_gpu_copy_output_forces(PmeGpu* pmeGpu) GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU"); float* h_forcesFloat = reinterpret_cast(pmeGpu->staging.h_forces.data()); copyFromDeviceBuffer(h_forcesFloat, &pmeGpu->kernelParams->atoms.d_forces, 0, - DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream, + DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } @@ -243,7 +243,7 @@ void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_ pmeGpu->archSpecific->deviceContext_); copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, const_cast(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms, - pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr); + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); if (c_usePadding) { const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms; @@ -251,7 +251,7 @@ void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_ if (paddingCount > 0) { clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex, - paddingCount, pmeGpu->archSpecific->pmeStream); + paddingCount, pmeGpu->archSpecific->pmeStream_); } } } @@ -360,7 +360,7 @@ void pme_gpu_free_grids(const PmeGpu* pmeGpu) void pme_gpu_clear_grids(const PmeGpu* pmeGpu) { clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid, 0, - pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream); + pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream_); } void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu) @@ -393,10 +393,10 @@ void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu) allocateDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, newFractShiftsSize, pmeGpu->archSpecific->deviceContext_); copyToDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, pmeGpu->common->fsh.data(), 0, - newFractShiftsSize, pmeGpu->archSpecific->pmeStream, + newFractShiftsSize, pmeGpu->archSpecific->pmeStream_, GpuApiCallBehavior::Async, nullptr); copyToDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, pmeGpu->common->nn.data(), 0, - newFractShiftsSize, pmeGpu->archSpecific->pmeStream, + newFractShiftsSize, pmeGpu->archSpecific->pmeStream_, GpuApiCallBehavior::Async, nullptr); #endif } @@ -417,21 +417,21 @@ void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu) bool pme_gpu_stream_query(const PmeGpu* pmeGpu) { - return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream); + return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream_); } void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, float* h_grid) { 
copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid, h_grid, 0, pmeGpu->archSpecific->realGridSize, - pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr); + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid) { copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid, 0, - pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream, + pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); - pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream); + pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream_); } void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu) @@ -441,11 +441,11 @@ void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu) const size_t splinesCount = DIM * nAtomsPadded * pmeGpu->common->pme_order; auto* kernelParamsPtr = pmeGpu->kernelParams.get(); copyFromDeviceBuffer(pmeGpu->staging.h_dtheta, &kernelParamsPtr->atoms.d_dtheta, 0, splinesCount, - pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr); + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); copyFromDeviceBuffer(pmeGpu->staging.h_theta, &kernelParamsPtr->atoms.d_theta, 0, splinesCount, - pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr); + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); copyFromDeviceBuffer(pmeGpu->staging.h_gridlineIndices, &kernelParamsPtr->atoms.d_gridlineIndices, - 0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream, + 0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } @@ -459,20 +459,20 @@ void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu) { // TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0, - pmeGpu->nAtomsAlloc * DIM, pmeGpu->archSpecific->pmeStream); + pmeGpu->nAtomsAlloc * DIM, pmeGpu->archSpecific->pmeStream_); clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0, pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM, - pmeGpu->archSpecific->pmeStream); + pmeGpu->archSpecific->pmeStream_); clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0, pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM, - pmeGpu->archSpecific->pmeStream); + pmeGpu->archSpecific->pmeStream_); } copyToDeviceBuffer(&kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, 0, splinesCount, - pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr); + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); copyToDeviceBuffer(&kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, 0, splinesCount, - pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr); + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); copyToDeviceBuffer(&kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices, - 0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream, + 0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } @@ -529,35 +529,25 @@ void pme_gpu_init_internal(PmeGpu* pmeGpu) int highest_priority, lowest_priority; stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, 
&highest_priority); CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed"); - stat = cudaStreamCreateWithPriority(&pmeGpu->archSpecific->pmeStream, + cudaStream_t stream; + stat = cudaStreamCreateWithPriority(&stream, cudaStreamDefault, // cudaStreamNonBlocking, highest_priority); + pmeGpu->archSpecific->pmeStream_.setStream(stream); CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed"); #elif GMX_GPU == GMX_GPU_OPENCL cl_command_queue_properties queueProperties = pmeGpu->archSpecific->useTiming ? CL_QUEUE_PROFILING_ENABLE : 0; cl_device_id device_id = pmeGpu->deviceInfo->oclDeviceId; cl_int clError; - pmeGpu->archSpecific->pmeStream = clCreateCommandQueue( - pmeGpu->archSpecific->deviceContext_.context(), device_id, queueProperties, &clError); - if (clError != CL_SUCCESS) - { - GMX_THROW(gmx::InternalError("Failed to create PME command queue")); - } -#endif -} + pmeGpu->archSpecific->pmeStream_.setStream(clCreateCommandQueue( + pmeGpu->archSpecific->deviceContext_.context(), device_id, queueProperties, &clError)); + -void pme_gpu_destroy_specific(const PmeGpu* pmeGpu) -{ -#if GMX_GPU == GMX_GPU_CUDA - /* Destroy the CUDA stream */ - cudaError_t stat = cudaStreamDestroy(pmeGpu->archSpecific->pmeStream); - CU_RET_ERR(stat, "PME cudaStreamDestroy error"); -#elif GMX_GPU == GMX_GPU_OPENCL - cl_int clError = clReleaseCommandQueue(pmeGpu->archSpecific->pmeStream); if (clError != CL_SUCCESS) { - gmx_warning("Failed to destroy PME command queue"); + GMX_THROW(gmx::InternalError( + gmx::formatString("Failed to create PME command queue (OpenCL error %d)", clError).c_str())); } #endif } @@ -979,9 +969,6 @@ void pme_gpu_destroy(PmeGpu* pmeGpu) pme_gpu_destroy_3dfft(pmeGpu); - /* Free the GPU-framework specific data last */ - pme_gpu_destroy_specific(pmeGpu); - delete pmeGpu; } @@ -1205,7 +1192,7 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, "Need a valid coordinate synchronizer on PP+PME ranks with CUDA."); if (xReadyOnDevice) { - xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream); + xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_); } const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock; @@ -1217,7 +1204,7 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, config.blockSize[2] = atomsPerBlock; config.gridSize[0] = dimGrid.first; config.gridSize[1] = dimGrid.second; - config.stream = pmeGpu->archSpecific->pmeStream; + config.stream = pmeGpu->archSpecific->pmeStream_.stream(); int timingId; PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr; @@ -1285,7 +1272,7 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd if (copyInputAndOutputGrid) { copyToDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, h_gridFloat, 0, - pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream, + pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } @@ -1335,7 +1322,7 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd config.gridSize[1] = (pmeGpu->kernelParams->grid.complexGridSize[middleDim] + gridLinesPerBlock - 1) / gridLinesPerBlock; config.gridSize[2] = pmeGpu->kernelParams->grid.complexGridSize[majorDim]; - config.stream = pmeGpu->archSpecific->pmeStream; + config.stream = pmeGpu->archSpecific->pmeStream_.stream(); int timingId = gtPME_SOLVE; PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr; @@ -1366,13 +1353,13 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd { 
copyFromDeviceBuffer(pmeGpu->staging.h_virialAndEnergy, &kernelParamsPtr->constants.d_virialAndEnergy, 0, c_virialAndEnergyCount, - pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr); + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } if (copyInputAndOutputGrid) { copyFromDeviceBuffer(h_gridFloat, &kernelParamsPtr->grid.d_fourierGrid, 0, - pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream, + pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } } @@ -1457,7 +1444,7 @@ void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid) config.blockSize[2] = atomsPerBlock; config.gridSize[0] = dimGrid.first; config.gridSize[1] = dimGrid.second; - config.stream = pmeGpu->archSpecific->pmeStream; + config.stream = pmeGpu->archSpecific->pmeStream_.stream(); // TODO test different cache configs @@ -1483,7 +1470,7 @@ void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid) if (pmeGpu->settings.useGpuForceReduction) { - pmeGpu->archSpecific->pmeForcesReady.markEvent(pmeGpu->archSpecific->pmeStream); + pmeGpu->archSpecific->pmeForcesReady.markEvent(pmeGpu->archSpecific->pmeStream_); } else { @@ -1515,11 +1502,11 @@ void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBufferkernelParams->atoms.d_coordinates = d_x; } -void* pme_gpu_get_stream(const PmeGpu* pmeGpu) +const DeviceStream* pme_gpu_get_stream(const PmeGpu* pmeGpu) { if (pmeGpu) { - return static_cast(&pmeGpu->archSpecific->pmeStream); + return &pmeGpu->archSpecific->pmeStream_; } else { diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h index 67a1bc3d1c..b515e3b222 100644 --- a/src/gromacs/ewald/pme_gpu_internal.h +++ b/src/gromacs/ewald/pme_gpu_internal.h @@ -313,14 +313,6 @@ void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu); */ void pme_gpu_init_internal(PmeGpu* pmeGpu); -/*! \libinternal \brief - * Destroys the PME GPU-framework specific data. - * Should be called last in the PME GPU destructor. - * - * \param[in] pmeGpu The PME GPU structure. - */ -void pme_gpu_destroy_specific(const PmeGpu* pmeGpu); - /*! \libinternal \brief * Initializes the CUDA FFT structures. * @@ -405,7 +397,7 @@ GPU_FUNC_QUALIFIER void* pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_A * \param[in] pmeGpu The PME GPU structure. * \returns Pointer to stream object. */ -GPU_FUNC_QUALIFIER void* pme_gpu_get_stream(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) +GPU_FUNC_QUALIFIER const DeviceStream* pme_gpu_get_stream(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr); /*! 
\brief Return pointer to the sync object triggered after the PME force calculation completion diff --git a/src/gromacs/ewald/pme_gpu_timings.cpp b/src/gromacs/ewald/pme_gpu_timings.cpp index 3680631c57..3a1f457468 100644 --- a/src/gromacs/ewald/pme_gpu_timings.cpp +++ b/src/gromacs/ewald/pme_gpu_timings.cpp @@ -61,7 +61,7 @@ void pme_gpu_start_timing(const PmeGpu* pmeGpu, size_t PMEStageId) { GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(), "Wrong PME GPU timing event index"); - pmeGpu->archSpecific->timingEvents[PMEStageId].openTimingRegion(pmeGpu->archSpecific->pmeStream); + pmeGpu->archSpecific->timingEvents[PMEStageId].openTimingRegion(pmeGpu->archSpecific->pmeStream_); } } @@ -71,7 +71,7 @@ void pme_gpu_stop_timing(const PmeGpu* pmeGpu, size_t PMEStageId) { GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(), "Wrong PME GPU timing event index"); - pmeGpu->archSpecific->timingEvents[PMEStageId].closeTimingRegion(pmeGpu->archSpecific->pmeStream); + pmeGpu->archSpecific->timingEvents[PMEStageId].closeTimingRegion(pmeGpu->archSpecific->pmeStream_); } } diff --git a/src/gromacs/ewald/pme_gpu_types_host_impl.h b/src/gromacs/ewald/pme_gpu_types_host_impl.h index 44ca3fd3c3..a019a7c031 100644 --- a/src/gromacs/ewald/pme_gpu_types_host_impl.h +++ b/src/gromacs/ewald/pme_gpu_types_host_impl.h @@ -74,8 +74,6 @@ struct PmeGpuSpecific * \param[in] deviceContext GPU device context. */ PmeGpuSpecific(const DeviceContext& deviceContext) : deviceContext_(deviceContext) {} - /*! \brief The GPU stream where everything related to the PME happens. */ - CommandStream pmeStream; /*! \brief * A handle to the GPU context. @@ -85,6 +83,9 @@ struct PmeGpuSpecific */ const DeviceContext& deviceContext_; + /*! \brief The GPU stream where everything related to the PME happens. */ + DeviceStream pmeStream_; + /* Synchronization events */ /*! \brief Triggered after the PME Force Calculations have been completed */ GpuEventSynchronizer pmeForcesReady; diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp index 845b1a33ec..fe51deb5fc 100644 --- a/src/gromacs/ewald/pme_only.cpp +++ b/src/gromacs/ewald/pme_only.cpp @@ -629,16 +629,16 @@ int gmx_pmeonly(struct gmx_pme_t* pme, const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed); if (useGpuForPme) { - const void* commandStream = pme_gpu_get_device_stream(pme); + const DeviceStream& deviceStream = *pme_gpu_get_device_stream(pme); changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy()); changePinningPolicy(&pme_pp->x, pme_get_pinning_policy()); if (c_enableGpuPmePpComms) { pme_pp->pmeCoordinateReceiverGpu = std::make_unique( - commandStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks); + deviceStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks); pme_pp->pmeForceSenderGpu = std::make_unique( - commandStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks); + deviceStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks); } GMX_RELEASE_ASSERT( deviceContext != nullptr, @@ -646,7 +646,7 @@ int gmx_pmeonly(struct gmx_pme_t* pme, // TODO: Special PME-only constructor is used here. There is no mechanism to prevent from using the other constructor here. // This should be made safer. 
stateGpu = std::make_unique( - commandStream, *deviceContext, GpuApiCallBehavior::Async, + &deviceStream, *deviceContext, GpuApiCallBehavior::Async, pme_gpu_get_padding_size(pme), wcycle); } diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu index 91962ff947..f5aac2981a 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu @@ -64,7 +64,9 @@ PmePpCommGpu::Impl::Impl(MPI_Comm comm, int pmeRank, const DeviceContext& device GMX_RELEASE_ASSERT( GMX_THREAD_MPI, "PME-PP GPU Communication is currently only supported with thread-MPI enabled"); - cudaStreamCreate(&pmePpCommStream_); + cudaStream_t stream; + cudaStreamCreate(&stream); + pmePpCommStream_.setStream(stream); } PmePpCommGpu::Impl::~Impl() = default; @@ -98,7 +100,7 @@ void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSi // Pull force data from remote GPU void* pmeForcePtr = receivePmeForceToGpu ? static_cast(d_pmeForces_) : recvPtr; cudaError_t stat = cudaMemcpyAsync(pmeForcePtr, remotePmeFBuffer_, recvSize * DIM * sizeof(float), - cudaMemcpyDefault, pmePpCommStream_); + cudaMemcpyDefault, pmePpCommStream_.stream()); CU_RET_ERR(stat, "cudaMemcpyAsync on Recv from PME CUDA direct data transfer failed"); if (receivePmeForceToGpu) @@ -112,7 +114,7 @@ void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSi { // Ensure CPU waits for PME forces to be copied before reducing // them with other forces on the CPU - cudaStreamSynchronize(pmePpCommStream_); + cudaStreamSynchronize(pmePpCommStream_.stream()); } #else GMX_UNUSED_VALUE(recvPtr); @@ -131,7 +133,7 @@ void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr, coordinatesReadyOnDeviceEvent->enqueueWaitEvent(pmePpCommStream_); cudaError_t stat = cudaMemcpyAsync(remotePmeXBuffer_, sendPtr, sendSize * DIM * sizeof(float), - cudaMemcpyDefault, pmePpCommStream_); + cudaMemcpyDefault, pmePpCommStream_.stream()); CU_RET_ERR(stat, "cudaMemcpyAsync on Send to PME CUDA direct data transfer failed"); // Record and send event to allow PME task to sync to above transfer before commencing force calculations diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h index c791ea5b40..934b7c40c6 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h @@ -119,7 +119,7 @@ private: //! Device context object const DeviceContext& deviceContext_; //! CUDA stream used for the communication operations in this class - cudaStream_t pmePpCommStream_ = nullptr; + DeviceStream pmePpCommStream_; //! Remote location of PME coordinate data buffer void* remotePmeXBuffer_ = nullptr; //! 
Remote location of PME force data buffer
diff --git a/src/gromacs/gpu_utils/CMakeLists.txt b/src/gromacs/gpu_utils/CMakeLists.txt
index 8672e450ca..ce70d0b049 100644
--- a/src/gromacs/gpu_utils/CMakeLists.txt
+++ b/src/gromacs/gpu_utils/CMakeLists.txt
@@ -45,6 +45,7 @@ gmx_add_libgromacs_sources(
 if(GMX_USE_OPENCL)
     gmx_add_libgromacs_sources(
         device_context_ocl.cpp
+        device_stream_ocl.cpp
         gpu_utils_ocl.cpp
         ocl_compiler.cpp
         ocl_caching.cpp
@@ -53,10 +54,15 @@ if(GMX_USE_OPENCL)
 elseif(GMX_USE_CUDA)
     gmx_add_libgromacs_sources(
         cudautils.cu
+        device_stream.cu
         gpu_utils.cu
         pinning.cu
         pmalloc_cuda.cu
         )
+else()
+    gmx_add_libgromacs_sources(
+        device_stream.cpp
+        )
 endif()
 
 if (BUILD_TESTING)
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh
index 71d9b7dac4..48212bf3bc 100644
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -216,25 +216,15 @@ static inline void rvec_inc(rvec a, const float3 b)
     rvec_inc(a, tmp);
 }
 
-/*! \brief Wait for all taks in stream \p s to complete.
- *
- * \param[in] s stream to synchronize with
- */
-static inline void gpuStreamSynchronize(cudaStream_t s)
-{
-    cudaError_t stat = cudaStreamSynchronize(s);
-    CU_RET_ERR(stat, "cudaStreamSynchronize failed");
-}
-
 /*! \brief Returns true if all tasks in \p s have completed.
  *
- * \param[in] s stream to check
+ * \param[in] deviceStream CUDA stream to check.
  *
- * \returns True if all tasks enqueued in the stream \p s (at the time of this call) have completed.
+ * \returns True if all tasks enqueued in the stream \p deviceStream (at the time of this call) have completed.
  */
-static inline bool haveStreamTasksCompleted(cudaStream_t s)
+static inline bool haveStreamTasksCompleted(const DeviceStream& deviceStream)
 {
-    cudaError_t stat = cudaStreamQuery(s);
+    cudaError_t stat = cudaStreamQuery(deviceStream.stream());
 
     if (stat == cudaErrorNotReady)
     {
diff --git a/src/gromacs/gpu_utils/device_stream.cpp b/src/gromacs/gpu_utils/device_stream.cpp
new file mode 100644
index 0000000000..1b5b016fba
--- /dev/null
+++ b/src/gromacs/gpu_utils/device_stream.cpp
@@ -0,0 +1,58 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the DeviceStream stub for CPU-only builds.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ */
+#include "gmxpre.h"
+
+#include "device_stream.h"
+
+DeviceStream::DeviceStream() = default;
+
+void DeviceStream::init(const DeviceInformation& /* deviceInfo */,
+                        const DeviceContext& /* deviceContext */,
+                        DeviceStreamPriority /* priority */,
+                        const bool /* useTiming */)
+{
+}
+
+DeviceStream::~DeviceStream() = default;
+
+void DeviceStream::synchronize() const {}
\ No newline at end of file
diff --git a/src/gromacs/gpu_utils/device_stream.cu b/src/gromacs/gpu_utils/device_stream.cu
new file mode 100644
index 0000000000..8d0b484846
--- /dev/null
+++ b/src/gromacs/gpu_utils/device_stream.cu
@@ -0,0 +1,122 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the DeviceStream for CUDA.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ */
+#include "gmxpre.h"
+
+#include "device_stream.h"
+
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/stringutil.h"
+
+DeviceStream::DeviceStream()
+{
+    stream_ = nullptr;
+}
+
+void DeviceStream::init(const DeviceInformation& /* deviceInfo */,
+                        const DeviceContext& /* deviceContext */,
+                        DeviceStreamPriority priority,
+                        const bool /* useTiming */)
+{
+    cudaError_t stat;
+
+    if (priority == DeviceStreamPriority::Normal)
+    {
+        stat = cudaStreamCreate(&stream_);
+        if (stat != cudaSuccess)
+        {
+            GMX_THROW(gmx::InternalError(gmx::formatString(
+                    "Could not create CUDA stream (CUDA error %d: %s).", stat, cudaGetErrorString(stat))));
+        }
+    }
+    else if (priority == DeviceStreamPriority::High)
+    {
+        // Note that the device we're running on does not have to
+        // support priorities, because we are querying the priority
+        // range, which in that case will be a single value.
+        int highestPriority;
+        stat = cudaDeviceGetStreamPriorityRange(nullptr, &highestPriority);
+        if (stat != cudaSuccess)
+        {
+            GMX_THROW(gmx::InternalError(gmx::formatString(
+                    "Could not query CUDA stream priority range (CUDA error %d: %s).", stat,
+                    cudaGetErrorString(stat))));
+        }
+
+        stat = cudaStreamCreateWithPriority(&stream_, cudaStreamDefault, highestPriority);
+        if (stat != cudaSuccess)
+        {
+            GMX_THROW(gmx::InternalError(gmx::formatString(
+                    "Could not create CUDA stream with high priority (CUDA error %d: %s).", stat,
+                    cudaGetErrorString(stat))));
+        }
+    }
+}
+
+DeviceStream::~DeviceStream()
+{
+    if (stream_)
+    {
+        cudaError_t stat = cudaStreamDestroy(stream_);
+        GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                           gmx::formatString("Failed to release CUDA stream (CUDA error %d: %s).",
+                                             stat, cudaGetErrorString(stat))
+                                   .c_str());
+        stream_ = nullptr;
+    }
+}
+
+cudaStream_t DeviceStream::stream() const
+{
+    return stream_;
+}
+
+void DeviceStream::synchronize() const
+{
+    cudaError_t stat = cudaStreamSynchronize(stream_);
+    GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                       gmx::formatString("cudaStreamSynchronize failed (CUDA error %d: %s).", stat,
+                                         cudaGetErrorString(stat))
+                               .c_str());
+}
\ No newline at end of file
diff --git a/src/gromacs/gpu_utils/device_stream.h b/src/gromacs/gpu_utils/device_stream.h
new file mode 100644
index 0000000000..2e654e529b
--- /dev/null
+++ b/src/gromacs/gpu_utils/device_stream.h
@@ -0,0 +1,134 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_GPU_UTILS_DEVICE_STREAM_H
+#define GMX_GPU_UTILS_DEVICE_STREAM_H
+
+/*! \libinternal \file
+ *
+ * \brief Declarations for DeviceStream class.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ * \inlibraryapi
+ */
+
+#include "config.h"
+
+#if GMX_GPU == GMX_GPU_OPENCL
+#    include "gromacs/gpu_utils/gmxopencl.h"
+#endif
+#include "gromacs/utility/classhelpers.h"
+
+struct DeviceInformation;
+class DeviceContext;
+
+//! Enumeration describing the priority with which a stream operates.
+enum class DeviceStreamPriority : int
+{
+    //! High-priority stream
+    High,
+    //! Normal-priority stream
+    Normal,
+    //! Conventional termination of the enumeration
+    Count
+};
+
+//! Platform-agnostic GPU stream wrapper (CUDA stream or OpenCL command queue).
+class DeviceStream
+{
+public:
+    //! Default constructor
+    DeviceStream();
+    //! Destructor
+    ~DeviceStream();
+
+    /*! \brief Initialize
+     *
+     * \param[in] deviceInfo     Platform-specific device information (only used in OpenCL).
+     * \param[in] deviceContext  Device context (only used in OpenCL).
+     * \param[in] priority       Stream priority: high or normal (only used in CUDA).
+     * \param[in] useTiming      Whether timing should be enabled (only used in OpenCL).
+     */
+    void init(const DeviceInformation& deviceInfo,
+              const DeviceContext&     deviceContext,
+              DeviceStreamPriority     priority,
+              const bool               useTiming);
+
+    /*! \brief Construct and init.
+     *
+     * \param[in] deviceInfo     Platform-specific device information (only used in OpenCL).
+     * \param[in] deviceContext  Device context (only used in OpenCL).
+     * \param[in] priority       Stream priority: high or normal (only used in CUDA).
+     * \param[in] useTiming      Whether timing should be enabled (only used in OpenCL).
+     */
+    DeviceStream(const DeviceInformation& deviceInfo,
+                 const DeviceContext&     deviceContext,
+                 DeviceStreamPriority     priority,
+                 const bool               useTiming)
+    {
+        init(deviceInfo, deviceContext, priority, useTiming);
+    }
+
+    //! Synchronize the stream
+    void synchronize() const;
+
+#if GMX_GPU == GMX_GPU_CUDA
+
+    //! Getter
+    cudaStream_t stream() const;
+    //! Setter (temporary, will be removed in the follow-up)
+    void setStream(cudaStream_t stream) { stream_ = stream; }
+
+private:
+    cudaStream_t stream_ = nullptr;
+
+#elif GMX_GPU == GMX_GPU_OPENCL
+
+    //! Getter
+    cl_command_queue stream() const;
+    //! Setter (temporary, will be removed in the follow-up)
+    void setStream(cl_command_queue stream) { stream_ = stream; }
+
+private:
+    cl_command_queue stream_ = nullptr;
+
+#endif
+
+    GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceStream);
+};
+
+#endif // GMX_GPU_UTILS_DEVICE_STREAM_H
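Usage sketch for reviewers (illustrative only, not part of the patch): deviceInfo and deviceContext stand for the DeviceInformation and DeviceContext objects a caller already holds, and launchKernelWork() is a hypothetical helper that enqueues asynchronous GPU work.

    // Create a high-priority stream; on CUDA this maps to
    // cudaStreamCreateWithPriority(), on OpenCL to clCreateCommandQueue()
    // (the priority hint is ignored there).
    DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::High,
                              /* useTiming = */ false);

    launchKernelWork(deviceStream); // hypothetical: enqueues work on deviceStream

    // Block the host until everything submitted to this stream has completed
    // (cudaStreamSynchronize() or clFinish() under the hood).
    deviceStream.synchronize();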
diff --git a/src/gromacs/gpu_utils/device_stream_ocl.cpp b/src/gromacs/gpu_utils/device_stream_ocl.cpp
new file mode 100644
index 0000000000..013480aacf
--- /dev/null
+++ b/src/gromacs/gpu_utils/device_stream_ocl.cpp
@@ -0,0 +1,97 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the DeviceStream for OpenCL.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ */
+#include "gmxpre.h"
+
+#include "gromacs/gpu_utils/device_context_ocl.h"
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/gpu_utils/gputraits_ocl.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/stringutil.h"
+
+DeviceStream::DeviceStream()
+{
+    stream_ = nullptr;
+}
+
+void DeviceStream::init(const DeviceInformation& deviceInfo,
+                        const DeviceContext&     deviceContext,
+                        DeviceStreamPriority /* priority */,
+                        const bool useTiming)
+{
+    cl_command_queue_properties queueProperties = useTiming ? CL_QUEUE_PROFILING_ENABLE : 0;
+    cl_device_id                deviceId        = deviceInfo.oclDeviceId;
+    cl_int                      clError;
+    stream_ = clCreateCommandQueue(deviceContext.context(), deviceId, queueProperties, &clError);
+    if (clError != CL_SUCCESS)
+    {
+        GMX_THROW(gmx::InternalError(gmx::formatString(
+                "Failed to create OpenCL command queue on GPU %s (OpenCL error ID %d).",
+                deviceInfo.device_name, clError)));
+    }
+}
+
+DeviceStream::~DeviceStream()
+{
+    if (stream_)
+    {
+        cl_int clError = clReleaseCommandQueue(stream_);
+        GMX_RELEASE_ASSERT(
+                clError == CL_SUCCESS,
+                gmx::formatString("Failed to release OpenCL stream (OpenCL error ID %d).", clError).c_str());
+        stream_ = nullptr;
+    }
+}
+
+cl_command_queue DeviceStream::stream() const
+{
+    return stream_;
+}
+
+void DeviceStream::synchronize() const
+{
+    cl_int clError = clFinish(stream_);
+    GMX_RELEASE_ASSERT(
+            CL_SUCCESS == clError,
+            gmx::formatString("Error caught during clFinish (OpenCL error ID %d).", clError).c_str());
+}
\ No newline at end of file
diff --git a/src/gromacs/gpu_utils/devicebuffer.cuh b/src/gromacs/gpu_utils/devicebuffer.cuh
index 59255bfa93..d4bfe8c35b 100644
--- a/src/gromacs/gpu_utils/devicebuffer.cuh
+++ b/src/gromacs/gpu_utils/devicebuffer.cuh
@@ -96,7 +96,7 @@ void freeDeviceBuffer(DeviceBuffer* buffer)
  * \param[in] hostBuffer     Pointer to the raw host-side memory, also typed \p ValueType
  * \param[in] startingOffset Offset (in values) at the device-side buffer to copy into.
  * \param[in] numValues      Number of values to copy.
- * \param[in] stream         GPU stream to perform asynchronous copy in.
+ * \param[in] deviceStream   GPU stream to perform asynchronous copy in.
  * \param[in] transferKind   Copy type: synchronous or asynchronous.
  * \param[out] timingEvent   A dummy pointer to the H2D copy timing event to be filled in.
  *                           Not used in CUDA implementation.
@@ -106,7 +106,7 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
                         const ValueType*    hostBuffer,
                         size_t              startingOffset,
                         size_t              numValues,
-                        CommandStream       stream,
+                        const DeviceStream& deviceStream,
                         GpuApiCallBehavior  transferKind,
                         CommandEvent* /*timingEvent*/)
 {
@@ -125,7 +125,7 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
             GMX_ASSERT(isHostMemoryPinned(hostBuffer), "Source host buffer was not pinned for CUDA");
 
             stat = cudaMemcpyAsync(*((ValueType**)buffer) + startingOffset, hostBuffer, bytes,
-                                   cudaMemcpyHostToDevice, stream);
+                                   cudaMemcpyHostToDevice, deviceStream.stream());
             GMX_RELEASE_ASSERT(stat == cudaSuccess, "Asynchronous H2D copy failed");
 
             break;
@@ -150,7 +150,7 @@
  * \param[in] buffer         Pointer to the device-side buffer
  * \param[in] startingOffset Offset (in values) at the device-side buffer to copy from.
  * \param[in] numValues      Number of values to copy.
- * \param[in] stream         GPU stream to perform asynchronous copy in.
+ * \param[in] deviceStream   GPU stream to perform asynchronous copy in.
  * \param[in] transferKind   Copy type: synchronous or asynchronous.
  * \param[out] timingEvent   A dummy pointer to the H2D copy timing event to be filled in.
  *                           Not used in CUDA implementation.
@@ -160,7 +160,7 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, DeviceBuffer* buffer, size_t startingOffset, size_t numValues, - CommandStream stream, + const DeviceStream& deviceStream, GpuApiCallBehavior transferKind, CommandEvent* /*timingEvent*/) { @@ -175,7 +175,7 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, GMX_ASSERT(isHostMemoryPinned(hostBuffer), "Destination host buffer was not pinned for CUDA"); stat = cudaMemcpyAsync(hostBuffer, *((ValueType**)buffer) + startingOffset, bytes, - cudaMemcpyDeviceToHost, stream); + cudaMemcpyDeviceToHost, deviceStream.stream()); GMX_RELEASE_ASSERT(stat == cudaSuccess, "Asynchronous D2H copy failed"); break; @@ -196,16 +196,20 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, * \param[in,out] buffer Pointer to the device-side buffer * \param[in] startingOffset Offset (in values) at the device-side buffer to start clearing at. * \param[in] numValues Number of values to clear. - * \param[in] stream GPU stream. + * \param[in] deviceStream GPU stream. */ template -void clearDeviceBufferAsync(DeviceBuffer* buffer, size_t startingOffset, size_t numValues, CommandStream stream) +void clearDeviceBufferAsync(DeviceBuffer* buffer, + size_t startingOffset, + size_t numValues, + const DeviceStream& deviceStream) { GMX_ASSERT(buffer, "needs a buffer pointer"); const size_t bytes = numValues * sizeof(ValueType); const char pattern = 0; - cudaError_t stat = cudaMemsetAsync(*((ValueType**)buffer) + startingOffset, pattern, bytes, stream); + cudaError_t stat = cudaMemsetAsync(*((ValueType**)buffer) + startingOffset, pattern, bytes, + deviceStream.stream()); GMX_RELEASE_ASSERT(stat == cudaSuccess, "Couldn't clear the device buffer"); } diff --git a/src/gromacs/gpu_utils/devicebuffer_ocl.h b/src/gromacs/gpu_utils/devicebuffer_ocl.h index ee1adc1cce..05be260b89 100644 --- a/src/gromacs/gpu_utils/devicebuffer_ocl.h +++ b/src/gromacs/gpu_utils/devicebuffer_ocl.h @@ -108,7 +108,7 @@ void freeDeviceBuffer(DeviceBuffer* buffer) * \param[in] hostBuffer Pointer to the raw host-side memory, also typed \p ValueType * \param[in] startingOffset Offset (in values) at the device-side buffer to copy into. * \param[in] numValues Number of values to copy. - * \param[in] stream GPU stream to perform asynchronous copy in. + * \param[in] deviceStream GPU stream to perform asynchronous copy in. * \param[in] transferKind Copy type: synchronous or asynchronous. * \param[out] timingEvent A pointer to the H2D copy timing event to be filled in. 
* If the pointer is not null, the event can further be used @@ -119,7 +119,7 @@ void copyToDeviceBuffer(DeviceBuffer* buffer, const ValueType* hostBuffer, size_t startingOffset, size_t numValues, - CommandStream stream, + const DeviceStream& deviceStream, GpuApiCallBehavior transferKind, CommandEvent* timingEvent) { @@ -135,8 +135,8 @@ void copyToDeviceBuffer(DeviceBuffer* buffer, switch (transferKind) { case GpuApiCallBehavior::Async: - clError = clEnqueueWriteBuffer(stream, *buffer, CL_FALSE, offset, bytes, hostBuffer, 0, - nullptr, timingEvent); + clError = clEnqueueWriteBuffer(deviceStream.stream(), *buffer, CL_FALSE, offset, bytes, + hostBuffer, 0, nullptr, timingEvent); GMX_RELEASE_ASSERT( clError == CL_SUCCESS, gmx::formatString("Asynchronous H2D copy failed (OpenCL error %d: %s)", clError, @@ -145,8 +145,8 @@ void copyToDeviceBuffer(DeviceBuffer* buffer, break; case GpuApiCallBehavior::Sync: - clError = clEnqueueWriteBuffer(stream, *buffer, CL_TRUE, offset, bytes, hostBuffer, 0, - nullptr, timingEvent); + clError = clEnqueueWriteBuffer(deviceStream.stream(), *buffer, CL_TRUE, offset, bytes, + hostBuffer, 0, nullptr, timingEvent); GMX_RELEASE_ASSERT( clError == CL_SUCCESS, gmx::formatString("Synchronous H2D copy failed (OpenCL error %d: %s)", clError, @@ -168,7 +168,7 @@ void copyToDeviceBuffer(DeviceBuffer* buffer, * \param[in] buffer Pointer to the device-side buffer * \param[in] startingOffset Offset (in values) at the device-side buffer to copy from. * \param[in] numValues Number of values to copy. - * \param[in] stream GPU stream to perform asynchronous copy in. + * \param[in] deviceStream GPU stream to perform asynchronous copy in. * \param[in] transferKind Copy type: synchronous or asynchronous. * \param[out] timingEvent A pointer to the H2D copy timing event to be filled in. * If the pointer is not null, the event can further be used @@ -179,7 +179,7 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, DeviceBuffer* buffer, size_t startingOffset, size_t numValues, - CommandStream stream, + const DeviceStream& deviceStream, GpuApiCallBehavior transferKind, CommandEvent* timingEvent) { @@ -191,8 +191,8 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, switch (transferKind) { case GpuApiCallBehavior::Async: - clError = clEnqueueReadBuffer(stream, *buffer, CL_FALSE, offset, bytes, hostBuffer, 0, - nullptr, timingEvent); + clError = clEnqueueReadBuffer(deviceStream.stream(), *buffer, CL_FALSE, offset, bytes, + hostBuffer, 0, nullptr, timingEvent); GMX_RELEASE_ASSERT( clError == CL_SUCCESS, gmx::formatString("Asynchronous D2H copy failed (OpenCL error %d: %s)", clError, @@ -201,8 +201,8 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, break; case GpuApiCallBehavior::Sync: - clError = clEnqueueReadBuffer(stream, *buffer, CL_TRUE, offset, bytes, hostBuffer, 0, - nullptr, timingEvent); + clError = clEnqueueReadBuffer(deviceStream.stream(), *buffer, CL_TRUE, offset, bytes, + hostBuffer, 0, nullptr, timingEvent); GMX_RELEASE_ASSERT( clError == CL_SUCCESS, gmx::formatString("Synchronous D2H copy failed (OpenCL error %d: %s)", clError, @@ -221,10 +221,13 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, * \param[in,out] buffer Pointer to the device-side buffer * \param[in] startingOffset Offset (in values) at the device-side buffer to start clearing at. * \param[in] numValues Number of values to clear. - * \param[in] stream GPU stream. + * \param[in] deviceStream GPU stream. 
*/ template -void clearDeviceBufferAsync(DeviceBuffer* buffer, size_t startingOffset, size_t numValues, CommandStream stream) +void clearDeviceBufferAsync(DeviceBuffer* buffer, + size_t startingOffset, + size_t numValues, + const DeviceStream& deviceStream) { GMX_ASSERT(buffer, "needs a buffer pointer"); const size_t offset = startingOffset * sizeof(ValueType); @@ -233,8 +236,8 @@ void clearDeviceBufferAsync(DeviceBuffer* buffer, size_t startingOffs const cl_uint numWaitEvents = 0; const cl_event* waitEvents = nullptr; cl_event commandEvent; - cl_int clError = clEnqueueFillBuffer(stream, *buffer, &pattern, sizeof(pattern), offset, bytes, - numWaitEvents, waitEvents, &commandEvent); + cl_int clError = clEnqueueFillBuffer(deviceStream.stream(), *buffer, &pattern, sizeof(pattern), + offset, bytes, numWaitEvents, waitEvents, &commandEvent); GMX_RELEASE_ASSERT(clError == CL_SUCCESS, gmx::formatString("Couldn't clear the device buffer (OpenCL error %d: %s)", clError, ocl_get_error_string(clError).c_str()) diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer.cuh b/src/gromacs/gpu_utils/gpueventsynchronizer.cuh index e2e6ac8704..69487de586 100644 --- a/src/gromacs/gpu_utils/gpueventsynchronizer.cuh +++ b/src/gromacs/gpu_utils/gpueventsynchronizer.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -80,9 +80,9 @@ public: /*! \brief Marks the synchronization point in the \p stream. * Should be followed by waitForEvent(). */ - inline void markEvent(CommandStream stream) + inline void markEvent(const DeviceStream& deviceStream) { - cudaError_t gmx_used_in_debug stat = cudaEventRecord(event_, stream); + cudaError_t gmx_used_in_debug stat = cudaEventRecord(event_, deviceStream.stream()); GMX_ASSERT(stat == cudaSuccess, "cudaEventRecord failed"); } /*! \brief Synchronizes the host thread on the marked event. */ @@ -92,9 +92,9 @@ public: GMX_ASSERT(stat == cudaSuccess, "cudaEventSynchronize failed"); } /*! \brief Enqueues a wait for the recorded event in stream \p stream */ - inline void enqueueWaitEvent(CommandStream stream) + inline void enqueueWaitEvent(const DeviceStream& deviceStream) { - cudaError_t gmx_used_in_debug stat = cudaStreamWaitEvent(stream, event_, 0); + cudaError_t gmx_used_in_debug stat = cudaStreamWaitEvent(deviceStream.stream(), event_, 0); GMX_ASSERT(stat == cudaSuccess, "cudaStreamWaitEvent failed"); } diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h b/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h index b9298f385f..9a62b5b6d4 100644 --- a/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h +++ b/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -85,10 +85,10 @@ public: /*! \brief Marks the synchronization point in the \p stream. 
* Should be called first and then followed by waitForEvent(). */ - inline void markEvent(CommandStream stream) + inline void markEvent(const DeviceStream& deviceStream) { GMX_ASSERT(nullptr == event_, "Do not call markEvent more than once!"); - cl_int clError = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &event_); + cl_int clError = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &event_); if (CL_SUCCESS != clError) { GMX_THROW(gmx::InternalError("Failed to enqueue the GPU synchronization event: " @@ -112,9 +112,9 @@ public: * After enqueue, the associated event is released, so this method should * be only called once per markEvent() call. */ - inline void enqueueWaitEvent(CommandStream stream) + inline void enqueueWaitEvent(const DeviceStream& deviceStream) { - cl_int clError = clEnqueueBarrierWithWaitList(stream, 1, &event_, nullptr); + cl_int clError = clEnqueueBarrierWithWaitList(deviceStream.stream(), 1, &event_, nullptr); if (CL_SUCCESS != clError) { GMX_THROW(gmx::InternalError("Failed to enqueue device barrier for the GPU event: " diff --git a/src/gromacs/gpu_utils/gpuregiontimer.cuh b/src/gromacs/gpu_utils/gpuregiontimer.cuh index 52e36c038a..c56d60da61 100644 --- a/src/gromacs/gpu_utils/gpuregiontimer.cuh +++ b/src/gromacs/gpu_utils/gpuregiontimer.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -79,15 +79,17 @@ public: GpuRegionTimerImpl(GpuRegionTimerImpl&&) = delete; /*! \brief Will be called before the region start. */ - inline void openTimingRegion(CommandStream s) + inline void openTimingRegion(const DeviceStream& deviceStream) { - CU_RET_ERR(cudaEventRecord(eventStart_, s), "GPU timing recording failure"); + CU_RET_ERR(cudaEventRecord(eventStart_, deviceStream.stream()), + "GPU timing recording failure"); } /*! \brief Will be called after the region end. */ - inline void closeTimingRegion(CommandStream s) + inline void closeTimingRegion(const DeviceStream& deviceStream) { - CU_RET_ERR(cudaEventRecord(eventStop_, s), "GPU timing recording failure"); + CU_RET_ERR(cudaEventRecord(eventStop_, deviceStream.stream()), + "GPU timing recording failure"); } /*! \brief Returns the last measured region timespan (in milliseconds) and calls reset() */ diff --git a/src/gromacs/gpu_utils/gpuregiontimer.h b/src/gromacs/gpu_utils/gpuregiontimer.h index 6798159ac9..f0860b164d 100644 --- a/src/gromacs/gpu_utils/gpuregiontimer.h +++ b/src/gromacs/gpu_utils/gpuregiontimer.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -84,9 +84,9 @@ public: /*! \brief * To be called before the region start. * - * \param[in] s The GPU command stream where the event being measured takes place. + * \param[in] deviceStream The GPU command stream where the event being measured takes place. 
*/ - void openTimingRegion(CommandStream s) + void openTimingRegion(const DeviceStream& deviceStream) { if (c_debugTimerState) { @@ -96,14 +96,14 @@ public: GMX_ASSERT(debugState_ == TimerState::Idle, error.c_str()); debugState_ = TimerState::Recording; } - impl_.openTimingRegion(s); + impl_.openTimingRegion(deviceStream); } /*! \brief * To be called after the region end. * - * \param[in] s The GPU command stream where the event being measured takes place. + * \param[in] deviceStream The GPU command stream where the event being measured takes place. */ - void closeTimingRegion(CommandStream s) + void closeTimingRegion(const DeviceStream& deviceStream) { if (c_debugTimerState) { @@ -114,7 +114,7 @@ public: debugState_ = TimerState::Stopped; } callCount_++; - impl_.closeTimingRegion(s); + impl_.closeTimingRegion(deviceStream); } /*! \brief * Accumulates the last timespan of all the events used into the total duration, diff --git a/src/gromacs/gpu_utils/gpuregiontimer_ocl.h b/src/gromacs/gpu_utils/gpuregiontimer_ocl.h index 3c1d9b2b84..788e41de5d 100644 --- a/src/gromacs/gpu_utils/gpuregiontimer_ocl.h +++ b/src/gromacs/gpu_utils/gpuregiontimer_ocl.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -82,9 +82,9 @@ public: GpuRegionTimerImpl(GpuRegionTimerImpl&&) = delete; /*! \brief Should be called before the region start. */ - inline void openTimingRegion(CommandStream /*unused*/) {} + inline void openTimingRegion(const DeviceStream& /*unused*/) {} /*! \brief Should be called after the region end. */ - inline void closeTimingRegion(CommandStream /*unused*/) {} + inline void closeTimingRegion(const DeviceStream& /*unused*/) {} /*! \brief Returns the last measured region timespan (in milliseconds) and calls reset(). */ inline double getLastRangeTime() { diff --git a/src/gromacs/gpu_utils/gputraits.cuh b/src/gromacs/gpu_utils/gputraits.cuh index b477cdcb4c..ec3424a8f4 100644 --- a/src/gromacs/gpu_utils/gputraits.cuh +++ b/src/gromacs/gpu_utils/gputraits.cuh @@ -45,6 +45,8 @@ * \ingroup module_gpu_utils */ +#include "gromacs/gpu_utils/device_stream.h" + /*! \brief CUDA device information. * * The CUDA device information is queried and set at detection and contains @@ -61,8 +63,6 @@ struct DeviceInformation int stat; }; -//! \brief GPU command stream -using CommandStream = cudaStream_t; //! \brief Single GPU call timing event - meaningless in CUDA using CommandEvent = void; @@ -73,10 +73,10 @@ using CommandEvent = void; */ struct KernelLaunchConfig { - size_t gridSize[3] = { 1, 1, 1 }; //!< Block counts - size_t blockSize[3] = { 1, 1, 1 }; //!< Per-block thread counts - size_t sharedMemorySize = 0; //!< Shared memory size in bytes - CommandStream stream = nullptr; //!< Stream to launch kernel in + size_t gridSize[3] = { 1, 1, 1 }; //!< Block counts + size_t blockSize[3] = { 1, 1, 1 }; //!< Per-block thread counts + size_t sharedMemorySize = 0; //!< Shared memory size in bytes + cudaStream_t stream = nullptr; //!< Stream to launch kernel in }; //! Sets whether device code can use arrays that are embedded in structs. 
diff --git a/src/gromacs/gpu_utils/gputraits.h b/src/gromacs/gpu_utils/gputraits.h index a36a5cc3bc..5fec00303a 100644 --- a/src/gromacs/gpu_utils/gputraits.h +++ b/src/gromacs/gpu_utils/gputraits.h @@ -63,8 +63,6 @@ struct DeviceInformation // No member needed }; -//! \brief GPU command stream -using CommandStream = void*; //! \brief Single GPU call timing event using CommandEvent = void*; diff --git a/src/gromacs/gpu_utils/gputraits_ocl.h b/src/gromacs/gpu_utils/gputraits_ocl.h index caf837552a..0438c084d1 100644 --- a/src/gromacs/gpu_utils/gputraits_ocl.h +++ b/src/gromacs/gpu_utils/gputraits_ocl.h @@ -79,8 +79,6 @@ struct DeviceInformation size_t maxWorkGroupSize; //!< Workgroup total size limit (CL_DEVICE_MAX_WORK_GROUP_SIZE). }; -//! \brief GPU command stream -using CommandStream = cl_command_queue; //! \brief Single GPU call timing event using CommandEvent = cl_event; @@ -91,10 +89,10 @@ using CommandEvent = cl_event; */ struct KernelLaunchConfig { - size_t gridSize[3] = { 1, 1, 1 }; //!< Work groups (CUDA blocks) counts - size_t blockSize[3] = { 1, 1, 1 }; //!< Per work group (CUDA block) thread counts - size_t sharedMemorySize = 0; //!< Shared memory size in bytes - CommandStream stream = nullptr; //!< Stream to launch kernel in + size_t gridSize[3] = { 1, 1, 1 }; //!< Work groups (CUDA blocks) counts + size_t blockSize[3] = { 1, 1, 1 }; //!< Per work group (CUDA block) thread counts + size_t sharedMemorySize = 0; //!< Shared memory size in bytes + cl_command_queue stream = nullptr; //!< Stream to launch kernel in }; /*! \brief Sets whether device code can use arrays that are embedded in structs. diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h index 90f5b04bfe..5e36d91ffd 100644 --- a/src/gromacs/gpu_utils/oclutils.h +++ b/src/gromacs/gpu_utils/oclutils.h @@ -45,6 +45,7 @@ #include #include "gromacs/gpu_utils/device_context.h" +#include "gromacs/gpu_utils/device_stream.h" #include "gromacs/gpu_utils/gmxopencl.h" #include "gromacs/gpu_utils/gputraits_ocl.h" #include "gromacs/utility/exceptions.h" @@ -130,17 +131,6 @@ void pfree(void* h_ptr); /*! \brief Convert error code to diagnostic string */ std::string ocl_get_error_string(cl_int error); -/*! \brief Calls clFinish() in the stream \p s. - * - * \param[in] s stream to synchronize with - */ -static inline void gpuStreamSynchronize(cl_command_queue s) -{ - cl_int cl_error = clFinish(s); - GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, - ("Error caught during clFinish:" + ocl_get_error_string(cl_error)).c_str()); -} - //! A debug checker to track cl_events being released correctly inline void ensureReferenceCount(const cl_event& event, unsigned int refCount) { @@ -156,11 +146,9 @@ inline void ensureReferenceCount(const cl_event& event, unsigned int refCount) /*! \brief Pretend to synchronize an OpenCL stream (dummy implementation). * - * \param[in] s queue to check - * - * \returns True if all tasks enqueued in the stream \p s (at the time of this call) have completed. + * \returns Not implemented in OpenCL. 
*/ -static inline bool haveStreamTasksCompleted(cl_command_queue gmx_unused s) +static inline bool haveStreamTasksCompleted(const DeviceStream& /* deviceStream */) { GMX_RELEASE_ASSERT(false, "haveStreamTasksCompleted is not implemented for OpenCL"); return false; diff --git a/src/gromacs/gpu_utils/tests/typecasts_runner.cu b/src/gromacs/gpu_utils/tests/typecasts_runner.cu index e16dd8ebf5..682035bb5f 100644 --- a/src/gromacs/gpu_utils/tests/typecasts_runner.cu +++ b/src/gromacs/gpu_utils/tests/typecasts_runner.cu @@ -112,12 +112,13 @@ void convertRVecToFloat3OnDevice(std::vector& h_rVecOutput, const std { DeviceInformation deviceInfo; const DeviceContext deviceContext(deviceInfo); + const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false); const int numElements = h_rVecInput.size(); DeviceBuffer d_rVecInput; allocateDeviceBuffer(&d_rVecInput, numElements, deviceContext); - copyToDeviceBuffer(&d_rVecInput, h_rVecInput.data(), 0, numElements, nullptr, + copyToDeviceBuffer(&d_rVecInput, h_rVecInput.data(), 0, numElements, deviceStream, GpuApiCallBehavior::Sync, nullptr); DeviceBuffer d_float3Output; @@ -131,14 +132,14 @@ void convertRVecToFloat3OnDevice(std::vector& h_rVecOutput, const std kernelLaunchConfig.blockSize[1] = 1; kernelLaunchConfig.blockSize[2] = 1; kernelLaunchConfig.sharedMemorySize = 0; - kernelLaunchConfig.stream = nullptr; + kernelLaunchConfig.stream = deviceStream.stream(); auto kernelPtr = convertRVecToFloat3OnDevice_kernel; const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, kernelLaunchConfig, &d_float3Output, &d_rVecInput, &numElements); launchGpuKernel(kernelPtr, kernelLaunchConfig, nullptr, "convertRVecToFloat3OnDevice_kernel", kernelArgs); - copyFromDeviceBuffer(h_float3Output.data(), &d_float3Output, 0, numElements, nullptr, + copyFromDeviceBuffer(h_float3Output.data(), &d_float3Output, 0, numElements, deviceStream, GpuApiCallBehavior::Sync, nullptr); saveFloat3InRVecFormat(h_rVecOutput, h_float3Output.data(), numElements); diff --git a/src/gromacs/listed_forces/gpubonded.h b/src/gromacs/listed_forces/gpubonded.h index b1c69d4572..e2c114ce4b 100644 --- a/src/gromacs/listed_forces/gpubonded.h +++ b/src/gromacs/listed_forces/gpubonded.h @@ -56,6 +56,8 @@ #include "gromacs/utility/classhelpers.h" class DeviceContext; +class DeviceStream; + struct gmx_enerdata_t; struct gmx_ffparams_t; struct gmx_mtop_t; @@ -109,7 +111,7 @@ public: //! Construct the manager with constant data and the stream to use. GpuBonded(const gmx_ffparams_t& ffparams, const DeviceContext& deviceContext, - void* streamPtr, + const DeviceStream& deviceStream, gmx_wallcycle* wcycle); //! 
Destructor ~GpuBonded(); diff --git a/src/gromacs/listed_forces/gpubonded_impl.cpp b/src/gromacs/listed_forces/gpubonded_impl.cpp index f24103229b..ec8e43323e 100644 --- a/src/gromacs/listed_forces/gpubonded_impl.cpp +++ b/src/gromacs/listed_forces/gpubonded_impl.cpp @@ -162,7 +162,7 @@ class GpuBonded::Impl GpuBonded::GpuBonded(const gmx_ffparams_t& /* ffparams */, const DeviceContext& /* deviceContext */, - void* /*streamPtr */, + const DeviceStream& /* deviceStream */, gmx_wallcycle* /* wcycle */) : impl_(nullptr) { diff --git a/src/gromacs/listed_forces/gpubonded_impl.cu b/src/gromacs/listed_forces/gpubonded_impl.cu index 763550c5c9..0d5367f698 100644 --- a/src/gromacs/listed_forces/gpubonded_impl.cu +++ b/src/gromacs/listed_forces/gpubonded_impl.cu @@ -66,22 +66,22 @@ namespace gmx GpuBonded::Impl::Impl(const gmx_ffparams_t& ffparams, const DeviceContext& deviceContext, - void* streamPtr, + const DeviceStream& deviceStream, gmx_wallcycle* wcycle) : - deviceContext_(deviceContext) + deviceContext_(deviceContext), + deviceStream_(deviceStream) { - stream_ = *static_cast(streamPtr); wcycle_ = wcycle; allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), deviceContext_); // This could be an async transfer (if the source is pinned), so // long as it uses the same stream as the kernels and we are happy // to consume additional pinned pages. - copyToDeviceBuffer(&d_forceParams_, ffparams.iparams.data(), 0, ffparams.numTypes(), stream_, - GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_forceParams_, ffparams.iparams.data(), 0, ffparams.numTypes(), + deviceStream_, GpuApiCallBehavior::Sync, nullptr); vTot_.resize(F_NRE); allocateDeviceBuffer(&d_vTot_, F_NRE, deviceContext_); - clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_); + clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream); kernelParams_.d_forceParams = d_forceParams_; kernelParams_.d_xq = d_xq_; @@ -212,7 +212,7 @@ void GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef reallocateDeviceBuffer(&d_iList.iatoms, iList.size(), &d_iList.nr, &d_iList.nalloc, deviceContext_); - copyToDeviceBuffer(&d_iList.iatoms, iList.iatoms.data(), 0, iList.size(), stream_, + copyToDeviceBuffer(&d_iList.iatoms, iList.iatoms.data(), 0, iList.size(), deviceStream_, GpuApiCallBehavior::Async, nullptr); } kernelParams_.fTypesOnGpu[fTypesCounter] = fType; @@ -270,7 +270,7 @@ void GpuBonded::Impl::launchEnergyTransfer() wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_BONDED); // TODO add conditional on whether there has been any compute (and make sure host buffer doesn't contain garbage) float* h_vTot = vTot_.data(); - copyFromDeviceBuffer(h_vTot, &d_vTot_, 0, F_NRE, stream_, GpuApiCallBehavior::Async, nullptr); + copyFromDeviceBuffer(h_vTot, &d_vTot_, 0, F_NRE, deviceStream_, GpuApiCallBehavior::Async, nullptr); wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED); } @@ -281,7 +281,7 @@ void GpuBonded::Impl::waitAccumulateEnergyTerms(gmx_enerdata_t* enerd) "accumulation should not occur"); wallcycle_start(wcycle_, ewcWAIT_GPU_BONDED); - cudaError_t stat = cudaStreamSynchronize(stream_); + cudaError_t stat = cudaStreamSynchronize(deviceStream_.stream()); CU_RET_ERR(stat, "D2H transfer of bonded energies failed"); wallcycle_stop(wcycle_, ewcWAIT_GPU_BONDED); @@ -304,7 +304,7 @@ void GpuBonded::Impl::clearEnergies() { wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_BONDED); - clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_); + clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, 
deviceStream_); wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED); wallcycle_stop(wcycle_, ewcLAUNCH_GPU); } @@ -313,9 +313,9 @@ void GpuBonded::Impl::clearEnergies() GpuBonded::GpuBonded(const gmx_ffparams_t& ffparams, const DeviceContext& deviceContext, - void* streamPtr, + const DeviceStream& deviceStream, gmx_wallcycle* wcycle) : - impl_(new Impl(ffparams, deviceContext, streamPtr, wcycle)) + impl_(new Impl(ffparams, deviceContext, deviceStream, wcycle)) { } diff --git a/src/gromacs/listed_forces/gpubonded_impl.h b/src/gromacs/listed_forces/gpubonded_impl.h index a0da918893..32cce2599a 100644 --- a/src/gromacs/listed_forces/gpubonded_impl.h +++ b/src/gromacs/listed_forces/gpubonded_impl.h @@ -126,7 +126,10 @@ class GpuBonded::Impl { public: //! Constructor - Impl(const gmx_ffparams_t& ffparams, const DeviceContext& deviceContext, void* streamPtr, gmx_wallcycle* wcycle); + Impl(const gmx_ffparams_t& ffparams, + const DeviceContext& deviceContext, + const DeviceStream& deviceStream, + gmx_wallcycle* wcycle); /*! \brief Destructor, non-default needed for freeing * device-side buffers */ ~Impl(); @@ -183,7 +186,7 @@ private: //! GPU context object const DeviceContext& deviceContext_; //! \brief Bonded GPU stream, not owned by this module - CommandStream stream_; + const DeviceStream& deviceStream_; //! Parameters and pointers, passed to the CUDA kernel BondedCudaKernelParameters kernelParams_; diff --git a/src/gromacs/listed_forces/gpubondedkernels.cu b/src/gromacs/listed_forces/gpubondedkernels.cu index 166e5bb87d..e03a3f1fa1 100644 --- a/src/gromacs/listed_forces/gpubondedkernels.cu +++ b/src/gromacs/listed_forces/gpubondedkernels.cu @@ -850,7 +850,7 @@ void GpuBonded::Impl::launchKernel(const t_forcerec* fr, const matrix box) config.gridSize[0] = (fTypeRangeEnd + TPB_BONDED) / TPB_BONDED; config.gridSize[1] = 1; config.gridSize[2] = 1; - config.stream = stream_; + config.stream = deviceStream_.stream(); auto kernelPtr = exec_kernel_gpu; kernelParams_.scaleFactor = fr->ic->epsfac * fr->fudgeQQ; diff --git a/src/gromacs/mdlib/leapfrog_gpu.cu b/src/gromacs/mdlib/leapfrog_gpu.cu index b77162c1af..a7e19c922d 100644 --- a/src/gromacs/mdlib/leapfrog_gpu.cu +++ b/src/gromacs/mdlib/leapfrog_gpu.cu @@ -287,7 +287,7 @@ void LeapFrogGpu::integrate(const float3* d_x, h_lambdas_[i] = tcstat[i].lambda; } copyToDeviceBuffer(&d_lambdas_, h_lambdas_.data(), 0, numTempScaleValues_, - commandStream_, GpuApiCallBehavior::Async, nullptr); + deviceStream_, GpuApiCallBehavior::Async, nullptr); } VelocityScalingType prVelocityScalingType = VelocityScalingType::None; if (doParrinelloRahman) @@ -316,9 +316,9 @@ void LeapFrogGpu::integrate(const float3* d_x, return; } -LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext, CommandStream commandStream) : +LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext, const DeviceStream& deviceStream) : deviceContext_(deviceContext), - commandStream_(commandStream) + deviceStream_(deviceStream) { numAtoms_ = 0; @@ -328,7 +328,7 @@ LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext, CommandStream comma kernelLaunchConfig_.blockSize[1] = 1; kernelLaunchConfig_.blockSize[2] = 1; kernelLaunchConfig_.sharedMemorySize = 0; - kernelLaunchConfig_.stream = commandStream_; + kernelLaunchConfig_.stream = deviceStream_.stream(); } LeapFrogGpu::~LeapFrogGpu() @@ -345,7 +345,7 @@ void LeapFrogGpu::set(const t_mdatoms& md, const int numTempScaleValues, const u reallocateDeviceBuffer(&d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, 
deviceContext_);
- copyToDeviceBuffer(&d_inverseMasses_, (float*)md.invmass, 0, numAtoms_, commandStream_,
+ copyToDeviceBuffer(&d_inverseMasses_, (float*)md.invmass, 0, numAtoms_, deviceStream_,
GpuApiCallBehavior::Sync, nullptr);

// Temperature scale group map only used if there are more than one group
@@ -353,7 +353,7 @@ void LeapFrogGpu::set(const t_mdatoms& md, const int numTempScaleValues, const u
{
reallocateDeviceBuffer(&d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_,
&numTempScaleGroupsAlloc_, deviceContext_);
- copyToDeviceBuffer(&d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, commandStream_,
+ copyToDeviceBuffer(&d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, deviceStream_,
GpuApiCallBehavior::Sync, nullptr);
}

diff --git a/src/gromacs/mdlib/leapfrog_gpu.cuh b/src/gromacs/mdlib/leapfrog_gpu.cuh index 26a6fc7399..6097a9a87e 100644 --- a/src/gromacs/mdlib/leapfrog_gpu.cuh +++ b/src/gromacs/mdlib/leapfrog_gpu.cuh
@@ -64,9 +64,9 @@ public:
/*! \brief Constructor.
*
* \param[in] deviceContext Device context (dummy in CUDA).
- * \param[in] commandStream Device command stream to use.
+ * \param[in] deviceStream Device stream to use.
*/
- LeapFrogGpu(const DeviceContext& deviceContext, CommandStream commandStream);
+ LeapFrogGpu(const DeviceContext& deviceContext, const DeviceStream& deviceStream);
~LeapFrogGpu();

/*! \brief Integrate
*
@@ -115,7 +115,7 @@ private:
//! GPU context object
const DeviceContext& deviceContext_;
//! GPU stream
- CommandStream commandStream_;
+ const DeviceStream& deviceStream_;
//! GPU kernel launch config
KernelLaunchConfig kernelLaunchConfig_;
//! Number of atoms

diff --git a/src/gromacs/mdlib/lincs_gpu.cu b/src/gromacs/mdlib/lincs_gpu.cu index 03c1bd1d15..61bc717852 100644 --- a/src/gromacs/mdlib/lincs_gpu.cu +++ b/src/gromacs/mdlib/lincs_gpu.cu
@@ -447,7 +447,7 @@ void LincsGpu::apply(const float3* d_x,
{
// Fill with zeros so the values can be reduced to it
// Only 6 values are needed because virial is symmetrical
- clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, commandStream_);
+ clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, deviceStream_);
}

auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial);
@@ -475,7 +475,7 @@ void LincsGpu::apply(const float3* d_x,
{
config.sharedMemorySize = c_threadsPerBlock * 3 * sizeof(float);
}
- config.stream = commandStream_;
+ config.stream = deviceStream_.stream();

kernelParams_.pbcAiuc = pbcAiuc;

@@ -488,7 +488,7 @@ void LincsGpu::apply(const float3* d_x,
{
// Copy LINCS virial data and add it to the common virial
copyFromDeviceBuffer(h_virialScaled_.data(), &kernelParams_.d_virialScaled, 0, 6,
- commandStream_, GpuApiCallBehavior::Sync, nullptr);
+ deviceStream_, GpuApiCallBehavior::Sync, nullptr);

// Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object
virialScaled[XX][XX] += h_virialScaled_[0];
@@ -510,9 +510,9 @@ void LincsGpu::apply(const float3* d_x,

LincsGpu::LincsGpu(int numIterations,
int expansionOrder,
const DeviceContext& deviceContext,
- CommandStream commandStream) :
+ const DeviceStream& deviceStream) :
deviceContext_(deviceContext),
- commandStream_(commandStream)
+ deviceStream_(deviceStream)
{
kernelParams_.numIterations = numIterations;
kernelParams_.expansionOrder = expansionOrder;
@@ -943,23 +943,23 @@ void LincsGpu::set(const InteractionDefinitions& idef, const t_mdatoms& md)

// Copy data to GPU.
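// Note: the uploads below are synchronous (GpuApiCallBehavior::Sync) and, after
// this change, take the DeviceStream wrapper by const reference instead of a
// raw cudaStream_t, matching the copyToDeviceBuffer signature introduced above.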
copyToDeviceBuffer(&kernelParams_.d_constraints, constraintsHost.data(), 0, - kernelParams_.numConstraintsThreads, commandStream_, - GpuApiCallBehavior::Sync, nullptr); + kernelParams_.numConstraintsThreads, deviceStream_, GpuApiCallBehavior::Sync, + nullptr); copyToDeviceBuffer(&kernelParams_.d_constraintsTargetLengths, constraintsTargetLengthsHost.data(), 0, kernelParams_.numConstraintsThreads, - commandStream_, GpuApiCallBehavior::Sync, nullptr); + deviceStream_, GpuApiCallBehavior::Sync, nullptr); copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts, coupledConstraintsCountsHost.data(), 0, kernelParams_.numConstraintsThreads, - commandStream_, GpuApiCallBehavior::Sync, nullptr); + deviceStream_, GpuApiCallBehavior::Sync, nullptr); copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices, coupledConstraintsIndicesHost.data(), 0, maxCoupledConstraints * kernelParams_.numConstraintsThreads, - commandStream_, GpuApiCallBehavior::Sync, nullptr); + deviceStream_, GpuApiCallBehavior::Sync, nullptr); copyToDeviceBuffer(&kernelParams_.d_massFactors, massFactorsHost.data(), 0, - maxCoupledConstraints * kernelParams_.numConstraintsThreads, commandStream_, + maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceStream_, GpuApiCallBehavior::Sync, nullptr); GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of atoms should be specified.\n"); - copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass, 0, numAtoms, commandStream_, + copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass, 0, numAtoms, deviceStream_, GpuApiCallBehavior::Sync, nullptr); } diff --git a/src/gromacs/mdlib/lincs_gpu.cuh b/src/gromacs/mdlib/lincs_gpu.cuh index 4817573b80..ef03516431 100644 --- a/src/gromacs/mdlib/lincs_gpu.cuh +++ b/src/gromacs/mdlib/lincs_gpu.cuh @@ -105,9 +105,12 @@ public: * \param[in] numIterations Number of iteration for the correction of the projection. * \param[in] expansionOrder Order of the matrix inversion algorithm. * \param[in] deviceContext Device context (dummy in CUDA). - * \param[in] commandStream Device command stream. + * \param[in] deviceStream Device command stream. */ - LincsGpu(int numIterations, int expansionOrder, const DeviceContext& deviceContext, CommandStream commandStream); + LincsGpu(int numIterations, + int expansionOrder, + const DeviceContext& deviceContext, + const DeviceStream& deviceStream); /*! \brief Destructor.*/ ~LincsGpu(); @@ -172,7 +175,7 @@ private: //! GPU context object const DeviceContext& deviceContext_; //! GPU stream - CommandStream commandStream_; + const DeviceStream& deviceStream_; //! 
Parameters and pointers, passed to the GPU kernel LincsGpuKernelParameters kernelParams_; diff --git a/src/gromacs/mdlib/settle_gpu.cu b/src/gromacs/mdlib/settle_gpu.cu index 20933baf96..76daf34c1a 100644 --- a/src/gromacs/mdlib/settle_gpu.cu +++ b/src/gromacs/mdlib/settle_gpu.cu @@ -434,7 +434,7 @@ void SettleGpu::apply(const float3* d_x, { // Fill with zeros so the values can be reduced to it // Only 6 values are needed because virial is symmetrical - clearDeviceBufferAsync(&d_virialScaled_, 0, 6, commandStream_); + clearDeviceBufferAsync(&d_virialScaled_, 0, 6, deviceStream_); } auto kernelPtr = getSettleKernelPtr(updateVelocities, computeVirial); @@ -455,7 +455,7 @@ void SettleGpu::apply(const float3* d_x, { config.sharedMemorySize = 0; } - config.stream = commandStream_; + config.stream = deviceStream_.stream(); const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, &numSettles_, &d_atomIds_, &settleParameters_, &d_x, &d_xp, &invdt, &d_v, @@ -465,7 +465,7 @@ void SettleGpu::apply(const float3* d_x, if (computeVirial) { - copyFromDeviceBuffer(h_virialScaled_.data(), &d_virialScaled_, 0, 6, commandStream_, + copyFromDeviceBuffer(h_virialScaled_.data(), &d_virialScaled_, 0, 6, deviceStream_, GpuApiCallBehavior::Sync, nullptr); // Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object @@ -485,9 +485,9 @@ void SettleGpu::apply(const float3* d_x, return; } -SettleGpu::SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, CommandStream commandStream) : +SettleGpu::SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, const DeviceStream& deviceStream) : deviceContext_(deviceContext), - commandStream_(commandStream) + deviceStream_(deviceStream) { static_assert(sizeof(real) == sizeof(float), "Real numbers should be in single precision in GPU code."); @@ -622,7 +622,7 @@ void SettleGpu::set(const InteractionDefinitions& idef, const t_mdatoms gmx_unus settler.z = iatoms[i * nral1 + 3]; // Second hydrogen index h_atomIds_.at(i) = settler; } - copyToDeviceBuffer(&d_atomIds_, h_atomIds_.data(), 0, numSettles_, commandStream_, + copyToDeviceBuffer(&d_atomIds_, h_atomIds_.data(), 0, numSettles_, deviceStream_, GpuApiCallBehavior::Sync, nullptr); } diff --git a/src/gromacs/mdlib/settle_gpu.cuh b/src/gromacs/mdlib/settle_gpu.cuh index da8bafd8df..24584f7a4b 100644 --- a/src/gromacs/mdlib/settle_gpu.cuh +++ b/src/gromacs/mdlib/settle_gpu.cuh @@ -202,9 +202,9 @@ public: * target O-H and H-H distances. These values are also checked for * consistency. * \param[in] deviceContext Device context (dummy in CUDA). - * \param[in] commandStream Device stream to use. + * \param[in] deviceStream Device stream to use. */ - SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, CommandStream commandStream); + SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, const DeviceStream& deviceStream); ~SettleGpu(); @@ -255,7 +255,7 @@ private: //! GPU context object const DeviceContext& deviceContext_; //! GPU stream - CommandStream commandStream_; + const DeviceStream& deviceStream_; //! 
Scaled virial tensor (9 reals, GPU) std::vector h_virialScaled_; diff --git a/src/gromacs/mdlib/tests/constrtestrunners.cu b/src/gromacs/mdlib/tests/constrtestrunners.cu index 5c0a007ee4..00672af606 100644 --- a/src/gromacs/mdlib/tests/constrtestrunners.cu +++ b/src/gromacs/mdlib/tests/constrtestrunners.cu @@ -72,9 +72,10 @@ void applyLincsGpu(ConstraintsTestData* testData, t_pbc pbc) { DeviceInformation deviceInfo; const DeviceContext deviceContext(deviceInfo); + const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false); auto lincsGpu = std::make_unique(testData->ir_.nLincsIter, testData->ir_.nProjOrder, - deviceContext, nullptr); + deviceContext, deviceStream); bool updateVelocities = true; int numAtoms = testData->numAtoms_; @@ -88,23 +89,23 @@ void applyLincsGpu(ConstraintsTestData* testData, t_pbc pbc) allocateDeviceBuffer(&d_xp, numAtoms, deviceContext); allocateDeviceBuffer(&d_v, numAtoms, deviceContext); - copyToDeviceBuffer(&d_x, (float3*)(testData->x_.data()), 0, numAtoms, nullptr, + copyToDeviceBuffer(&d_x, (float3*)(testData->x_.data()), 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); - copyToDeviceBuffer(&d_xp, (float3*)(testData->xPrime_.data()), 0, numAtoms, nullptr, + copyToDeviceBuffer(&d_xp, (float3*)(testData->xPrime_.data()), 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); if (updateVelocities) { - copyToDeviceBuffer(&d_v, (float3*)(testData->v_.data()), 0, numAtoms, nullptr, + copyToDeviceBuffer(&d_v, (float3*)(testData->v_.data()), 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); } lincsGpu->apply(d_x, d_xp, updateVelocities, d_v, testData->invdt_, testData->computeVirial_, testData->virialScaled_, pbcAiuc); - copyFromDeviceBuffer((float3*)(testData->xPrime_.data()), &d_xp, 0, numAtoms, nullptr, + copyFromDeviceBuffer((float3*)(testData->xPrime_.data()), &d_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); if (updateVelocities) { - copyFromDeviceBuffer((float3*)(testData->v_.data()), &d_v, 0, numAtoms, nullptr, + copyFromDeviceBuffer((float3*)(testData->v_.data()), &d_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); } diff --git a/src/gromacs/mdlib/tests/leapfrogtestrunners.cu b/src/gromacs/mdlib/tests/leapfrogtestrunners.cu index b794149ddb..7b2e22aac2 100644 --- a/src/gromacs/mdlib/tests/leapfrogtestrunners.cu +++ b/src/gromacs/mdlib/tests/leapfrogtestrunners.cu @@ -68,6 +68,7 @@ void integrateLeapFrogGpu(LeapFrogTestData* testData, int numSteps) { DeviceInformation deviceInfo; const DeviceContext deviceContext(deviceInfo); + const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false); int numAtoms = testData->numAtoms_; @@ -83,12 +84,12 @@ void integrateLeapFrogGpu(LeapFrogTestData* testData, int numSteps) allocateDeviceBuffer(&d_v, numAtoms, deviceContext); allocateDeviceBuffer(&d_f, numAtoms, deviceContext); - copyToDeviceBuffer(&d_x, h_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); - copyToDeviceBuffer(&d_xp, h_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); - copyToDeviceBuffer(&d_v, h_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); - copyToDeviceBuffer(&d_f, h_f, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_x, h_x, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_xp, h_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_v, h_v, 0, numAtoms, deviceStream, 
GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_f, h_f, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); - auto integrator = std::make_unique(deviceContext, nullptr); + auto integrator = std::make_unique(deviceContext, deviceStream); integrator->set(testData->mdAtoms_, testData->numTCoupleGroups_, testData->mdAtoms_.cTC); @@ -104,8 +105,8 @@ void integrateLeapFrogGpu(LeapFrogTestData* testData, int numSteps) testData->dtPressureCouple_, testData->velocityScalingMatrix_); } - copyFromDeviceBuffer(h_xp, &d_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); - copyFromDeviceBuffer(h_v, &d_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); + copyFromDeviceBuffer(h_xp, &d_x, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); + copyFromDeviceBuffer(h_v, &d_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); freeDeviceBuffer(&d_x); freeDeviceBuffer(&d_xp); diff --git a/src/gromacs/mdlib/tests/settletestrunners.cu b/src/gromacs/mdlib/tests/settletestrunners.cu index 6ebc6688da..741d2951aa 100644 --- a/src/gromacs/mdlib/tests/settletestrunners.cu +++ b/src/gromacs/mdlib/tests/settletestrunners.cu @@ -88,8 +88,9 @@ void applySettleGpu(SettleTestData* testData, DeviceInformation deviceInfo; const DeviceContext deviceContext(deviceInfo); + const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false); - auto settleGpu = std::make_unique(testData->mtop_, deviceContext, nullptr); + auto settleGpu = std::make_unique(testData->mtop_, deviceContext, deviceStream); settleGpu->set(*testData->idef_, testData->mdatoms_); PbcAiuc pbcAiuc; @@ -107,19 +108,20 @@ void applySettleGpu(SettleTestData* testData, allocateDeviceBuffer(&d_xp, numAtoms, deviceContext); allocateDeviceBuffer(&d_v, numAtoms, deviceContext); - copyToDeviceBuffer(&d_x, (float3*)h_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); - copyToDeviceBuffer(&d_xp, (float3*)h_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_x, (float3*)h_x, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_xp, (float3*)h_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); if (updateVelocities) { - copyToDeviceBuffer(&d_v, (float3*)h_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_v, (float3*)h_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); } settleGpu->apply(d_x, d_xp, updateVelocities, d_v, testData->reciprocalTimeStep_, calcVirial, testData->virial_, pbcAiuc); - copyFromDeviceBuffer((float3*)h_xp, &d_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); + copyFromDeviceBuffer((float3*)h_xp, &d_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); if (updateVelocities) { - copyFromDeviceBuffer((float3*)h_v, &d_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); + copyFromDeviceBuffer((float3*)h_v, &d_v, 0, numAtoms, deviceStream, + GpuApiCallBehavior::Sync, nullptr); } freeDeviceBuffer(&d_x); diff --git a/src/gromacs/mdlib/update_constrain_gpu.h b/src/gromacs/mdlib/update_constrain_gpu.h index 61f8537efa..c0a5e9b21a 100644 --- a/src/gromacs/mdlib/update_constrain_gpu.h +++ b/src/gromacs/mdlib/update_constrain_gpu.h @@ -50,8 +50,8 @@ #include "gromacs/utility/classhelpers.h" class DeviceContext; +class DeviceStream; class GpuEventSynchronizer; - struct gmx_mtop_t; enum class PbcType : int; class InteractionDefinitions; @@ -68,7 +68,7 @@ class UpdateConstrainGpu public: /*! 
\brief Create Update-Constrain object.
*
- * The constructor is given a non-nullptr \p commandStream, in which all the update and constrain
+ * The constructor is given a non-nullptr \p deviceStream, in which all the update and constrain
* routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify
* coordinates. The event is maintained outside this class and also passed to all (if any) consumers
* of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the
* @@ -79,13 +79,13 @@ public:
* \param[in] mtop Topology of the system: SETTLE gets the masses for O and H atoms
* and target O-H and H-H distances from this object.
* \param[in] deviceContext GPU device context.
- * \param[in] commandStream GPU stream to use. Can be nullptr.
+ * \param[in] deviceStream GPU stream to use.
* \param[in] xUpdatedOnDevice The event synchronizer to use to mark that update is done on the GPU.
*/
UpdateConstrainGpu(const t_inputrec& ir,
const gmx_mtop_t& mtop,
const DeviceContext& deviceContext,
- const void* commandStream,
+ const DeviceStream& deviceStream,
GpuEventSynchronizer* xUpdatedOnDevice);
~UpdateConstrainGpu();

diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp index 45a0743384..76f3a0eedf 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp
@@ -58,7 +58,7 @@ class UpdateConstrainGpu::Impl
UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec& /* ir */,
const gmx_mtop_t& /* mtop */,
const DeviceContext& /* deviceContext */,
- const void* /* commandStream */,
+ const DeviceStream& /* deviceStream */,
GpuEventSynchronizer* /* xUpdatedOnDevice */) :
impl_(nullptr)
{

diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu index 41f7572332..eed9e44d63 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
@@ -119,7 +119,7 @@ void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer* fRead
clear_mat(virial);

// Make sure that the forces are ready on device before proceeding with the update.
- fReadyOnDevice->enqueueWaitEvent(commandStream_);
+ fReadyOnDevice->enqueueWaitEvent(deviceStream_);

// The integrate should save a copy of the current coordinates in d_xp_ and write updated ones
// into d_x_. The d_xp_ is only needed by constraints.
@@ -141,7 +141,7 @@ void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer* fRead
}
}

- coordinatesReady_->markEvent(commandStream_);
+ coordinatesReady_->markEvent(deviceStream_);

return;
}
@@ -162,31 +162,30 @@ void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix)
"scaleCoordinates_kernel", kernelArgs);
// TODO: Although this only happens on the pressure coupling steps, this synchronization
// can affect the performance if nstpcouple is small.
- gpuStreamSynchronize(commandStream_);
+ deviceStream_.synchronize();
}

UpdateConstrainGpu::Impl::Impl(const t_inputrec& ir,
const gmx_mtop_t& mtop,
const DeviceContext& deviceContext,
- const void* commandStream,
+ const DeviceStream& deviceStream,
GpuEventSynchronizer* xUpdatedOnDevice) :
deviceContext_(deviceContext),
+ deviceStream_(deviceStream),
coordinatesReady_(xUpdatedOnDevice)
{
GMX_ASSERT(xUpdatedOnDevice != nullptr, "The event synchronizer can not be nullptr.");
- commandStream != nullptr ?
commandStream_ = *static_cast<const CommandStream*>(commandStream)
- : commandStream_ = nullptr;

- integrator_ = std::make_unique<LeapFrogGpu>(deviceContext_, commandStream_);
- lincsGpu_ = std::make_unique<LincsGpu>(ir.nLincsIter, ir.nProjOrder, deviceContext_, commandStream_);
- settleGpu_ = std::make_unique<SettleGpu>(mtop, deviceContext_, commandStream_);
+ integrator_ = std::make_unique<LeapFrogGpu>(deviceContext_, deviceStream_);
+ lincsGpu_ = std::make_unique<LincsGpu>(ir.nLincsIter, ir.nProjOrder, deviceContext_, deviceStream_);
+ settleGpu_ = std::make_unique<SettleGpu>(mtop, deviceContext_, deviceStream_);

coordinateScalingKernelLaunchConfig_.blockSize[0] = c_threadsPerBlock;
coordinateScalingKernelLaunchConfig_.blockSize[1] = 1;
coordinateScalingKernelLaunchConfig_.blockSize[2] = 1;
coordinateScalingKernelLaunchConfig_.sharedMemorySize = 0;
- coordinateScalingKernelLaunchConfig_.stream = commandStream_;
+ coordinateScalingKernelLaunchConfig_.stream = deviceStream_.stream();
}

UpdateConstrainGpu::Impl::~Impl() {}
@@ -235,9 +234,9 @@ GpuEventSynchronizer* UpdateConstrainGpu::Impl::getCoordinatesReadySync()
UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec& ir,
const gmx_mtop_t& mtop,
const DeviceContext& deviceContext,
- const void* commandStream,
+ const DeviceStream& deviceStream,
GpuEventSynchronizer* xUpdatedOnDevice) :
- impl_(new Impl(ir, mtop, deviceContext, commandStream, xUpdatedOnDevice))
+ impl_(new Impl(ir, mtop, deviceContext, deviceStream, xUpdatedOnDevice))
{
}

diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h index dd46010e93..8aacc28a61 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.h +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h
@@ -65,7 +65,7 @@ class UpdateConstrainGpu::Impl
public:
/*! \brief Create Update-Constrain object.
*
- * The constructor is given a non-nullptr \p commandStream, in which all the update and constrain
+ * The constructor is given a non-nullptr \p deviceStream, in which all the update and constrain
* routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify
* coordinates. The event is maintained outside this class and also passed to all (if any) consumers
* of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the
* @@ -76,13 +76,13 @@ public:
* \param[in] mtop Topology of the system: SETTLE gets the masses for O and H atoms
* and target O-H and H-H distances from this object.
* \param[in] deviceContext GPU device context.
- * \param[in] commandStream GPU stream to use. Can be nullptr.
+ * \param[in] deviceStream GPU stream to use.
* \param[in] xUpdatedOnDevice The event synchronizer to use to mark that update is done on the GPU.
*/
Impl(const t_inputrec& ir,
const gmx_mtop_t& mtop,
const DeviceContext& deviceContext,
- const void* commandStream,
+ const DeviceStream& deviceStream,
GpuEventSynchronizer* xUpdatedOnDevice);
~Impl();
@@ -171,7 +171,7 @@ private:
//! GPU context object
const DeviceContext& deviceContext_;
//! GPU stream
- CommandStream commandStream_ = nullptr;
+ const DeviceStream& deviceStream_;
//!
GPU kernel launch config
KernelLaunchConfig coordinateScalingKernelLaunchConfig_;

diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index 941a7030c9..91360ccd28 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp
@@ -403,9 +403,10 @@ void gmx::LegacySimulator::do_md()
GMX_RELEASE_ASSERT(fr->deviceContext != nullptr,
"GPU device context should be initialized to use GPU update.");
-
+ GMX_RELEASE_ASSERT(stateGpu->getUpdateStream() != nullptr,
+ "Update stream can not be nullptr when update is on a GPU.");
integrator = std::make_unique<UpdateConstrainGpu>(*ir, *top_global, *fr->deviceContext,
- stateGpu->getUpdateStream(),
+ *stateGpu->getUpdateStream(),
stateGpu->xUpdatedOnDevice());

integrator->setPbc(PbcType::Xyz, state->box);
@@ -867,14 +868,20 @@ void gmx::LegacySimulator::do_md()
&& useGpuForNonbonded && is1D(*cr->dd))
{
// TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
- void* streamLocal =
+ const DeviceStream* localStream =
Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
- void* streamNonLocal = Nbnxm::gpu_get_command_stream(
+ const DeviceStream* nonLocalStream = Nbnxm::gpu_get_command_stream(
fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
GMX_RELEASE_ASSERT(
fr->deviceContext != nullptr,
"GPU device context should be initialized to use GPU halo exchange.");
- constructGpuHaloExchange(mdlog, *cr, *fr->deviceContext, streamLocal, streamNonLocal);
+ GMX_RELEASE_ASSERT(localStream != nullptr,
+ "Local non-bonded stream can't be nullptr when using GPU "
+ "halo exchange.");
+ GMX_RELEASE_ASSERT(nonLocalStream != nullptr,
+ "Non-local non-bonded stream can't be nullptr when using "
+ "GPU halo exchange.");
+ constructGpuHaloExchange(mdlog, *cr, *fr->deviceContext, *localStream, *nonLocalStream);
}
}
}

diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index 96e157ca07..753a43ab35 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp
@@ -1373,7 +1373,9 @@ int Mdrunner::mdrunner()
GMX_RELEASE_ASSERT(
fr->deviceContext != nullptr,
"Device context can not be nullptr when computing bonded interactions on GPU.");
- gpuBonded = std::make_unique<GpuBonded>(mtop.ffparams, *fr->deviceContext, stream, wcycle);
+ GMX_RELEASE_ASSERT(stream != nullptr,
+ "Can't run GPU version of bonded forces in nullptr stream.");
+ gpuBonded = std::make_unique<GpuBonded>(mtop.ffparams, *fr->deviceContext, *stream, wcycle);
fr->gpuBonded = gpuBonded.get();
}
@@ -1584,12 +1586,12 @@ int Mdrunner::mdrunner()
&& ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME)) || runScheduleWork.simulationWork.useGpuBufferOps))
{
- const void* pmeStream = pme_gpu_get_device_stream(fr->pmedata);
- const void* localStream =
+ const DeviceStream* pmeStream = pme_gpu_get_device_stream(fr->pmedata);
+ const DeviceStream* localStream =
fr->nbv->gpu_nbv != nullptr
? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local)
: nullptr;
- const void* nonLocalStream =
+ const DeviceStream* nonLocalStream =
fr->nbv->gpu_nbv != nullptr
?
Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal) : nullptr; diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h index 034e7eb604..678fa33681 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h @@ -60,6 +60,7 @@ #include "locality.h" class DeviceContext; +class DeviceStream; class GpuEventSynchronizer; struct gmx_wallcycle; @@ -99,9 +100,6 @@ public: * \p pmeStream argument needs to be a valid OpenCL queue object * which must have been created in \p deviceContext. * - * \todo Make a \p CommandStream visible in the CPU parts of the code so we - * will not have to pass a void*. - * * \param[in] pmeStream Device PME stream, nullptr allowed. * \param[in] localStream Device NBNXM local stream, nullptr allowed. * \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed. @@ -110,9 +108,9 @@ public: * \param[in] paddingSize Padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ - StatePropagatorDataGpu(const void* pmeStream, - const void* localStream, - const void* nonLocalStream, + StatePropagatorDataGpu(const DeviceStream* pmeStream, + const DeviceStream* localStream, + const DeviceStream* nonLocalStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, int paddingSize, @@ -134,7 +132,7 @@ public: * \param[in] paddingSize Padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ - StatePropagatorDataGpu(const void* pmeStream, + StatePropagatorDataGpu(const DeviceStream* pmeStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, int paddingSize, @@ -329,7 +327,7 @@ public: * * \returns The device command stream to use in update-constraints. */ - void* getUpdateStream(); + const DeviceStream* getUpdateStream(); /*! \brief Getter for the number of local atoms. 
* diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp index 1029dd220f..78b1fd3a4a 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp @@ -54,9 +54,9 @@ class StatePropagatorDataGpu::Impl { }; -StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */, - const void* /* localStream */, - const void* /* nonLocalStream */, +StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream */, + const DeviceStream* /* localStream */, + const DeviceStream* /* nonLocalStream */, const DeviceContext& /* deviceContext */, GpuApiCallBehavior /* transferKind */, int /* paddingSize */, @@ -65,7 +65,7 @@ StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */ { } -StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */, +StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream */, const DeviceContext& /* deviceContext */, GpuApiCallBehavior /* transferKind */, int /* paddingSize */, @@ -242,7 +242,7 @@ void StatePropagatorDataGpu::waitForcesReadyOnHost(AtomLocality /* atomLocality } -void* StatePropagatorDataGpu::getUpdateStream() +const DeviceStream* StatePropagatorDataGpu::getUpdateStream() { GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from " diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h index 679bf2544a..1b2c91d2e2 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h @@ -99,9 +99,6 @@ public: * \p pmeStream argument needs to be a valid OpenCL queue object * which must have been created in \p deviceContext. * - * \todo Make a \p CommandStream visible in the CPU parts of the code so we - * will not have to pass a void*. - * * \param[in] pmeStream Device PME stream, nullptr allowed. * \param[in] localStream Device NBNXM local stream, nullptr allowed. * \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed. * \param[in] deviceContext Device context, must be initialized for OpenCL. * \param[in] paddingSize Padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ - Impl(const void* pmeStream, - const void* localStream, - const void* nonLocalStream, + Impl(const DeviceStream* pmeStream, + const DeviceStream* localStream, + const DeviceStream* nonLocalStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, int paddingSize, @@ -134,7 +131,7 @@ public: * \param[in] paddingSize Padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ - Impl(const void* pmeStream, + Impl(const DeviceStream* pmeStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, int paddingSize, @@ -325,7 +322,7 @@ public: * * \returns The device command stream to use in update-constraints. */ - void* getUpdateStream(); + const DeviceStream* getUpdateStream(); /*! \brief Getter for the number of local atoms. * @@ -341,20 +338,23 @@ public: private: //! GPU PME stream. - CommandStream pmeStream_ = nullptr; + const DeviceStream* pmeStream_; //! GPU NBNXM local stream. - CommandStream localStream_ = nullptr; - //! GPU NBNXM non-local stream + const DeviceStream* localStream_; + //! GPU NBNXM non-local stream. + const DeviceStream* nonLocalStream_; //! GPU Update-constraints stream.
- CommandStream updateStream_ = nullptr; + const DeviceStream* updateStream_; + + //! An owned update stream, used in case we manage the stream lifetime here. Temporary. + DeviceStream updateStreamOwn_; // Streams to use for coordinates H2D and D2H copies (one stream for each atom locality) - EnumerationArray xCopyStreams_ = { { nullptr } }; + EnumerationArray xCopyStreams_ = { { nullptr } }; // Streams to use for velocities H2D and D2H copies (one stream for each atom locality) - EnumerationArray vCopyStreams_ = { { nullptr } }; + EnumerationArray vCopyStreams_ = { { nullptr } }; // Streams to use for forces H2D and D2H copies (one stream for each atom locality) - EnumerationArray fCopyStreams_ = { { nullptr } }; + EnumerationArray fCopyStreams_ = { { nullptr } }; /*! \brief An array of events that indicate H2D copy is complete (one event for each atom locality) * @@ -422,13 +422,13 @@ private: * \param[in] h_data Host-side buffer. * \param[in] dataSize Device-side data allocation size. * \param[in] atomLocality If all, local or non-local ranges should be copied. - * \param[in] commandStream GPU stream to execute copy in. + * \param[in] deviceStream GPU stream to execute copy in. */ void copyToDevice(DeviceBuffer d_data, gmx::ArrayRef h_data, int dataSize, AtomLocality atomLocality, - CommandStream commandStream); + const DeviceStream& deviceStream); /*! \brief Performs the copy of data from device to host buffer. * * \param[in] d_data Device-side buffer. * \param[in] dataSize Device-side data allocation size. * \param[in] atomLocality If all, local or non-local ranges should be copied. - * \param[in] commandStream GPU stream to execute copy in. + * \param[in] deviceStream GPU stream to execute copy in. */ void copyFromDevice(gmx::ArrayRef h_data, DeviceBuffer d_data, int dataSize, AtomLocality atomLocality, - CommandStream commandStream); + const DeviceStream& deviceStream); }; } // namespace gmx diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp index d88f469711..b1fefd34a5 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp @@ -65,9 +65,9 @@ namespace gmx { -StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, - const void* localStream, - const void* nonLocalStream, +StatePropagatorDataGpu::Impl::Impl(const DeviceStream* pmeStream, + const DeviceStream* localStream, + const DeviceStream* nonLocalStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, int paddingSize, @@ -86,8 +86,8 @@ StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set in OpenCL builds."); // The update stream is set to the PME stream in OpenCL, since the PME stream is the only stream created in the PME context.
- pmeStream_ = *static_cast(pmeStream); - updateStream_ = *static_cast(pmeStream); + pmeStream_ = pmeStream; + updateStream_ = pmeStream; GMX_UNUSED_VALUE(localStream); GMX_UNUSED_VALUE(nonLocalStream); } @@ -96,21 +96,24 @@ StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, { if (pmeStream != nullptr) { - pmeStream_ = *static_cast(pmeStream); + pmeStream_ = pmeStream; } if (localStream != nullptr) { - localStream_ = *static_cast(localStream); + localStream_ = localStream; } if (nonLocalStream != nullptr) { - nonLocalStream_ = *static_cast(nonLocalStream); + nonLocalStream_ = nonLocalStream; } // TODO: The update stream should be created only when it is needed. # if (GMX_GPU == GMX_GPU_CUDA) - cudaError_t stat; - stat = cudaStreamCreate(&updateStream_); - CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu"); + cudaError_t stat; + cudaStream_t stream; + stat = cudaStreamCreate(&stream); + CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu"); + updateStreamOwn_.setStream(stream); + updateStream_ = &updateStreamOwn_; # endif } @@ -131,7 +134,7 @@ StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, fCopyStreams_[AtomLocality::All] = updateStream_; } -StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, +StatePropagatorDataGpu::Impl::Impl(const DeviceStream* pmeStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, int paddingSize, @@ -145,9 +148,8 @@ StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, "This object should only be constructed on the GPU code-paths."); GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set."); - pmeStream_ = *static_cast(pmeStream); - - localStream_ = nullptr; + pmeStream_ = pmeStream; + localStream_ = pmeStream; // For clearing the force buffer nonLocalStream_ = nullptr; updateStream_ = nullptr; @@ -193,7 +195,7 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll) if (paddingAllocationSize > 0) { // The PME stream is used here because the padding region of d_x_ is only in the PME task. - clearDeviceBufferAsync(&d_x_, numAtomsAll_, paddingAllocationSize, pmeStream_); + clearDeviceBufferAsync(&d_x_, numAtomsAll_, paddingAllocationSize, *pmeStream_); } reallocateDeviceBuffer(&d_v_, numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_); @@ -204,7 +206,7 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll) // since the force buffer ops are not implemented in OpenCL.
if (GMX_GPU == GMX_GPU_CUDA && d_fCapacity_ != d_fOldCapacity) { - clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, localStream_); + clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, *localStream_); } wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); @@ -247,7 +249,7 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer const gmx::ArrayRef h_data, int dataSize, AtomLocality atomLocality, - CommandStream commandStream) + const DeviceStream& deviceStream) { GMX_UNUSED_VALUE(dataSize); @@ -255,7 +257,7 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated."); - GMX_ASSERT(commandStream != nullptr, + GMX_ASSERT(deviceStream.stream() != nullptr, "No stream is valid for copying with given atom locality."); wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); @@ -271,7 +273,7 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer "The host buffer is smaller than the requested copy range."); copyToDeviceBuffer(&d_data, reinterpret_cast(&h_data.data()[atomsStartAt]), - atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr); + atomsStartAt, numAtomsToCopy, deviceStream, transferKind_, nullptr); } wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); @@ -282,7 +284,7 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef h_dat DeviceBuffer d_data, int dataSize, AtomLocality atomLocality, - CommandStream commandStream) + const DeviceStream& deviceStream) { GMX_UNUSED_VALUE(dataSize); @@ -290,7 +292,7 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef h_dat GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated."); - GMX_ASSERT(commandStream != nullptr, + GMX_ASSERT(deviceStream.stream() != nullptr, "No stream is valid for copying with given atom locality."); wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); @@ -306,7 +308,7 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef h_dat "The host buffer is smaller than the requested copy range."); copyFromDeviceBuffer(reinterpret_cast(&h_data.data()[atomsStartAt]), &d_data, - atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr); + atomsStartAt, numAtomsToCopy, deviceStream, transferKind_, nullptr); } wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); @@ -322,14 +324,14 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef h_x, AtomLocality atomLocality) { GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality."); - CommandStream commandStream = xCopyStreams_[atomLocality]; - GMX_ASSERT(commandStream != nullptr, + const DeviceStream* deviceStream = xCopyStreams_[atomLocality]; + GMX_ASSERT(deviceStream != nullptr, "No stream is valid for copying positions with given atom locality."); wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); - copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, commandStream); + copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, *deviceStream); // Note: unlike copyCoordinatesToGpu this is not used in OpenCL, and the conditional is not needed. 
- xReadyOnHost_[atomLocality].markEvent(commandStream); + xReadyOnHost_[atomLocality].markEvent(*deviceStream); wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); wallcycle_stop(wcycle_, ewcLAUNCH_GPU); @@ -419,15 +421,15 @@ void StatePropagatorDataGpu::Impl::copyVelocitiesToGpu(const gmx::ArrayRef h_v, AtomLocality atomLocality) { GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality."); - CommandStream commandStream = vCopyStreams_[atomLocality]; - GMX_ASSERT(commandStream != nullptr, + const DeviceStream* deviceStream = vCopyStreams_[atomLocality]; + GMX_ASSERT(deviceStream != nullptr, "No stream is valid for copying velocities with given atom locality."); wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); - copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, commandStream); - vReadyOnHost_[atomLocality].markEvent(commandStream); + copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, *deviceStream); + vReadyOnHost_[atomLocality].markEvent(*deviceStream); wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); wallcycle_stop(wcycle_, ewcLAUNCH_GPU); @@ -473,15 +475,15 @@ void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef h_f, AtomLocality atomLocality) { GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality."); - CommandStream commandStream = fCopyStreams_[atomLocality]; - GMX_ASSERT(commandStream != nullptr, + const DeviceStream* deviceStream = fCopyStreams_[atomLocality]; + GMX_ASSERT(deviceStream != nullptr, "No stream is valid for copying forces with given atom locality."); wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); - copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, commandStream); - fReadyOnHost_[atomLocality].markEvent(commandStream); + copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, *deviceStream); + fReadyOnHost_[atomLocality].markEvent(*deviceStream); wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); wallcycle_stop(wcycle_, ewcLAUNCH_GPU); @@ -529,9 +531,9 @@ void StatePropagatorDataGpu::Impl::waitForcesReadyOnHost(AtomLocality atomLocali wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA); } -void* StatePropagatorDataGpu::Impl::getUpdateStream() +const DeviceStream* StatePropagatorDataGpu::Impl::getUpdateStream() { - return &updateStream_; + return updateStream_; } int StatePropagatorDataGpu::Impl::numAtomsLocal() @@ -545,9 +547,9 @@ int StatePropagatorDataGpu::Impl::numAtomsAll() } -StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream, - const void* localStream, - const void* nonLocalStream, +StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* pmeStream, + const DeviceStream* localStream, + const DeviceStream* nonLocalStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, int paddingSize, @@ -556,7 +558,7 @@ StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream, { } -StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream, +StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* pmeStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, int paddingSize, @@ -682,7 +684,7 @@ void StatePropagatorDataGpu::waitForcesReadyOnHost(AtomLocality atomLocality) } -void* StatePropagatorDataGpu::getUpdateStream() +const DeviceStream* StatePropagatorDataGpu::getUpdateStream() { return impl_->getUpdateStream(); } diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu 
b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index e47834cb9a..54ce9f331b 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -366,7 +366,7 @@ static inline int calc_shmem_required_nonbonded(const int num_thre */ void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu* nb, const InteractionLocality interactionLocality) { - cudaStream_t stream = nb->stream[interactionLocality]; + const DeviceStream& deviceStream = nb->deviceStreams[interactionLocality]; /* When we get here all misc operations issued in the local stream as well as the local xq H2D are done, @@ -378,12 +378,13 @@ void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu* nb, const InteractionLocal { if (interactionLocality == InteractionLocality::Local) { - cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream); + cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, deviceStream.stream()); CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed"); } else { - cudaError_t stat = cudaStreamWaitEvent(stream, nb->misc_ops_and_local_H2D_done, 0); + cudaError_t stat = + cudaStreamWaitEvent(deviceStream.stream(), nb->misc_ops_and_local_H2D_done, 0); CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed"); } } @@ -401,10 +402,10 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ - cu_atomdata_t* adat = nb->atdat; - cu_plist_t* plist = nb->plist[iloc]; - cu_timers_t* t = nb->timers; - cudaStream_t stream = nb->stream[iloc]; + cu_atomdata_t* adat = nb->atdat; + cu_plist_t* plist = nb->plist[iloc]; + cu_timers_t* t = nb->timers; + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; bool bDoTime = nb->bDoTime; @@ -440,16 +441,16 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom /* beginning of timed HtoD section */ if (bDoTime) { - t->xf[atomLocality].nb_h2d.openTimingRegion(stream); + t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream); } cu_copy_H2D_async(adat->xq + adat_begin, static_cast(nbatom->x().data() + adat_begin * 4), - adat_len * sizeof(*adat->xq), stream); + adat_len * sizeof(*adat->xq), deviceStream.stream()); if (bDoTime) { - t->xf[atomLocality].nb_h2d.closeTimingRegion(stream); + t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream); } /* When we get here all misc operations issued in the local stream as well as @@ -480,11 +481,11 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom */ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc) { - cu_atomdata_t* adat = nb->atdat; - cu_nbparam_t* nbp = nb->nbparam; - cu_plist_t* plist = nb->plist[iloc]; - cu_timers_t* t = nb->timers; - cudaStream_t stream = nb->stream[iloc]; + cu_atomdata_t* adat = nb->atdat; + cu_nbparam_t* nbp = nb->nbparam; + cu_plist_t* plist = nb->plist[iloc]; + cu_timers_t* t = nb->timers; + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; bool bDoTime = nb->bDoTime; @@ -522,7 +523,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In /* beginning of timed nonbonded calculation section */ if (bDoTime) { - t->interaction[iloc].nb_k.openTimingRegion(stream); + t->interaction[iloc].nb_k.openTimingRegion(deviceStream); } /* Kernel launch config: @@ -544,7 +545,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In config.blockSize[2] = 
num_threads_z; config.gridSize[0] = nblock; config.sharedMemorySize = calc_shmem_required_nonbonded(num_threads_z, nb->deviceInfo, nbp); - config.stream = stream; + config.stream = deviceStream.stream(); if (debug) { @@ -567,13 +568,13 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In if (bDoTime) { - t->interaction[iloc].nb_k.closeTimingRegion(stream); + t->interaction[iloc].nb_k.closeTimingRegion(deviceStream); } if (GMX_NATIVE_WINDOWS) { /* Windows: force flushing WDDM queue */ - cudaStreamQuery(stream); + cudaStreamQuery(deviceStream.stream()); } } @@ -592,11 +593,11 @@ static inline int calc_shmem_required_prune(const int num_threads_z) void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts) { - cu_atomdata_t* adat = nb->atdat; - cu_nbparam_t* nbp = nb->nbparam; - cu_plist_t* plist = nb->plist[iloc]; - cu_timers_t* t = nb->timers; - cudaStream_t stream = nb->stream[iloc]; + cu_atomdata_t* adat = nb->atdat; + cu_nbparam_t* nbp = nb->nbparam; + cu_plist_t* plist = nb->plist[iloc]; + cu_timers_t* t = nb->timers; + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; bool bDoTime = nb->bDoTime; @@ -652,7 +653,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c /* beginning of timed prune calculation section */ if (bDoTime) { - timer->openTimingRegion(stream); + timer->openTimingRegion(deviceStream); } /* Kernel launch config: @@ -668,7 +669,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c config.blockSize[2] = num_threads_z; config.gridSize[0] = nblock; config.sharedMemorySize = calc_shmem_required_prune(num_threads_z); - config.stream = stream; + config.stream = deviceStream.stream(); if (debug) { @@ -704,13 +705,13 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c if (bDoTime) { - timer->closeTimingRegion(stream); + timer->closeTimingRegion(deviceStream); } if (GMX_NATIVE_WINDOWS) { /* Windows: force flushing WDDM queue */ - cudaStreamQuery(stream); + cudaStreamQuery(deviceStream.stream()); } } @@ -728,10 +729,10 @@ void gpu_launch_cpyback(NbnxmGpu* nb, const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); /* extract the data */ - cu_atomdata_t* adat = nb->atdat; - cu_timers_t* t = nb->timers; - bool bDoTime = nb->bDoTime; - cudaStream_t stream = nb->stream[iloc]; + cu_atomdata_t* adat = nb->atdat; + cu_timers_t* t = nb->timers; + bool bDoTime = nb->bDoTime; + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; /* don't launch non-local copy-back if there was no non-local work to do */ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc)) @@ -744,14 +745,14 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* beginning of timed D2H section */ if (bDoTime) { - t->xf[atomLocality].nb_d2h.openTimingRegion(stream); + t->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream); } /* With DD the local D2H transfer can only start after the non-local kernel has finished. 
*/ if (iloc == InteractionLocality::Local && nb->bUseTwoStreams) { - stat = cudaStreamWaitEvent(stream, nb->nonlocal_done, 0); + stat = cudaStreamWaitEvent(deviceStream.stream(), nb->nonlocal_done, 0); CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed"); } @@ -761,7 +762,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, if (!stepWork.useGpuFBufferOps) { cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin, - (adat_len) * sizeof(*adat->f), stream); + (adat_len) * sizeof(*adat->f), deviceStream.stream()); } /* After the non-local D2H is launched the nonlocal_done event can be @@ -770,7 +771,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, back first. */ if (iloc == InteractionLocality::NonLocal) { - stat = cudaEventRecord(nb->nonlocal_done, stream); + stat = cudaEventRecord(nb->nonlocal_done, deviceStream.stream()); CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed"); } @@ -780,20 +781,21 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* DtoH fshift when virial is needed */ if (stepWork.computeVirial) { - cu_copy_D2H_async(nb->nbst.fshift, adat->fshift, SHIFTS * sizeof(*nb->nbst.fshift), stream); + cu_copy_D2H_async(nb->nbst.fshift, adat->fshift, SHIFTS * sizeof(*nb->nbst.fshift), + deviceStream.stream()); } /* DtoH energies */ if (stepWork.computeEnergy) { - cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, sizeof(*nb->nbst.e_lj), stream); - cu_copy_D2H_async(nb->nbst.e_el, adat->e_el, sizeof(*nb->nbst.e_el), stream); + cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, sizeof(*nb->nbst.e_lj), deviceStream.stream()); + cu_copy_D2H_async(nb->nbst.e_el, adat->e_el, sizeof(*nb->nbst.e_el), deviceStream.stream()); } } if (bDoTime) { - t->xf[atomLocality].nb_d2h.closeTimingRegion(stream); + t->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream); } } @@ -834,7 +836,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, const int numAtomsPerCell = grid.numAtomsPerCell(); Nbnxm::InteractionLocality interactionLoc = gpuAtomToInteractionLocality(locality); - cudaStream_t stream = nb->stream[interactionLoc]; + const DeviceStream& deviceStream = nb->deviceStreams[interactionLoc]; int numAtoms = grid.srcAtomEnd() - grid.srcAtomBegin(); // avoid empty kernel launch, skip to inserting stream dependency @@ -845,7 +847,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, // ensure that coordinates are ready on the device before launching the kernel GMX_ASSERT(xReadyOnDevice, "Need a valid GpuEventSynchronizer object"); - xReadyOnDevice->enqueueWaitEvent(stream); + xReadyOnDevice->enqueueWaitEvent(deviceStream); KernelLaunchConfig config; config.blockSize[0] = c_bufOpsThreadsPerBlock; @@ -858,7 +860,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, GMX_ASSERT(config.gridSize[0] > 0, "Can not have empty grid, early return above avoids this"); config.sharedMemorySize = 0; - config.stream = stream; + config.stream = deviceStream.stream(); auto kernelFn = setFillerCoords ? 
nbnxn_gpu_x_to_nbat_x_kernel : nbnxn_gpu_x_to_nbat_x_kernel; @@ -873,7 +875,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs); } // TODO: note that this is not necessary when there are no local atoms, that is: // (numAtoms == 0 && interactionLoc == InteractionLocality::Local) // but for now we avoid that optimization nbnxnInsertNonlocalGpuDependency(nb, interactionLoc); @@ -899,9 +901,9 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLo GMX_ASSERT(numAtoms != 0, "Cannot call function with no atoms"); GMX_ASSERT(totalForcesDevice, "Need a valid totalForcesDevice pointer"); - const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality); - cudaStream_t stream = nb->stream[iLocality]; - cu_atomdata_t* adat = nb->atdat; + const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality); + const DeviceStream& deviceStream = nb->deviceStreams[iLocality]; + cu_atomdata_t* adat = nb->atdat; size_t gmx_used_in_debug numDependency = static_cast((useGpuFPmeReduction == true)) + static_cast((accumulateForce == true)); @@ -911,7 +913,7 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLo // Enqueue wait on all dependencies passed for (auto const synchronizer : dependencyList) { - synchronizer->enqueueWaitEvent(stream); + synchronizer->enqueueWaitEvent(deviceStream); } /* launch kernel */ @@ -924,7 +926,7 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLo config.gridSize[1] = 1; config.gridSize[2] = 1; config.sharedMemorySize = 0; - config.stream = stream; + config.stream = deviceStream.stream(); auto kernelFn = accumulateForce ? nbnxn_gpu_add_nbat_f_to_f_kernel : nbnxn_gpu_add_nbat_f_to_f_kernel; @@ -950,7 +952,7 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLo { GMX_ASSERT(nb->localFReductionDone != nullptr, "localFReductionDone has to be a valid pointer"); - nb->localFReductionDone->markEvent(stream); + nb->localFReductionDone->markEvent(deviceStream); } } diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index 666aefc629..cbd5f8ce01 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -448,7 +448,9 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, nb->deviceInfo = deviceInfo; /* local/non-local GPU streams */ - stat = cudaStreamCreate(&nb->stream[InteractionLocality::Local]); + cudaStream_t localStream; + stat = cudaStreamCreate(&localStream); CU_RET_ERR(stat, "cudaStreamCreate on stream[InteractionLocality::Local] failed"); + nb->deviceStreams[InteractionLocality::Local].setStream(localStream); if (nb->bUseTwoStreams) { @@ -462,8 +464,9 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, stat = cudaDeviceGetStreamPriorityRange(nullptr, &highest_priority); CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed"); - stat = cudaStreamCreateWithPriority(&nb->stream[InteractionLocality::NonLocal], - cudaStreamDefault, highest_priority); + cudaStream_t nonLocalStream; + stat = cudaStreamCreateWithPriority(&nonLocalStream, cudaStreamDefault, highest_priority); CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[InteractionLocality::NonLocal] failed"); + nb->deviceStreams[InteractionLocality::NonLocal].setStream(nonLocalStream); } @@ -512,10 +515,10 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, void
gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc) { - char sbuf[STRLEN]; - bool bDoTime = (nb->bDoTime && !h_plist->sci.empty()); - cudaStream_t stream = nb->stream[iloc]; - cu_plist_t* d_plist = nb->plist[iloc]; + char sbuf[STRLEN]; + bool bDoTime = (nb->bDoTime && !h_plist->sci.empty()); + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; + cu_plist_t* d_plist = nb->plist[iloc]; if (d_plist->na_c < 0) { @@ -535,18 +538,18 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte if (bDoTime) { - iTimers.pl_h2d.openTimingRegion(stream); + iTimers.pl_h2d.openTimingRegion(deviceStream); iTimers.didPairlistH2D = true; } reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, DeviceContext()); - copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream, + copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), deviceStream, GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, DeviceContext()); - copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream, + copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), deviceStream, GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit, @@ -554,12 +557,12 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, DeviceContext()); - copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream, + copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), deviceStream, GpuApiCallBehavior::Async, bDoTime ? 
iTimers.pl_h2d.fetchNextEvent() : nullptr); if (bDoTime) { - iTimers.pl_h2d.closeTimingRegion(stream); + iTimers.pl_h2d.closeTimingRegion(deviceStream); } /* the next use of this list will be the first one, so we need to prune */ @@ -569,7 +572,7 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) { cu_atomdata_t* adat = nb->atdat; - cudaStream_t ls = nb->stream[InteractionLocality::Local]; + cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local].stream(); /* only if we have a dynamic box */ if (nbatom->bDynamicBox || !adat->bShiftVecUploaded) @@ -584,7 +587,7 @@ static void nbnxn_cuda_clear_f(NbnxmGpu* nb, int natoms_clear) { cudaError_t stat; cu_atomdata_t* adat = nb->atdat; - cudaStream_t ls = nb->stream[InteractionLocality::Local]; + cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local].stream(); stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls); CU_RET_ERR(stat, "cudaMemsetAsync on f failed"); @@ -595,7 +598,7 @@ static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb) { cudaError_t stat; cu_atomdata_t* adat = nb->atdat; - cudaStream_t ls = nb->stream[InteractionLocality::Local]; + cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local].stream(); stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls); CU_RET_ERR(stat, "cudaMemsetAsync on fshift failed"); @@ -618,13 +621,13 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) { - cudaError_t stat; - int nalloc, natoms; - bool realloced; - bool bDoTime = nb->bDoTime; - cu_timers_t* timers = nb->timers; - cu_atomdata_t* d_atdat = nb->atdat; - cudaStream_t ls = nb->stream[InteractionLocality::Local]; + cudaError_t stat; + int nalloc, natoms; + bool realloced; + bool bDoTime = nb->bDoTime; + cu_timers_t* timers = nb->timers; + cu_atomdata_t* d_atdat = nb->atdat; + const DeviceStream& deviceStream = nb->deviceStreams[InteractionLocality::Local]; natoms = nbat->numAtoms(); realloced = false; @@ -632,7 +635,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) if (bDoTime) { /* time async copy */ - timers->atdat.openTimingRegion(ls); + timers->atdat.openTimingRegion(deviceStream); } /* need to reallocate if we have to copy more atoms than the amount of space @@ -681,17 +684,17 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) if (useLjCombRule(nb->nbparam)) { cu_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(), - natoms * sizeof(*d_atdat->lj_comb), ls); + natoms * sizeof(*d_atdat->lj_comb), deviceStream.stream()); } else { cu_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(), - natoms * sizeof(*d_atdat->atom_types), ls); + natoms * sizeof(*d_atdat->atom_types), deviceStream.stream()); } if (bDoTime) { - timers->atdat.closeTimingRegion(ls); + timers->atdat.closeTimingRegion(deviceStream); } } @@ -725,15 +728,6 @@ void gpu_free(NbnxmGpu* nb) CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done"); delete nb->timers; - if (nb->bDoTime) - { - /* The non-local counters/stream (second in the array) are needed only with DD. */ - for (int i = 0; i <= (nb->bUseTwoStreams ?
1 : 0); i++) - { - stat = cudaStreamDestroy(nb->stream[i]); - CU_RET_ERR(stat, "cudaStreamDestroy failed on stream"); - } - } if (!useLjCombRule(nb->nbparam)) { @@ -822,11 +816,11 @@ gmx_bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb) return ((nb->nbparam->eeltype == eelCuEWALD_ANA) || (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN)); } -void* gpu_get_command_stream(NbnxmGpu* nb, const InteractionLocality iloc) +const DeviceStream* gpu_get_command_stream(NbnxmGpu* nb, const InteractionLocality iloc) { assert(nb); - return static_cast(&nb->stream[iloc]); + return &nb->deviceStreams[iloc]; } void* gpu_get_xq(NbnxmGpu* nb) @@ -854,9 +848,9 @@ DeviceBuffer gpu_get_fshift(NbnxmGpu* nb) /* TODO Remove explicit pinning from host arrays from here and manage in a more natural way*/ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv) { - cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local]; - bool bDoTime = gpu_nbv->bDoTime; - const int maxNumColumns = gridSet.numColumnsMax(); + const DeviceStream& deviceStream = gpu_nbv->deviceStreams[InteractionLocality::Local]; + bool bDoTime = gpu_nbv->bDoTime; + const int maxNumColumns = gridSet.numColumnsMax(); reallocateDeviceBuffer(&gpu_nbv->cxy_na, maxNumColumns * gridSet.grids().size(), &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, DeviceContext()); @@ -882,15 +876,15 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv if (bDoTime) { - gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream); + gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(deviceStream); } - copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, stream, + copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, deviceStream, GpuApiCallBehavior::Async, nullptr); if (bDoTime) { - gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream); + gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(deviceStream); } } @@ -898,28 +892,30 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv { if (bDoTime) { - gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream); + gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(deviceStream); } int* destPtr = &gpu_nbv->cxy_na[maxNumColumns * g]; - copyToDeviceBuffer(&destPtr, cxy_na, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr); + copyToDeviceBuffer(&destPtr, cxy_na, 0, numColumns, deviceStream, + GpuApiCallBehavior::Async, nullptr); if (bDoTime) { - gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream); + gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(deviceStream); } if (bDoTime) { - gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream); + gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(deviceStream); } destPtr = &gpu_nbv->cxy_ind[maxNumColumns * g]; - copyToDeviceBuffer(&destPtr, cxy_ind, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr); + copyToDeviceBuffer(&destPtr, cxy_ind, 0, numColumns, deviceStream, + GpuApiCallBehavior::Async, nullptr); if (bDoTime) { - gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream); + gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(deviceStream); } } } @@ -943,7 +939,7 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int* cell, GpuEventSynchronizer* const localReductionDone) { - cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local]; + const 
DeviceStream& deviceStream = gpu_nbv->deviceStreams[InteractionLocality::Local]; GMX_ASSERT(localReductionDone, "localReductionDone should be a valid pointer"); gpu_nbv->localFReductionDone = localReductionDone; @@ -952,7 +948,8 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int* cell, { reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, DeviceContext()); - copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, stream, GpuApiCallBehavior::Async, nullptr); + copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, deviceStream, + GpuApiCallBehavior::Async, nullptr); } return; diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h index da607e4429..d2bbfa6b8e 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h @@ -303,7 +303,7 @@ struct NbnxmGpu /*! \brief staging area where fshift/energies get downloaded */ nb_staging_t nbst; /*! \brief local and non-local GPU streams */ - gmx::EnumerationArray stream = { { nullptr } }; + gmx::EnumerationArray deviceStreams; /*! \brief Events used for synchronization */ /*! \{ */ diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h index 65f38daea2..f5b3d813da 100644 --- a/src/gromacs/nbnxm/gpu_common.h +++ b/src/gromacs/nbnxm/gpu_common.h @@ -399,7 +399,7 @@ bool gpu_try_finish_task(NbnxmGpu* nb, // GpuTaskCompletion::Wait mode the timing is expected to be done in the caller. wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L); - if (!haveStreamTasksCompleted(nb->stream[iLocality])) + if (!haveStreamTasksCompleted(nb->deviceStreams[iLocality])) { wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); @@ -412,7 +412,7 @@ bool gpu_try_finish_task(NbnxmGpu* nb, } else if (haveResultToWaitFor) { - gpuStreamSynchronize(nb->stream[iLocality]); + nb->deviceStreams[iLocality].synchronize(); } // TODO: this needs to be moved later because conditional wait could break timing diff --git a/src/gromacs/nbnxm/gpu_data_mgmt.h b/src/gromacs/nbnxm/gpu_data_mgmt.h index 822852786c..e242771862 100644 --- a/src/gromacs/nbnxm/gpu_data_mgmt.h +++ b/src/gromacs/nbnxm/gpu_data_mgmt.h @@ -51,6 +51,7 @@ #include "gromacs/mdtypes/locality.h" class DeviceContext; +class DeviceStream; struct NbnxmGpu; struct gmx_gpu_info_t; @@ -126,7 +127,7 @@ gmx_bool gpu_is_kernel_ewald_analytical(const NbnxmGpu gmx_unused* nb) GPU_FUNC_ * Note: CUDA only.
*/ CUDA_FUNC_QUALIFIER -void* gpu_get_command_stream(NbnxmGpu gmx_unused* nb, gmx::InteractionLocality gmx_unused iloc) +const DeviceStream* gpu_get_command_stream(NbnxmGpu gmx_unused* nb, gmx::InteractionLocality gmx_unused iloc) CUDA_FUNC_TERM_WITH_RETURN(nullptr); /** Returns an opaque pointer to the GPU coordinate+charge array diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index 013bd093a0..ba0c2ee939 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -484,10 +484,10 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom /* local/nonlocal offset and length used for xq and f */ int adat_begin, adat_len; - cl_atomdata_t* adat = nb->atdat; - cl_plist_t* plist = nb->plist[iloc]; - cl_timers_t* t = nb->timers; - cl_command_queue stream = nb->stream[iloc]; + cl_atomdata_t* adat = nb->atdat; + cl_plist_t* plist = nb->plist[iloc]; + cl_timers_t* t = nb->timers; + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; bool bDoTime = nb->bDoTime; @@ -522,17 +522,17 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom /* beginning of timed HtoD section */ if (bDoTime) { - t->xf[atomLocality].nb_h2d.openTimingRegion(stream); + t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream); } /* HtoD x, q */ - ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4, - adat_begin * sizeof(float) * 4, adat_len * sizeof(float) * 4, stream, + ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin * sizeof(float) * 4, + adat_len * sizeof(float) * 4, deviceStream.stream(), bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr); if (bDoTime) { - t->xf[atomLocality].nb_h2d.closeTimingRegion(stream); + t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream); } /* When we get here all misc operations issued in the local stream as well as @@ -543,7 +543,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom if (iloc == InteractionLocality::Local) { cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList( - stream, 0, nullptr, &(nb->misc_ops_and_local_H2D_done)); + deviceStream.stream(), 0, nullptr, &(nb->misc_ops_and_local_H2D_done)); GMX_ASSERT(cl_error == CL_SUCCESS, ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str()); @@ -551,13 +551,13 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom * in the local stream in order to be able to sync with the above event * from the non-local stream.
*/ - cl_error = clFlush(stream); + cl_error = clFlush(deviceStream.stream()); GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str()); } else { - sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done)); + sync_ocl_event(deviceStream.stream(), &(nb->misc_ops_and_local_H2D_done)); } } } @@ -583,11 +583,11 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom */ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc) { - cl_atomdata_t* adat = nb->atdat; - cl_nbparam_t* nbp = nb->nbparam; - cl_plist_t* plist = nb->plist[iloc]; - cl_timers_t* t = nb->timers; - cl_command_queue stream = nb->stream[iloc]; + cl_atomdata_t* adat = nb->atdat; + cl_nbparam_t* nbp = nb->nbparam; + cl_plist_t* plist = nb->plist[iloc]; + cl_timers_t* t = nb->timers; + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; bool bDoTime = nb->bDoTime; @@ -628,14 +628,14 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb /* beginning of timed nonbonded calculation section */ if (bDoTime) { - t->interaction[iloc].nb_k.openTimingRegion(stream); + t->interaction[iloc].nb_k.openTimingRegion(deviceStream); } /* kernel launch config */ KernelLaunchConfig config; config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam); - config.stream = stream; + config.stream = deviceStream.stream(); config.blockSize[0] = c_clSize; config.blockSize[1] = c_clSize; config.gridSize[0] = plist->nsci; @@ -686,7 +686,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb if (bDoTime) { - t->interaction[iloc].nb_k.closeTimingRegion(stream); + t->interaction[iloc].nb_k.closeTimingRegion(deviceStream); } } @@ -722,12 +722,12 @@ static inline int calc_shmem_required_prune(const int num_threads_z) */ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts) { - cl_atomdata_t* adat = nb->atdat; - cl_nbparam_t* nbp = nb->nbparam; - cl_plist_t* plist = nb->plist[iloc]; - cl_timers_t* t = nb->timers; - cl_command_queue stream = nb->stream[iloc]; - bool bDoTime = nb->bDoTime; + cl_atomdata_t* adat = nb->atdat; + cl_nbparam_t* nbp = nb->nbparam; + cl_plist_t* plist = nb->plist[iloc]; + cl_timers_t* t = nb->timers; + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; + bool bDoTime = nb->bDoTime; if (plist->haveFreshList) { @@ -781,7 +781,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c /* beginning of timed prune calculation section */ if (bDoTime) { - timer->openTimingRegion(stream); + timer->openTimingRegion(deviceStream); } /* Kernel launch config: @@ -795,7 +795,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c /* kernel launch config */ KernelLaunchConfig config; config.sharedMemorySize = calc_shmem_required_prune(num_threads_z); - config.stream = stream; + config.stream = deviceStream.stream(); config.blockSize[0] = c_clSize; config.blockSize[1] = c_clSize; config.blockSize[2] = num_threads_z; @@ -840,7 +840,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c if (bDoTime) { - timer->closeTimingRegion(stream); + timer->closeTimingRegion(deviceStream); } } @@ -861,10 +861,10 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* determine interaction locality from atom locality */ const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc); - cl_atomdata_t* adat = nb->atdat; - 
cl_timers_t* t = nb->timers; - bool bDoTime = nb->bDoTime; - cl_command_queue stream = nb->stream[iloc]; + cl_atomdata_t* adat = nb->atdat; + cl_timers_t* t = nb->timers; + bool bDoTime = nb->bDoTime; + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; /* don't launch non-local copy-back if there was no non-local work to do */ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc)) @@ -886,24 +886,24 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* beginning of timed D2H section */ if (bDoTime) { - t->xf[aloc].nb_d2h.openTimingRegion(stream); + t->xf[aloc].nb_d2h.openTimingRegion(deviceStream); } /* With DD the local D2H transfer can only start after the non-local has been launched. */ if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive) { - sync_ocl_event(stream, &(nb->nonlocal_done)); + sync_ocl_event(deviceStream.stream(), &(nb->nonlocal_done)); } /* DtoH f */ ocl_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * DIM, adat->f, adat_begin * DIM * sizeof(nbatom->out[0].f[0]), - adat_len * DIM * sizeof(nbatom->out[0].f[0]), stream, + adat_len * DIM * sizeof(nbatom->out[0].f[0]), deviceStream.stream(), bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); /* kick off work */ - cl_error = clFlush(stream); + cl_error = clFlush(deviceStream.stream()); GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str()); /* After the non-local D2H is launched the nonlocal_done event can be @@ -912,7 +912,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, data back first. */ if (iloc == InteractionLocality::NonLocal) { - cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->nonlocal_done)); + cl_error = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &(nb->nonlocal_done)); GMX_ASSERT(cl_error == CL_SUCCESS, ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str()); nb->bNonLocalStreamActive = CL_TRUE; @@ -924,24 +924,25 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* DtoH fshift when virial is needed */ if (stepWork.computeVirial) { - ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0, SHIFTS * sizeof(nb->nbst.fshift[0]), - stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); + ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0, + SHIFTS * sizeof(nb->nbst.fshift[0]), deviceStream.stream(), + bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); } /* DtoH energies */ if (stepWork.computeEnergy) { - ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, sizeof(float), stream, + ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, sizeof(float), deviceStream.stream(), bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); - ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0, sizeof(float), stream, + ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0, sizeof(float), deviceStream.stream(), bDoTime ? 
t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); } } if (bDoTime) { - t->xf[aloc].nb_d2h.closeTimingRegion(stream); + t->xf[aloc].nb_d2h.closeTimingRegion(deviceStream); } } diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index eb1234d512..fa37263a5b 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -487,7 +487,7 @@ static void nbnxn_ocl_clear_e_fshift(NbnxmGpu* nb) cl_int cl_error; cl_atomdata_t* adat = nb->atdat; - cl_command_queue ls = nb->stream[InteractionLocality::Local]; + cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local].stream(); size_t local_work_size[3] = { 1, 1, 1 }; size_t global_work_size[3] = { 1, 1, 1 }; @@ -606,10 +606,12 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, queue_properties = 0; } - /* local/non-local GPU streams */ - nb->stream[InteractionLocality::Local] = + cl_command_queue localStream = clCreateCommandQueue(nb->dev_rundata->deviceContext_.context(), nb->deviceInfo->oclDeviceId, queue_properties, &cl_error); + /* local/non-local GPU streams */ + nb->deviceStreams[InteractionLocality::Local].setStream(localStream); + if (CL_SUCCESS != cl_error) { gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d", rank, @@ -620,9 +622,11 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, { init_plist(nb->plist[InteractionLocality::NonLocal]); - nb->stream[InteractionLocality::NonLocal] = + cl_command_queue nonLocalStream = clCreateCommandQueue(nb->dev_rundata->deviceContext_.context(), nb->deviceInfo->oclDeviceId, queue_properties, &cl_error); + nb->deviceStreams[InteractionLocality::NonLocal].setStream(nonLocalStream); + if (CL_SUCCESS != cl_error) { gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d", @@ -675,7 +679,7 @@ static void nbnxn_ocl_clear_f(NbnxmGpu* nb, int natoms_clear) cl_int gmx_used_in_debug cl_error; cl_atomdata_t* atomData = nb->atdat; - cl_command_queue ls = nb->stream[InteractionLocality::Local]; + cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local].stream(); cl_float value = 0.0F; cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float), 0, @@ -697,7 +701,7 @@ void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) /* kick off buffer clearing kernel to ensure concurrency with constraints/update */ cl_int gmx_unused cl_error; - cl_error = clFlush(nb->stream[InteractionLocality::Local]); + cl_error = clFlush(nb->deviceStreams[InteractionLocality::Local].stream()); GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str()); } @@ -708,9 +712,9 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte // Timing accumulation should happen only if there was work to do // because getLastRangeTime() gets skipped with empty lists later // which leads to the counter not being reset. 
- bool bDoTime = (nb->bDoTime && !h_plist->sci.empty()); - cl_command_queue stream = nb->stream[iloc]; - cl_plist_t* d_plist = nb->plist[iloc]; + bool bDoTime = (nb->bDoTime && !h_plist->sci.empty()); + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; + cl_plist_t* d_plist = nb->plist[iloc]; if (d_plist->na_c < 0) { @@ -730,7 +734,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte if (bDoTime) { - iTimers.pl_h2d.openTimingRegion(stream); + iTimers.pl_h2d.openTimingRegion(deviceStream); iTimers.didPairlistH2D = true; } @@ -739,12 +743,12 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext); - copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream, + copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), deviceStream, GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext); - copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream, + copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), deviceStream, GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit, @@ -752,12 +756,12 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext); - copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream, + copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), deviceStream, GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); if (bDoTime) { - iTimers.pl_h2d.closeTimingRegion(stream); + iTimers.pl_h2d.closeTimingRegion(deviceStream); } /* need to prune the pair list during the next step */ @@ -768,7 +772,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) { cl_atomdata_t* adat = nb->atdat; - cl_command_queue ls = nb->stream[InteractionLocality::Local]; + cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local].stream(); /* only if we have a dynamic box */ if (nbatom->bDynamicBox || !adat->bShiftVecUploaded) @@ -782,13 +786,13 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) //! 
This function is documented in the header file void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) { - cl_int cl_error; - int nalloc, natoms; - bool realloced; - bool bDoTime = nb->bDoTime; - cl_timers_t* timers = nb->timers; - cl_atomdata_t* d_atdat = nb->atdat; - cl_command_queue ls = nb->stream[InteractionLocality::Local]; + cl_int cl_error; + int nalloc, natoms; + bool realloced; + bool bDoTime = nb->bDoTime; + cl_timers_t* timers = nb->timers; + cl_atomdata_t* d_atdat = nb->atdat; + const DeviceStream& deviceStream = nb->deviceStreams[InteractionLocality::Local]; natoms = nbat->numAtoms(); realloced = false; @@ -796,7 +800,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) if (bDoTime) { /* time async copy */ - timers->atdat.openTimingRegion(ls); + timers->atdat.openTimingRegion(deviceStream); } /* need to reallocate if we have to copy more atoms than the amount of space @@ -859,21 +863,21 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) if (useLjCombRule(nb->nbparam->vdwtype)) { ocl_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(), 0, natoms * sizeof(cl_float2), - ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr); + deviceStream.stream(), bDoTime ? timers->atdat.fetchNextEvent() : nullptr); } else { ocl_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(), 0, natoms * sizeof(int), - ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr); + deviceStream.stream(), bDoTime ? timers->atdat.fetchNextEvent() : nullptr); } if (bDoTime) { - timers->atdat.closeTimingRegion(ls); + timers->atdat.closeTimingRegion(deviceStream); } /* kick off the tasks enqueued above to ensure concurrency with the search */ - cl_error = clFlush(ls); + cl_error = clFlush(deviceStream.stream()); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str()); } @@ -996,14 +1000,6 @@ void gpu_free(NbnxmGpu* nb) pfree(nb->nbst.fshift); nb->nbst.fshift = nullptr; - /* Free command queues */ - clReleaseCommandQueue(nb->stream[InteractionLocality::Local]); - nb->stream[InteractionLocality::Local] = nullptr; - if (nb->bUseTwoStreams) - { - clReleaseCommandQueue(nb->stream[InteractionLocality::NonLocal]); - nb->stream[InteractionLocality::NonLocal] = nullptr; - } /* Free other events */ if (nb->nonlocal_done) { diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h index 6e3a410161..a3583761fa 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h @@ -361,7 +361,7 @@ struct NbnxmGpu nb_staging_t nbst; //! local and non-local GPU queues - gmx::EnumerationArray stream; + gmx::EnumerationArray deviceStreams; /*! \brief Events used for synchronization */ /*! \{ */ -- 2.22.0
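Editor's note: the DeviceStream class itself is introduced in src/gromacs/gpu_utils/device_stream.h (with CUDA and OpenCL implementations in device_stream.cu and device_stream_ocl.cpp), which this excerpt does not show. The call sites above pin down the surface that matters for review: default construction (the EnumerationArray members), setStream() to adopt a raw handle, stream() to hand the raw cudaStream_t or cl_command_queue back to API calls and KernelLaunchConfig, and synchronize() to block the host. Below is a minimal sketch of that interface, CUDA flavor only; everything beyond those members, including the releasing destructor suggested by the deleted cudaStreamDestroy/clReleaseCommandQueue calls in the gpu_free hunks, is an assumption inferred from the diff, not a quote of the real header.

    #include <cuda_runtime.h>

    class DeviceStream
    {
    public:
        //! Default-constructible, so it can live inside EnumerationArray members.
        DeviceStream() = default;
        //! Non-copyable sketch: the object owns the handle once setStream() has run.
        DeviceStream(const DeviceStream&) = delete;
        DeviceStream& operator=(const DeviceStream&) = delete;
        //! Adopt a raw stream created elsewhere (see the gpu_init hunks above).
        void setStream(cudaStream_t stream) { stream_ = stream; }
        //! Expose the raw handle for cudaMemcpyAsync, KernelLaunchConfig, etc.
        cudaStream_t stream() const { return stream_; }
        //! Block the host until all work enqueued in the stream has completed.
        void synchronize() const { cudaStreamSynchronize(stream_); }
        //! Assumed: releases the handle, since gpu_free no longer destroys streams.
        ~DeviceStream()
        {
            if (stream_ != nullptr)
            {
                cudaStreamDestroy(stream_);
            }
        }

    private:
        cudaStream_t stream_ = nullptr;
    };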
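The caller-side convention is uniform across md.cpp, runner.cpp and the Nbnxm/PME getters: functions that hand a stream out still return const DeviceStream* (nullptr remains a legal "no such stream" value), while everything that actually uses a stream takes const DeviceStream&, so each call site asserts once and then dereferences. A hedged recap of that pattern follows; launchSomething() is a hypothetical consumer, the other names are taken from the hunks above.

    // Hypothetical consumer taking the stream by const reference.
    void launchSomething(const DeviceStream& deviceStream);

    void caller(NbnxmGpu* nb)
    {
        // Getter keeps pointer semantics: nullptr means the stream does not exist.
        const DeviceStream* localStream =
                Nbnxm::gpu_get_command_stream(nb, gmx::InteractionLocality::Local);
        GMX_RELEASE_ASSERT(localStream != nullptr,
                           "Local non-bonded stream can't be nullptr here.");
        // Consumers take const DeviceStream&, so the null check happens exactly once.
        launchSomething(*localStream);
    }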
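One wrinkle worth noting in StatePropagatorDataGpu::Impl: every stream member is now a non-owning const DeviceStream*, except the update stream on the CUDA path, which the Impl creates itself. The patch handles this with the owning value member updateStreamOwn_ next to the pointer updateStream_ that the rest of the class uses (explicitly marked Temporary in the comment above). Restated outside the diff, with the error check ahead of the handle adoption as in the corrected hunk:

    cudaStream_t stream;
    cudaError_t  stat = cudaStreamCreate(&stream);
    CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu");
    updateStreamOwn_.setStream(stream); // the owning DeviceStream adopts the raw handle
    updateStream_ = &updateStreamOwn_;  // all other code keeps using the pointer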