From db2c0d2ea17687201da8a242d6c79c9cd7e0f7a9 Mon Sep 17 00:00:00 2001
From: Artem Zhmurov <zhmurov@gmail.com>
Date: Mon, 24 Feb 2020 10:22:40 +0100
Subject: [PATCH] Make use of the DeviceStreamManager

Use the DeviceStreamManager throughout the code. The manager is
owned by the runner and created when GPU is active. The consumers
get the context and streams if needed.

TODOs:
1. Introduce builders and move the decision on whether a stream should
   be created into them. The builders should take the manager and pass
   the context and the stream to the consumer. Builders should have
   the option to create a stream.
2. Makefile in ewald tests uses old infrastructure. Also, the device
   context management should be lifted from there and utilized in
   all the tests that can run on GPU hardware.

Refs #3316
Refs #3311

Change-Id: I0d08adbe1dee19c1890e55f0e0cf79cea97d39bd
---
 src/gromacs/domdec/domdec.cpp                 |  20 ++-
 src/gromacs/domdec/domdec.h                   |  21 ++-
 src/gromacs/ewald/pme.cpp                     |  36 ++---
 src/gromacs/ewald/pme.h                       |  37 +++---
 src/gromacs/ewald/pme_gpu.cpp                 |   9 --
 src/gromacs/ewald/pme_gpu_internal.cpp        |  80 ++++-------
 src/gromacs/ewald/pme_gpu_internal.h          |  34 ++---
 src/gromacs/ewald/pme_gpu_types_host.h        |   3 -
 src/gromacs/ewald/pme_gpu_types_host_impl.h   |  11 +-
 src/gromacs/ewald/pme_only.cpp                |  39 +++---
 src/gromacs/ewald/pme_only.h                  |  21 +--
 src/gromacs/ewald/pme_pp_comm_gpu.h           |   6 +-
 src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp    |   5 +-
 src/gromacs/ewald/pme_pp_comm_gpu_impl.cu     |  18 ++-
 src/gromacs/ewald/pme_pp_comm_gpu_impl.h      |  10 +-
 src/gromacs/ewald/tests/CMakeLists.txt        |  11 +-
 src/gromacs/ewald/tests/pmegathertest.cpp     |  14 +-
 src/gromacs/ewald/tests/pmesolvetest.cpp      |  12 +-
 .../ewald/tests/pmesplinespreadtest.cpp       |  23 ++--
 src/gromacs/ewald/tests/pmetestcommon.cpp     |  50 +++----
 src/gromacs/ewald/tests/pmetestcommon.h       |  37 +++---
 .../ewald/tests/testhardwarecontext.cpp       | 124 ++++++++++++++++++
 src/gromacs/ewald/tests/testhardwarecontext.h | 112 ++++++++++++++++
 .../ewald/tests/testhardwarecontexts.cpp      |  19 +--
 .../ewald/tests/testhardwarecontexts.h        |  72 +---------
 src/gromacs/gpu_utils/device_stream.cpp       |   2 +-
 .../gpu_utils/device_stream_manager.cpp       |  18 +++
 src/gromacs/gpu_utils/device_stream_manager.h |   6 +
 .../gpu_utils/tests/device_stream_manager.cpp |   1 -
 src/gromacs/listed_forces/gpubonded_impl.cu   |   6 +-
 .../mdlib/update_constrain_gpu_impl.cu        |   5 +-
 src/gromacs/mdrun/md.cpp                      |  39 +++---
 src/gromacs/mdrun/runner.cpp                  | 124 +++++++++++-------
 src/gromacs/mdtypes/forcerec.h                |   3 +
 .../mdtypes/state_propagator_data_gpu.h       |  35 ++---
 .../state_propagator_data_gpu_impl.cpp        |   5 +-
 .../mdtypes/state_propagator_data_gpu_impl.h  |  37 +-----
 .../state_propagator_data_gpu_impl_gpu.cpp    |  89 ++++---------
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu          |  14 +-
 .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu        |  49 ++++---
 src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h     |   2 +-
 src/gromacs/nbnxm/gpu_common.h                |   4 +-
 src/gromacs/nbnxm/gpu_data_mgmt.h             |  12 +-
 src/gromacs/nbnxm/nbnxm.h                     |  22 ++--
 src/gromacs/nbnxm/nbnxm_setup.cpp             |  40 +++---
 src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp        |   8 +-
 .../nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp      |  39 +++---
 src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h    |   2 +-
 48 files changed, 769 insertions(+), 617 deletions(-)
 create mode 100644 src/gromacs/ewald/tests/testhardwarecontext.cpp
 create mode 100644 src/gromacs/ewald/tests/testhardwarecontext.h

diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp
index ebcc92bf2e..aed2af87cb 100644
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -64,6 +64,7 @@
 #include "gromacs/domdec/partition.h"
 #include "gromacs/gmxlib/network.h"
 #include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/hardware/hw_info.h"
 #include "gromacs/listed_forces/manage_threading.h"
@@ -3200,13 +3201,16 @@ gmx_bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const g
     return bCutoffAllowed;
 }
 
-void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
-                              const t_commrec&     cr,
-                              const DeviceContext& deviceContext,
-                              const DeviceStream&  streamLocal,
-                              const DeviceStream&  streamNonLocal)
+void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
+                              const t_commrec&                cr,
+                              const gmx::DeviceStreamManager& deviceStreamManager)
 {
-
+    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
+                       "Local non-bonded stream should be valid when using "
+                       "GPU halo exchange.");
+    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
+                       "Non-local non-bonded stream should be valid when using "
+                       "GPU halo exchange.");
     int gpuHaloExchangeSize = 0;
     int pulseStart          = 0;
     if (cr.dd->gpuHaloExchange.empty())
@@ -3228,7 +3232,9 @@ void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
         for (int pulse = pulseStart; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
         {
             cr.dd->gpuHaloExchange.push_back(std::make_unique<gmx::GpuHaloExchange>(
-                    cr.dd, cr.mpi_comm_mysim, deviceContext, streamLocal, streamNonLocal, pulse));
+                    cr.dd, cr.mpi_comm_mysim, deviceStreamManager.context(),
+                    deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
+                    deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal), pulse));
         }
     }
 }
diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h
index da617607a6..82cefa18c1 100644
--- a/src/gromacs/domdec/domdec.h
+++ b/src/gromacs/domdec/domdec.h
@@ -84,13 +84,13 @@ struct t_mdatoms;
 struct t_nrnb;
 struct gmx_wallcycle;
 enum class PbcType : int;
-class DeviceStream;
 class t_state;
 class DeviceContext;
 class GpuEventSynchronizer;
 
 namespace gmx
 {
+class DeviceStreamManager;
 class ForceWithShiftForces;
 class MDLogger;
 class RangePartitioning;
@@ -313,18 +313,15 @@ void dd_bonded_cg_distance(const gmx::MDLogger& mdlog,
                            real*                r_2b,
                            real*                r_mb);
 
-/*! \brief Construct the GPU halo exchange object(s)
- * \param[in] mdlog          The logger object
- * \param[in] cr             The commrec object
- * \param[in] deviceContext  GPU device context
- * \param[in] streamLocal    The local GPU stream
- * \param[in] streamNonLocal The non-local GPU stream
+/*! \brief Construct the GPU halo exchange object(s).
+ *
+ * \param[in] mdlog               The logger object.
+ * \param[in] cr                  The commrec object.
+ * \param[in] deviceStreamManager Manager of the GPU context and streams.
  */
-void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
-                              const t_commrec&     cr,
-                              const DeviceContext& deviceContext,
-                              const DeviceStream&  streamLocal,
-                              const DeviceStream&  streamNonLocal);
+void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
+                              const t_commrec&                cr,
+                              const gmx::DeviceStreamManager& deviceStreamManager);
 
 /*! \brief
  * (Re-) Initialization for GPU halo exchange
diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp
index 8d8bb673c2..120887e8bb 100644
--- a/src/gromacs/ewald/pme.cpp
+++ b/src/gromacs/ewald/pme.cpp
@@ -560,19 +560,20 @@ static int div_round_up(int enumerator, int denominator)
     return (enumerator + denominator - 1) / denominator;
 }
 
-gmx_pme_t* gmx_pme_init(const t_commrec*         cr,
-                        const NumPmeDomains&     numPmeDomains,
-                        const t_inputrec*        ir,
-                        gmx_bool                 bFreeEnergy_q,
-                        gmx_bool                 bFreeEnergy_lj,
-                        gmx_bool                 bReproducible,
-                        real                     ewaldcoeff_q,
-                        real                     ewaldcoeff_lj,
-                        int                      nthread,
-                        PmeRunMode               runMode,
-                        PmeGpu*                  pmeGpu,
-                        const DeviceInformation* deviceInfo,
-                        const PmeGpuProgram*     pmeGpuProgram,
+gmx_pme_t* gmx_pme_init(const t_commrec*     cr,
+                        const NumPmeDomains& numPmeDomains,
+                        const t_inputrec*    ir,
+                        gmx_bool             bFreeEnergy_q,
+                        gmx_bool             bFreeEnergy_lj,
+                        gmx_bool             bReproducible,
+                        real                 ewaldcoeff_q,
+                        real                 ewaldcoeff_lj,
+                        int                  nthread,
+                        PmeRunMode           runMode,
+                        PmeGpu*              pmeGpu,
+                        const DeviceContext* deviceContext,
+                        const DeviceStream*  deviceStream,
+                        const PmeGpuProgram* pmeGpuProgram,
                         const gmx::MDLogger& /*mdlog*/)
 {
     int  use_threads, sum_use_threads, i;
@@ -883,8 +884,13 @@ gmx_pme_t* gmx_pme_init(const t_commrec*         cr,
         {
             GMX_THROW(gmx::NotImplementedError(errorString));
         }
+        pme_gpu_reinit(pme.get(), deviceContext, deviceStream, pmeGpuProgram);
     }
-    pme_gpu_reinit(pme.get(), deviceInfo, pmeGpuProgram);
+    else
+    {
+        GMX_ASSERT(pme->gpu == nullptr, "Should not have PME GPU object when PME is on a CPU.");
+    }
+
 
     pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
 
@@ -925,7 +931,7 @@ void gmx_pme_reinit(struct gmx_pme_t** pmedata,
         NumPmeDomains numPmeDomains = { pme_src->nnodes_major, pme_src->nnodes_minor };
         *pmedata = gmx_pme_init(cr, numPmeDomains, &irc, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE,
                                 ewaldcoeff_q, ewaldcoeff_lj, pme_src->nthread, pme_src->runMode,
-                                pme_src->gpu, nullptr, nullptr, dummyLogger);
+                                pme_src->gpu, nullptr, nullptr, nullptr, dummyLogger);
         /* When running PME on the CPU not using domain decomposition,
          * the atom data is allocated once only in gmx_pme_(re)init().
          */
diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h
index bfc79b88e9..59aa7c604d 100644
--- a/src/gromacs/ewald/pme.h
+++ b/src/gromacs/ewald/pme.h
@@ -64,7 +64,6 @@ struct t_inputrec;
 struct t_nrnb;
 struct PmeGpu;
 struct gmx_wallclock_gpu_pme_t;
-struct DeviceInformation;
 struct gmx_enerdata_t;
 struct gmx_mtop_t;
 struct gmx_pme_t;
@@ -137,20 +136,21 @@ bool gmx_pme_check_restrictions(int  pme_order,
  * related things whose lifetime can/should exceed that of a task (or
  * perhaps task manager). See Redmine #2522.
  */
-gmx_pme_t* gmx_pme_init(const t_commrec*         cr,
-                        const NumPmeDomains&     numPmeDomains,
-                        const t_inputrec*        ir,
-                        gmx_bool                 bFreeEnergy_q,
-                        gmx_bool                 bFreeEnergy_lj,
-                        gmx_bool                 bReproducible,
-                        real                     ewaldcoeff_q,
-                        real                     ewaldcoeff_lj,
-                        int                      nthread,
-                        PmeRunMode               runMode,
-                        PmeGpu*                  pmeGpu,
-                        const DeviceInformation* deviceInfo,
-                        const PmeGpuProgram*     pmeGpuProgram,
-                        const gmx::MDLogger&     mdlog);
+gmx_pme_t* gmx_pme_init(const t_commrec*     cr,
+                        const NumPmeDomains& numPmeDomains,
+                        const t_inputrec*    ir,
+                        gmx_bool             bFreeEnergy_q,
+                        gmx_bool             bFreeEnergy_lj,
+                        gmx_bool             bReproducible,
+                        real                 ewaldcoeff_q,
+                        real                 ewaldcoeff_lj,
+                        int                  nthread,
+                        PmeRunMode           runMode,
+                        PmeGpu*              pmeGpu,
+                        const DeviceContext* deviceContext,
+                        const DeviceStream*  deviceStream,
+                        const PmeGpuProgram* pmeGpuProgram,
+                        const gmx::MDLogger& mdlog);
 
 /*! \brief As gmx_pme_init, but takes most settings, except the grid/Ewald coefficients, from
  * pme_src. This is only called when the PME cut-off/grid size changes.
@@ -433,13 +433,6 @@ GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t*        GPU_FUNC_AR
 GPU_FUNC_QUALIFIER void* pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
         GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
-/*! \brief Returns the pointer to the GPU stream.
- *  \param[in] pme            The PME data structure.
- *  \returns                  Pointer to GPU stream object.
- */
-GPU_FUNC_QUALIFIER const DeviceStream* pme_gpu_get_device_stream(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
-        GPU_FUNC_TERM_WITH_RETURN(nullptr);
-
 /*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion
  * \param[in] pme            The PME data structure.
  * \returns                  Pointer to sychronizer
diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp
index 91596e77d4..a5f54f004d 100644
--- a/src/gromacs/ewald/pme_gpu.cpp
+++ b/src/gromacs/ewald/pme_gpu.cpp
@@ -433,15 +433,6 @@ void pme_gpu_set_device_x(const gmx_pme_t* pme, DeviceBuffer<gmx::RVec> d_x)
     pme_gpu_set_kernelparam_coordinates(pme->gpu, d_x);
 }
 
-const DeviceStream* pme_gpu_get_device_stream(const gmx_pme_t* pme)
-{
-    if (!pme || !pme_gpu_active(pme))
-    {
-        return nullptr;
-    }
-    return pme_gpu_get_stream(pme->gpu);
-}
-
 GpuEventSynchronizer* pme_gpu_get_f_ready_synchronizer(const gmx_pme_t* pme)
 {
     if (!pme || !pme_gpu_active(pme))
diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp
index c7a6df563a..62f7cc5c7e 100644
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -56,6 +56,8 @@
 #include <string>
 
 #include "gromacs/ewald/ewald_utils.h"
+#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/math/invertmatrix.h"
 #include "gromacs/math/units.h"
@@ -462,16 +464,22 @@ void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu)
     pmeGpu->archSpecific->syncSpreadGridD2H.waitForEvent();
 }
 
-void pme_gpu_init_internal(PmeGpu* pmeGpu)
+/*! \brief Internal GPU initialization for PME.
+ *
+ * \param[in]  pmeGpu         GPU PME data.
+ * \param[in]  deviceContext  GPU context.
+ * \param[in]  deviceStream   GPU stream.
+ */
+static void pme_gpu_init_internal(PmeGpu* pmeGpu, const DeviceContext& deviceContext, const DeviceStream& deviceStream)
 {
 #if GMX_GPU == GMX_GPU_CUDA
     // Prepare to use the device that this PME task was assigned earlier.
     // Other entities, such as CUDA timing events, are known to implicitly use the device context.
-    CU_RET_ERR(cudaSetDevice(pmeGpu->deviceInfo->id), "Switching to PME CUDA device");
+    CU_RET_ERR(cudaSetDevice(deviceContext.deviceInfo().id), "Switching to PME CUDA device");
 #endif
 
     /* Allocate the target-specific structures */
-    pmeGpu->archSpecific.reset(new PmeGpuSpecific(pmeGpu->programHandle_->impl_->deviceContext_));
+    pmeGpu->archSpecific.reset(new PmeGpuSpecific(deviceContext, deviceStream));
     pmeGpu->kernelParams.reset(new PmeGpuKernelParams());
 
     pmeGpu->archSpecific->performOutOfPlaceFFT = true;
@@ -480,33 +488,12 @@ void pme_gpu_init_internal(PmeGpu* pmeGpu)
      * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
      */
 
-    // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?) and reuse in NB
-    if (GMX_GPU == GMX_GPU_CUDA)
-    {
-        /* WARNING: CUDA timings are incorrect with multiple streams.
-         *          This is the main reason why they are disabled by default.
-         */
-        // TODO: Consider turning on by default when we can detect nr of streams.
-        pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
-    }
-    else if (GMX_GPU == GMX_GPU_OPENCL)
-    {
-        pmeGpu->archSpecific->useTiming = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
-    }
-
 #if GMX_GPU == GMX_GPU_CUDA
-    pmeGpu->maxGridWidthX = pmeGpu->deviceInfo->prop.maxGridSize[0];
+    pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0];
 #elif GMX_GPU == GMX_GPU_OPENCL
     pmeGpu->maxGridWidthX = INT32_MAX / 2;
     // TODO: is there no really global work size limit in OpenCL?
 #endif
-
-    /* Creating a PME GPU stream:
-     * - default high priority with CUDA
-     * - no priorities implemented yet with OpenCL; see #2532
-     */
-    pmeGpu->archSpecific->pmeStream_.init(pmeGpu->archSpecific->deviceContext_,
-                                          DeviceStreamPriority::High, pmeGpu->archSpecific->useTiming);
 }
 
 void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu)
@@ -721,13 +708,15 @@ static void pme_gpu_select_best_performing_pme_spreadgather_kernels(PmeGpu* pmeG
  * TODO: this should become PmeGpu::PmeGpu()
  *
  * \param[in,out] pme            The PME structure.
- * \param[in,out] deviceInfo     The GPU device information structure.
- * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
+ * \param[in]     deviceContext  The GPU context.
+ * \param[in]     deviceStream   The GPU stream.
+ * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
  */
-static void pme_gpu_init(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram)
+static void pme_gpu_init(gmx_pme_t*           pme,
+                         const DeviceContext& deviceContext,
+                         const DeviceStream&  deviceStream,
+                         const PmeGpuProgram* pmeGpuProgram)
 {
-    GMX_ASSERT(deviceInfo != nullptr,
-               "Device information can not be nullptr when GPU is used for PME.");
     pme->gpu       = new PmeGpu();
     PmeGpu* pmeGpu = pme->gpu;
     changePinningPolicy(&pmeGpu->staging.h_forces, pme_get_pinning_policy());
@@ -743,13 +732,12 @@ static void pme_gpu_init(gmx_pme_t* pme, const DeviceInformation* deviceInfo, co
 
     pme_gpu_set_testing(pmeGpu, false);
 
-    pmeGpu->deviceInfo = deviceInfo;
     GMX_ASSERT(pmeGpuProgram != nullptr, "GPU kernels must be already compiled");
     pmeGpu->programHandle_ = pmeGpuProgram;
 
     pmeGpu->initializedClfftLibrary_ = std::make_unique<gmx::ClfftInitializer>();
 
-    pme_gpu_init_internal(pmeGpu);
+    pme_gpu_init_internal(pmeGpu, deviceContext, deviceStream);
     pme_gpu_alloc_energy_virial(pmeGpu);
 
     pme_gpu_copy_common_data_from(pme);
@@ -773,19 +761,21 @@ void pme_gpu_get_real_grid_sizes(const PmeGpu* pmeGpu, gmx::IVec* gridSize, gmx:
     }
 }
 
-void pme_gpu_reinit(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram)
+void pme_gpu_reinit(gmx_pme_t*           pme,
+                    const DeviceContext* deviceContext,
+                    const DeviceStream*  deviceStream,
+                    const PmeGpuProgram* pmeGpuProgram)
 {
     GMX_ASSERT(pme != nullptr, "Need valid PME object");
-    if (pme->runMode == PmeRunMode::CPU)
-    {
-        GMX_ASSERT(pme->gpu == nullptr, "Should not have PME GPU object");
-        return;
-    }
 
     if (!pme->gpu)
     {
+        GMX_RELEASE_ASSERT(deviceContext != nullptr,
+                           "Device context can not be nullptr when setting up PME on GPU.");
+        GMX_RELEASE_ASSERT(deviceStream != nullptr,
+                           "Device stream can not be nullptr when setting up PME on GPU.");
         /* First-time initialization */
-        pme_gpu_init(pme, deviceInfo, pmeGpuProgram);
+        pme_gpu_init(pme, *deviceContext, *deviceStream, pmeGpuProgram);
     }
     else
     {
@@ -1358,18 +1348,6 @@ void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<gmx:
     pmeGpu->kernelParams->atoms.d_coordinates = d_x;
 }
 
-const DeviceStream* pme_gpu_get_stream(const PmeGpu* pmeGpu)
-{
-    if (pmeGpu)
-    {
-        return &pmeGpu->archSpecific->pmeStream_;
-    }
-    else
-    {
-        return nullptr;
-    }
-}
-
 GpuEventSynchronizer* pme_gpu_get_forces_ready_synchronizer(const PmeGpu* pmeGpu)
 {
     if (pmeGpu && pmeGpu->kernelParams)
diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h
index 3d764fd468..9a15c3bbc9 100644
--- a/src/gromacs/ewald/pme_gpu_internal.h
+++ b/src/gromacs/ewald/pme_gpu_internal.h
@@ -54,8 +54,10 @@
 #include "pme_gpu_types_host.h"
 #include "pme_output.h"
 
-class GpuEventSynchronizer;
+class DeviceContext;
 struct DeviceInformation;
+class DeviceStream;
+class GpuEventSynchronizer;
 struct gmx_hw_info_t;
 struct gmx_gpu_opt_t;
 struct gmx_pme_t; // only used in pme_gpu_reinit
@@ -71,7 +73,7 @@ struct t_complex;
 namespace gmx
 {
 class MDLogger;
-}
+} // namespace gmx
 
 //! Type of spline data
 enum class PmeSplineDataType
@@ -299,14 +301,6 @@ void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu);
  */
 void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu);
 
-/*! \libinternal \brief
- * Does the one-time GPU-framework specific PME initialization.
- * For CUDA, the PME stream is created with the highest priority.
- *
- * \param[in] pmeGpu  The PME GPU structure.
- */
-void pme_gpu_init_internal(PmeGpu* pmeGpu);
-
 /*! \libinternal \brief
  * Initializes the CUDA FFT structures.
  *
@@ -387,13 +381,6 @@ GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu* GPU_FU
 GPU_FUNC_QUALIFIER void* pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
         GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
-/*! \brief Return pointer to GPU stream.
- * \param[in] pmeGpu         The PME GPU structure.
- * \returns                  Pointer to stream object.
- */
-GPU_FUNC_QUALIFIER const DeviceStream* pme_gpu_get_stream(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
-        GPU_FUNC_TERM_WITH_RETURN(nullptr);
-
 /*! \brief Return pointer to the sync object triggered after the PME force calculation completion
  * \param[in] pmeGpu         The PME GPU structure.
  * \returns                  Pointer to sync object
@@ -498,13 +485,16 @@ GPU_FUNC_QUALIFIER void pme_gpu_get_real_grid_sizes(const PmeGpu* GPU_FUNC_ARGUM
 /*! \libinternal \brief
  * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
  *
- * \param[in,out] pme             The PME structure.
- * \param[in]     deviceInfo      The GPU device information structure.
- * \param[in]     pmeGpuProgram   The PME GPU program data
+ * \param[in,out] pme            The PME structure.
+ * \param[in]     deviceContext  The GPU context.
+ * \param[in]     deviceStream   The GPU stream.
+ * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
+ *
  * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
  */
-GPU_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t*               GPU_FUNC_ARGUMENT(pme),
-                                       const DeviceInformation* GPU_FUNC_ARGUMENT(deviceInfo),
+GPU_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t*           GPU_FUNC_ARGUMENT(pme),
+                                       const DeviceContext* GPU_FUNC_ARGUMENT(deviceContext),
+                                       const DeviceStream*  GPU_FUNC_ARGUMENT(deviceStream),
                                        const PmeGpuProgram* GPU_FUNC_ARGUMENT(pmeGpuProgram)) GPU_FUNC_TERM;
 
 /*! \libinternal \brief
diff --git a/src/gromacs/ewald/pme_gpu_types_host.h b/src/gromacs/ewald/pme_gpu_types_host.h
index 481c3a302d..9d7e2f78f9 100644
--- a/src/gromacs/ewald/pme_gpu_types_host.h
+++ b/src/gromacs/ewald/pme_gpu_types_host.h
@@ -162,9 +162,6 @@ struct PmeGpu
      */
     int nAtomsAlloc;
 
-    /*! \brief A pointer to the device used during the execution. */
-    const DeviceInformation* deviceInfo;
-
     /*! \brief Kernel scheduling grid width limit in X - derived from deviceinfo compute capability in CUDA.
      * Declared as very large int to make it useful in computations with type promotion, to avoid overflows.
      * OpenCL seems to not have readily available global work size limit, so we just assign a large arbitrary constant to this instead.
diff --git a/src/gromacs/ewald/pme_gpu_types_host_impl.h b/src/gromacs/ewald/pme_gpu_types_host_impl.h
index a019a7c031..e134d2c0a7 100644
--- a/src/gromacs/ewald/pme_gpu_types_host_impl.h
+++ b/src/gromacs/ewald/pme_gpu_types_host_impl.h
@@ -71,9 +71,14 @@ struct PmeGpuSpecific
 {
     /*! \brief Constructor
      *
-     * \param[in] deviceContext GPU device context.
+     * \param[in] deviceContext  GPU device context.
+     * \param[in] pmeStream      GPU PME stream.
      */
-    PmeGpuSpecific(const DeviceContext& deviceContext) : deviceContext_(deviceContext) {}
+    PmeGpuSpecific(const DeviceContext& deviceContext, const DeviceStream& pmeStream) :
+        deviceContext_(deviceContext),
+        pmeStream_(pmeStream)
+    {
+    }
 
     /*! \brief
      * A handle to the GPU context.
@@ -84,7 +89,7 @@ struct PmeGpuSpecific
     const DeviceContext& deviceContext_;
 
     /*! \brief The GPU stream where everything related to the PME happens. */
-    DeviceStream pmeStream_;
+    const DeviceStream& pmeStream_;
 
     /* Synchronization events */
     /*! \brief Triggered after the PME Force Calculations have been completed */
diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp
index b85629b28a..26d824e3a2 100644
--- a/src/gromacs/ewald/pme_only.cpp
+++ b/src/gromacs/ewald/pme_only.cpp
@@ -82,6 +82,7 @@
 #include "gromacs/fileio/pdbio.h"
 #include "gromacs/gmxlib/network.h"
 #include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
 #include "gromacs/gpu_utils/hostallocator.h"
 #include "gromacs/math/gmxcomplex.h"
 #include "gromacs/math/units.h"
@@ -597,14 +598,14 @@ static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
 #endif
 }
 
-int gmx_pmeonly(struct gmx_pme_t*         pme,
-                const t_commrec*          cr,
-                t_nrnb*                   mynrnb,
-                gmx_wallcycle*            wcycle,
-                gmx_walltime_accounting_t walltime_accounting,
-                t_inputrec*               ir,
-                PmeRunMode                runMode,
-                const DeviceContext*      deviceContext)
+int gmx_pmeonly(struct gmx_pme_t*               pme,
+                const t_commrec*                cr,
+                t_nrnb*                         mynrnb,
+                gmx_wallcycle*                  wcycle,
+                gmx_walltime_accounting_t       walltime_accounting,
+                t_inputrec*                     ir,
+                PmeRunMode                      runMode,
+                const gmx::DeviceStreamManager* deviceStreamManager)
 {
     int     ret;
     int     natoms = 0;
@@ -629,25 +630,27 @@ int gmx_pmeonly(struct gmx_pme_t*         pme,
     const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed);
     if (useGpuForPme)
     {
-        const DeviceStream& deviceStream = *pme_gpu_get_device_stream(pme);
-
+        GMX_RELEASE_ASSERT(
+                deviceStreamManager != nullptr,
+                "Device stream manager can not be nullptr when using GPU in PME-only rank.");
+        GMX_RELEASE_ASSERT(deviceStreamManager->streamIsValid(gmx::DeviceStreamType::Pme),
+                           "Device stream can not be nullptr when using GPU in PME-only rank");
         changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy());
         changePinningPolicy(&pme_pp->x, pme_get_pinning_policy());
         if (c_enableGpuPmePpComms)
         {
             pme_pp->pmeCoordinateReceiverGpu = std::make_unique<gmx::PmeCoordinateReceiverGpu>(
-                    deviceStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
+                    deviceStreamManager->stream(gmx::DeviceStreamType::Pme), pme_pp->mpi_comm_mysim,
+                    pme_pp->ppRanks);
             pme_pp->pmeForceSenderGpu = std::make_unique<gmx::PmeForceSenderGpu>(
-                    deviceStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
+                    deviceStreamManager->stream(gmx::DeviceStreamType::Pme), pme_pp->mpi_comm_mysim,
+                    pme_pp->ppRanks);
         }
-        GMX_RELEASE_ASSERT(
-                deviceContext != nullptr,
-                "Device context can not be nullptr when building GPU propagator data object.");
         // TODO: Special PME-only constructor is used here. There is no mechanism to prevent from using the other constructor here.
         //       This should be made safer.
-        stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(&deviceStream, *deviceContext,
-                                                                 GpuApiCallBehavior::Async,
-                                                                 pme_gpu_get_block_size(pme), wcycle);
+        stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
+                &deviceStreamManager->stream(gmx::DeviceStreamType::Pme), deviceStreamManager->context(),
+                GpuApiCallBehavior::Async, pme_gpu_get_block_size(pme), wcycle);
     }
 
     clear_nrnb(mynrnb);
diff --git a/src/gromacs/ewald/pme_only.h b/src/gromacs/ewald/pme_only.h
index 18edbb9b43..1a71ea195c 100644
--- a/src/gromacs/ewald/pme_only.h
+++ b/src/gromacs/ewald/pme_only.h
@@ -55,17 +55,20 @@ struct t_nrnb;
 struct gmx_pme_t;
 struct gmx_wallcycle;
 
-class DeviceContext;
 enum class PmeRunMode;
+namespace gmx
+{
+class DeviceStreamManager;
+}
 
 /*! \brief Called on the nodes that do PME exclusively */
-int gmx_pmeonly(gmx_pme_t*                pme,
-                const t_commrec*          cr,
-                t_nrnb*                   mynrnb,
-                gmx_wallcycle*            wcycle,
-                gmx_walltime_accounting_t walltime_accounting,
-                t_inputrec*               ir,
-                PmeRunMode                runMode,
-                const DeviceContext*      deviceContext);
+int gmx_pmeonly(gmx_pme_t*                      pme,
+                const t_commrec*                cr,
+                t_nrnb*                         mynrnb,
+                gmx_wallcycle*                  wcycle,
+                gmx_walltime_accounting_t       walltime_accounting,
+                t_inputrec*                     ir,
+                PmeRunMode                      runMode,
+                const gmx::DeviceStreamManager* deviceStreamManager);
 
 #endif
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu.h b/src/gromacs/ewald/pme_pp_comm_gpu.h
index ea750cc17c..97accca871 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu.h
+++ b/src/gromacs/ewald/pme_pp_comm_gpu.h
@@ -46,11 +46,14 @@
 #include "gromacs/utility/gmxmpi.h"
 
 class DeviceContext;
+class DeviceStream;
 class GpuEventSynchronizer;
 
 namespace gmx
 {
 
+class DeviceStreamManager;
+
 /*! \libinternal
 
  * \brief Manages communication related to GPU buffers between this
@@ -63,8 +66,9 @@ public:
      * \param[in] comm            Communicator used for simulation
      * \param[in] pmeRank         Rank of PME task
      * \param[in] deviceContext   GPU context.
+     * \param[in] deviceStream    GPU stream.
      */
-    PmePpCommGpu(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext);
+    PmePpCommGpu(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext, const DeviceStream& deviceStream);
     ~PmePpCommGpu();
 
     /*! \brief Perform steps required when buffer size changes
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
index b8befc5311..b53ce94ada 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
@@ -62,7 +62,10 @@ class PmePpCommGpu::Impl
 };
 
 /*!\brief Constructor stub. */
-PmePpCommGpu::PmePpCommGpu(MPI_Comm /* comm */, int /* pmeRank */, const DeviceContext& /* deviceContext */) :
+PmePpCommGpu::PmePpCommGpu(MPI_Comm /* comm */,
+                           int /* pmeRank */,
+                           const DeviceContext& /* deviceContext */,
+                           const DeviceStream& /* deviceStream */) :
     impl_(nullptr)
 {
     GMX_ASSERT(false,
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
index 0e78978865..2c6f696ddd 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
@@ -49,6 +49,7 @@
 
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/utility/gmxmpi.h"
@@ -56,18 +57,18 @@
 namespace gmx
 {
 
-PmePpCommGpu::Impl::Impl(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext) :
+PmePpCommGpu::Impl::Impl(MPI_Comm             comm,
+                         int                  pmeRank,
+                         const DeviceContext& deviceContext,
+                         const DeviceStream&  deviceStream) :
     deviceContext_(deviceContext),
+    pmePpCommStream_(deviceStream),
     comm_(comm),
     pmeRank_(pmeRank)
 {
     GMX_RELEASE_ASSERT(
             GMX_THREAD_MPI,
             "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
-
-    // In CUDA we only need priority to create stream.
-    // (note that this will be moved from here in the follow-up patch)
-    pmePpCommStream_.init(deviceContext, DeviceStreamPriority::Normal, false);
 }
 
 PmePpCommGpu::Impl::~Impl() = default;
@@ -158,8 +159,11 @@ void* PmePpCommGpu::Impl::getForcesReadySynchronizer()
     return static_cast<void*>(&forcesReadySynchronizer_);
 }
 
-PmePpCommGpu::PmePpCommGpu(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext) :
-    impl_(new Impl(comm, pmeRank, deviceContext))
+PmePpCommGpu::PmePpCommGpu(MPI_Comm             comm,
+                           int                  pmeRank,
+                           const DeviceContext& deviceContext,
+                           const DeviceStream&  deviceStream) :
+    impl_(new Impl(comm, pmeRank, deviceContext, deviceStream))
 {
 }
 
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
index 934b7c40c6..4c95d9bccd 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
@@ -57,11 +57,13 @@ class PmePpCommGpu::Impl
 
 public:
     /*! \brief Creates PME-PP GPU communication object.
+     *
      * \param[in] comm            Communicator used for simulation
      * \param[in] pmeRank         Rank of PME task
      * \param[in] deviceContext   GPU context.
+     * \param[in] deviceStream    GPU stream.
      */
-    Impl(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext);
+    Impl(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext, const DeviceStream& deviceStream);
     ~Impl();
 
     /*! \brief Perform steps required when buffer size changes
@@ -116,10 +118,10 @@ public:
     void* getForcesReadySynchronizer();
 
 private:
-    //! Device context object
+    //! GPU context handle (not used in CUDA)
     const DeviceContext& deviceContext_;
-    //! CUDA stream used for the communication operations in this class
-    DeviceStream pmePpCommStream_;
+    //! Handle for CUDA stream used for the communication operations in this class
+    const DeviceStream& pmePpCommStream_;
     //! Remote location of PME coordinate data buffer
     void* remotePmeXBuffer_ = nullptr;
     //! Remote location of PME force data buffer
diff --git a/src/gromacs/ewald/tests/CMakeLists.txt b/src/gromacs/ewald/tests/CMakeLists.txt
index 6dea2b21ec..170bc96c12 100644
--- a/src/gromacs/ewald/tests/CMakeLists.txt
+++ b/src/gromacs/ewald/tests/CMakeLists.txt
@@ -41,4 +41,13 @@ gmx_add_unit_test(EwaldUnitTests ewald-test HARDWARE_DETECTION
         testhardwarecontexts.cpp
     GPU_CPP_SOURCE_FILES
         pmetestcommon.cpp
-        )
+)
+
+gmx_add_libgromacs_sources(
+    testhardwarecontext.cpp
+)
+if (GMX_USE_CUDA)
+gmx_compile_cpp_as_cuda(
+    testhardwarecontext.cpp
+)
+endif()
diff --git a/src/gromacs/ewald/tests/pmegathertest.cpp b/src/gromacs/ewald/tests/pmegathertest.cpp
index 51035f0355..eae6448323 100644
--- a/src/gromacs/ewald/tests/pmegathertest.cpp
+++ b/src/gromacs/ewald/tests/pmegathertest.cpp
@@ -280,13 +280,13 @@ public:
         TestReferenceData refData;
         for (const auto& context : getPmeTestEnv()->getHardwareContexts())
         {
-            CodePath   codePath = context->getCodePath();
+            CodePath   codePath = context->codePath();
             const bool supportedInput =
                     pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath);
             if (!supportedInput)
             {
                 /* Testing the failure for the unsupported input */
-                EXPECT_THROW_GMX(pmeInitWrapper(&inputRec, codePath, nullptr, nullptr, box),
+                EXPECT_THROW_GMX(pmeInitWrapper(&inputRec, codePath, nullptr, nullptr, nullptr, box),
                                  NotImplementedError);
                 continue;
             }
@@ -295,14 +295,16 @@ public:
             SCOPED_TRACE(
                     formatString("Testing force gathering with %s %sfor PME grid size %d %d %d"
                                  ", order %d, %zu atoms",
-                                 codePathToString(codePath), context->getDescription().c_str(),
+                                 codePathToString(codePath), context->description().c_str(),
                                  gridSize[XX], gridSize[YY], gridSize[ZZ], pmeOrder, atomCount));
 
-            PmeSafePointer pmeSafe = pmeInitWrapper(&inputRec, codePath, context->getDeviceInfo(),
-                                                    context->getPmeGpuProgram(), box);
+            PmeSafePointer pmeSafe =
+                    pmeInitWrapper(&inputRec, codePath, context->deviceContext(),
+                                   context->deviceStream(), context->pmeGpuProgram(), box);
             std::unique_ptr<StatePropagatorDataGpu> stateGpu =
                     (codePath == CodePath::GPU)
-                            ? makeStatePropagatorDataGpu(*pmeSafe.get(), context->deviceContext())
+                            ? makeStatePropagatorDataGpu(*pmeSafe.get(), context->deviceContext(),
+                                                         context->deviceStream())
                             : nullptr;
 
             pmeInitAtoms(pmeSafe.get(), stateGpu.get(), codePath, inputAtomData.coordinates,
diff --git a/src/gromacs/ewald/tests/pmesolvetest.cpp b/src/gromacs/ewald/tests/pmesolvetest.cpp
index 86688822eb..07f31629f6 100644
--- a/src/gromacs/ewald/tests/pmesolvetest.cpp
+++ b/src/gromacs/ewald/tests/pmesolvetest.cpp
@@ -109,13 +109,13 @@ public:
         TestReferenceData refData;
         for (const auto& context : getPmeTestEnv()->getHardwareContexts())
         {
-            CodePath   codePath = context->getCodePath();
+            CodePath   codePath = context->codePath();
             const bool supportedInput =
                     pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath);
             if (!supportedInput)
             {
                 /* Testing the failure for the unsupported input */
-                EXPECT_THROW_GMX(pmeInitEmpty(&inputRec, codePath, nullptr, nullptr, box,
+                EXPECT_THROW_GMX(pmeInitEmpty(&inputRec, codePath, nullptr, nullptr, nullptr, box,
                                               ewaldCoeff_q, ewaldCoeff_lj),
                                  NotImplementedError);
                 continue;
@@ -137,13 +137,13 @@ public:
                             "size %d %d %d, Ewald coefficients %g %g",
                             (method == PmeSolveAlgorithm::LennardJones) ? "Lennard-Jones" : "Coulomb",
                             gridOrdering.second.c_str(), computeEnergyAndVirial ? "with" : "without",
-                            codePathToString(codePath), context->getDescription().c_str(),
+                            codePathToString(codePath), context->description().c_str(),
                             gridSize[XX], gridSize[YY], gridSize[ZZ], ewaldCoeff_q, ewaldCoeff_lj));
 
                     /* Running the test */
-                    PmeSafePointer pmeSafe =
-                            pmeInitEmpty(&inputRec, codePath, context->getDeviceInfo(),
-                                         context->getPmeGpuProgram(), box, ewaldCoeff_q, ewaldCoeff_lj);
+                    PmeSafePointer pmeSafe = pmeInitEmpty(
+                            &inputRec, codePath, context->deviceContext(), context->deviceStream(),
+                            context->pmeGpuProgram(), box, ewaldCoeff_q, ewaldCoeff_lj);
                     pmeSetComplexGrid(pmeSafe.get(), codePath, gridOrdering.first, nonZeroGridValues);
                     const real cellVolume = box[0] * box[4] * box[8];
                     // FIXME - this is box[XX][XX] * box[YY][YY] * box[ZZ][ZZ], should be stored in the PME structure
diff --git a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
index 5c2d8663ef..ef975b2d08 100644
--- a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
+++ b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
@@ -126,13 +126,13 @@ public:
 
         for (const auto& context : getPmeTestEnv()->getHardwareContexts())
         {
-            CodePath   codePath = context->getCodePath();
+            CodePath   codePath = context->codePath();
             const bool supportedInput =
                     pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath);
             if (!supportedInput)
             {
                 /* Testing the failure for the unsupported input */
-                EXPECT_THROW_GMX(pmeInitWrapper(&inputRec, codePath, nullptr, nullptr, box),
+                EXPECT_THROW_GMX(pmeInitWrapper(&inputRec, codePath, nullptr, nullptr, nullptr, box),
                                  NotImplementedError);
                 continue;
             }
@@ -141,20 +141,21 @@ public:
             {
                 /* Describing the test uniquely in case it fails */
 
-                SCOPED_TRACE(
-                        formatString("Testing %s with %s %sfor PME grid size %d %d %d"
-                                     ", order %d, %zu atoms",
-                                     option.second.c_str(), codePathToString(codePath),
-                                     context->getDescription().c_str(), gridSize[XX], gridSize[YY],
-                                     gridSize[ZZ], pmeOrder, atomCount));
+                SCOPED_TRACE(formatString(
+                        "Testing %s with %s %sfor PME grid size %d %d %d"
+                        ", order %d, %zu atoms",
+                        option.second.c_str(), codePathToString(codePath), context->description().c_str(),
+                        gridSize[XX], gridSize[YY], gridSize[ZZ], pmeOrder, atomCount));
 
                 /* Running the test */
 
-                PmeSafePointer pmeSafe = pmeInitWrapper(&inputRec, codePath, context->getDeviceInfo(),
-                                                        context->getPmeGpuProgram(), box);
+                PmeSafePointer pmeSafe =
+                        pmeInitWrapper(&inputRec, codePath, context->deviceContext(),
+                                       context->deviceStream(), context->pmeGpuProgram(), box);
                 std::unique_ptr<StatePropagatorDataGpu> stateGpu =
                         (codePath == CodePath::GPU)
-                                ? makeStatePropagatorDataGpu(*pmeSafe.get(), context->deviceContext())
+                                ? makeStatePropagatorDataGpu(*pmeSafe.get(), context->deviceContext(),
+                                                             context->deviceStream())
                                 : nullptr;
 
                 pmeInitAtoms(pmeSafe.get(), stateGpu.get(), codePath, coordinates, charges);
diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp
index eaf697e1d5..888cac5873 100644
--- a/src/gromacs/ewald/tests/pmetestcommon.cpp
+++ b/src/gromacs/ewald/tests/pmetestcommon.cpp
@@ -59,6 +59,7 @@
 #include "gromacs/ewald/pme_solve.h"
 #include "gromacs/ewald/pme_spread.h"
 #include "gromacs/fft/parallel_3dfft.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/math/invertmatrix.h"
 #include "gromacs/mdtypes/commrec.h"
@@ -106,21 +107,22 @@ uint64_t getSplineModuliDoublePrecisionUlps(int splineOrder)
 }
 
 //! PME initialization
-PmeSafePointer pmeInitWrapper(const t_inputrec*        inputRec,
-                              const CodePath           mode,
-                              const DeviceInformation* deviceInfo,
-                              const PmeGpuProgram*     pmeGpuProgram,
-                              const Matrix3x3&         box,
-                              const real               ewaldCoeff_q,
-                              const real               ewaldCoeff_lj)
+PmeSafePointer pmeInitWrapper(const t_inputrec*    inputRec,
+                              const CodePath       mode,
+                              const DeviceContext* deviceContext,
+                              const DeviceStream*  deviceStream,
+                              const PmeGpuProgram* pmeGpuProgram,
+                              const Matrix3x3&     box,
+                              const real           ewaldCoeff_q,
+                              const real           ewaldCoeff_lj)
 {
     const MDLogger dummyLogger;
     const auto     runMode       = (mode == CodePath::CPU) ? PmeRunMode::CPU : PmeRunMode::Mixed;
     t_commrec      dummyCommrec  = { 0 };
     NumPmeDomains  numPmeDomains = { 1, 1 };
-    gmx_pme_t*     pmeDataRaw =
-            gmx_pme_init(&dummyCommrec, numPmeDomains, inputRec, false, false, true, ewaldCoeff_q,
-                         ewaldCoeff_lj, 1, runMode, nullptr, deviceInfo, pmeGpuProgram, dummyLogger);
+    gmx_pme_t* pmeDataRaw = gmx_pme_init(&dummyCommrec, numPmeDomains, inputRec, false, false, true,
+                                         ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr,
+                                         deviceContext, deviceStream, pmeGpuProgram, dummyLogger);
     PmeSafePointer pme(pmeDataRaw); // taking ownership
 
     // TODO get rid of this with proper matrix type
@@ -151,33 +153,35 @@ PmeSafePointer pmeInitWrapper(const t_inputrec*        inputRec,
 }
 
 //! Simple PME initialization based on input, no atom data
-PmeSafePointer pmeInitEmpty(const t_inputrec*        inputRec,
-                            const CodePath           mode,
-                            const DeviceInformation* deviceInfo,
-                            const PmeGpuProgram*     pmeGpuProgram,
-                            const Matrix3x3&         box,
-                            const real               ewaldCoeff_q,
-                            const real               ewaldCoeff_lj)
-{
-    return pmeInitWrapper(inputRec, mode, deviceInfo, pmeGpuProgram, box, ewaldCoeff_q, ewaldCoeff_lj);
+PmeSafePointer pmeInitEmpty(const t_inputrec*    inputRec,
+                            const CodePath       mode,
+                            const DeviceContext* deviceContext,
+                            const DeviceStream*  deviceStream,
+                            const PmeGpuProgram* pmeGpuProgram,
+                            const Matrix3x3&     box,
+                            const real           ewaldCoeff_q,
+                            const real           ewaldCoeff_lj)
+{
+    return pmeInitWrapper(inputRec, mode, deviceContext, deviceStream, pmeGpuProgram, box,
+                          ewaldCoeff_q, ewaldCoeff_lj);
     // hiding the fact that PME actually needs to know the number of atoms in advance
 }
 
 PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec)
 {
     const Matrix3x3 defaultBox = { { 1.0F, 0.0F, 0.0F, 0.0F, 1.0F, 0.0F, 0.0F, 0.0F, 1.0F } };
-    return pmeInitWrapper(inputRec, CodePath::CPU, nullptr, nullptr, defaultBox, 0.0F, 0.0F);
+    return pmeInitWrapper(inputRec, CodePath::CPU, nullptr, nullptr, nullptr, defaultBox, 0.0F, 0.0F);
 }
 
 //! Make a GPU state-propagator manager
 std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t&     pme,
-                                                                   const DeviceContext& deviceContext)
+                                                                   const DeviceContext* deviceContext,
+                                                                   const DeviceStream* deviceStream)
 {
     // TODO: Pin the host buffer and use async memory copies
     // TODO: Special constructor for PME-only rank / PME-tests is used here. There should be a mechanism to
     //       restrict one from using other constructor here.
-    return std::make_unique<StatePropagatorDataGpu>(pme_gpu_get_device_stream(&pme), deviceContext,
-                                                    GpuApiCallBehavior::Sync,
+    return std::make_unique<StatePropagatorDataGpu>(deviceStream, *deviceContext, GpuApiCallBehavior::Sync,
                                                     pme_gpu_get_block_size(&pme), nullptr);
 }
 
diff --git a/src/gromacs/ewald/tests/pmetestcommon.h b/src/gromacs/ewald/tests/pmetestcommon.h
index 98a2bbd4d2..7f2e727c5f 100644
--- a/src/gromacs/ewald/tests/pmetestcommon.h
+++ b/src/gromacs/ewald/tests/pmetestcommon.h
@@ -55,6 +55,8 @@
 
 namespace gmx
 {
+
+class DeviceStreamManager;
 namespace test
 {
 
@@ -118,26 +120,31 @@ uint64_t getSplineModuliDoublePrecisionUlps(int splineOrder);
 // PME stages
 
 //! PME initialization
-PmeSafePointer pmeInitWrapper(const t_inputrec*        inputRec,
-                              CodePath                 mode,
-                              const DeviceInformation* deviceInfo,
-                              const PmeGpuProgram*     pmeGpuProgram,
-                              const Matrix3x3&         box,
-                              real                     ewaldCoeff_q  = 1.0F,
-                              real                     ewaldCoeff_lj = 1.0F);
+PmeSafePointer pmeInitWrapper(const t_inputrec*    inputRec,
+                              CodePath             mode,
+                              const DeviceContext* deviceContext,
+                              const DeviceStream*  deviceStream,
+                              const PmeGpuProgram* pmeGpuProgram,
+                              const Matrix3x3&     box,
+                              real                 ewaldCoeff_q  = 1.0F,
+                              real                 ewaldCoeff_lj = 1.0F);
 //! Simple PME initialization (no atom data)
-PmeSafePointer pmeInitEmpty(const t_inputrec*        inputRec,
-                            CodePath                 mode,
-                            const DeviceInformation* deviceInfo,
-                            const PmeGpuProgram*     pmeGpuProgram,
-                            const Matrix3x3&         box,
-                            real                     ewaldCoeff_q,
-                            real                     ewaldCoeff_lj);
+PmeSafePointer pmeInitEmpty(const t_inputrec*    inputRec,
+                            CodePath             mode,
+                            const DeviceContext* deviceContext,
+                            const DeviceStream*  deviceStream,
+                            const PmeGpuProgram* pmeGpuProgram,
+                            const Matrix3x3&     box,
+                            real                 ewaldCoeff_q,
+                            real                 ewaldCoeff_lj);
+
 //! Simple PME initialization based on inputrec only
 PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec);
+
 //! Make a GPU state-propagator manager
 std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t&     pme,
-                                                                   const DeviceContext& deviceContext);
+                                                                   const DeviceContext* deviceContext,
+                                                                   const DeviceStream* deviceStream);
 //! PME initialization with atom data and system box
 void pmeInitAtoms(gmx_pme_t*               pme,
                   StatePropagatorDataGpu*  stateGpu,
diff --git a/src/gromacs/ewald/tests/testhardwarecontext.cpp b/src/gromacs/ewald/tests/testhardwarecontext.cpp
new file mode 100644
index 0000000000..6e2c455efa
--- /dev/null
+++ b/src/gromacs/ewald/tests/testhardwarecontext.cpp
@@ -0,0 +1,124 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Implements the hardware context structure and code-path naming helper used by the unit tests.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_ewald
+ */
+
+#include "gmxpre.h"
+
+#include "testhardwarecontext.h"
+
+#include <memory>
+
+#include "gromacs/ewald/pme.h"
+#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/hardware/detecthardware.h"
+#include "gromacs/hardware/hw_info.h"
+#include "gromacs/utility/basenetwork.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/loggerbuilder.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
+
+namespace gmx
+{
+namespace test
+{
+
+TestHardwareContext::TestHardwareContext(CodePath codePath, const char* description) :
+    codePath_(codePath),
+    description_(description)
+{
+    // CPU-only context: no device objects are created here, so deviceContext_ and
+    // deviceStream_ keep the nullptr they are initialized with in the struct definition.
+    GMX_RELEASE_ASSERT(codePath == CodePath::CPU,
+                       "A GPU code path should provide DeviceInformation to the "
+                       "TestHardwareContext constructor.");
+}
+
+TestHardwareContext::TestHardwareContext(CodePath                 codePath,
+                                         const char*              description,
+                                         const DeviceInformation& deviceInfo) :
+    codePath_(codePath),
+    description_(description)
+{
+    GMX_RELEASE_ASSERT(codePath == CodePath::GPU,
+                       "TestHardwareContext tries to construct DeviceContext and PmeGpuProgram "
+                       "in CPU build.");
+    deviceContext_ = new DeviceContext(deviceInfo); // owned; deleted in the destructor
+    deviceStream_  = new DeviceStream(*deviceContext_, DeviceStreamPriority::Normal, false);
+    program_       = buildPmeGpuProgram(*deviceContext_); // PME kernels for this device context
+}
+
+TestHardwareContext::~TestHardwareContext()
+{
+    delete deviceStream_; // released before the context it was created from
+    delete deviceContext_;
+}
+
+const DeviceInformation* TestHardwareContext::deviceInfo() const
+{
+    // A CPU context has no DeviceContext; return nullptr instead of dereferencing a null pointer.
+    return (deviceContext_ != nullptr) ? &deviceContext_->deviceInfo() : nullptr;
+}
+const DeviceContext* TestHardwareContext::deviceContext() const
+{
+    return deviceContext_; // nullptr when constructed through the CPU constructor
+}
+//! Returns the device stream (nullptr when constructed through the CPU constructor)
+const DeviceStream* TestHardwareContext::deviceStream() const
+{
+    return deviceStream_;
+}
+
+const char* codePathToString(CodePath codePath)
+{
+    switch (codePath) // one label per enumerator; any other value raises NotImplementedError
+    {
+        case CodePath::CPU: return "CPU";
+        case CodePath::GPU: return "GPU";
+        default: GMX_THROW(NotImplementedError("This CodePath should support codePathToString"));
+    }
+}
+
+} // namespace test
+} // namespace gmx
diff --git a/src/gromacs/ewald/tests/testhardwarecontext.h b/src/gromacs/ewald/tests/testhardwarecontext.h
new file mode 100644
index 0000000000..fa5ebd9da8
--- /dev/null
+++ b/src/gromacs/ewald/tests/testhardwarecontext.h
@@ -0,0 +1,112 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_EWALD_TEST_HARDWARE_CONTEXT_H
+#define GMX_EWALD_TEST_HARDWARE_CONTEXT_H
+
+/*! \internal \file
+ * \brief
+ * Declares the hardware context structure (code path, device context and stream) for unit tests.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "gromacs/ewald/pme_gpu_program.h"
+#include "gromacs/utility/gmxassert.h"
+
+class DeviceContext;
+struct DeviceInformation;
+class DeviceStream;
+
+namespace gmx
+{
+namespace test
+{
+//! Hardware code path being tested
+enum class CodePath
+{
+    CPU,
+    GPU
+};
+
+//! Return a string useful for human-readable messages describing a \c codePath.
+const char* codePathToString(CodePath codePath);
+
+/*! \internal \brief
+ * A structure to describe a hardware context that persists over the lifetime
+ * of the test binary - an abstraction over PmeGpuProgram with a human-readable string.
+ */
+struct TestHardwareContext
+{
+    //! Hardware path for the code being tested.
+    CodePath codePath_;
+    //! Readable description
+    std::string description_;
+    //! Device context; allocated by the GPU constructor and deleted by the destructor
+    DeviceContext* deviceContext_ = nullptr;
+    //! Device stream; allocated by the GPU constructor and deleted by the destructor
+    DeviceStream* deviceStream_ = nullptr;
+    //! Persistent compiled GPU kernels for PME.
+    PmeGpuProgramStorage program_;
+
+    //! Returns the code path for this context.
+    CodePath codePath() const { return codePath_; }
+    //! Returns a human-readable context description line
+    std::string description() const { return description_; }
+    //! Returns the device info pointer
+    const DeviceInformation* deviceInfo() const;
+    //! Get the device context (nullptr unless the GPU constructor was used)
+    const DeviceContext* deviceContext() const;
+    //! Get the device stream (nullptr unless the GPU constructor was used)
+    const DeviceStream* deviceStream() const;
+    //! Returns the persistent PME GPU kernels
+    const PmeGpuProgram* pmeGpuProgram() const { return program_.get(); }
+    //! Constructs a context for the CPU code path; no device objects are created
+    TestHardwareContext(CodePath codePath, const char* description);
+    //! Constructs a context for the GPU code path on the device described by \p deviceInfo
+    TestHardwareContext(CodePath codePath, const char* description, const DeviceInformation& deviceInfo);
+    TestHardwareContext(const TestHardwareContext&) = delete; //!< Not copyable: owns raw pointers
+    TestHardwareContext& operator=(const TestHardwareContext&) = delete; //!< Not assignable
+    ~TestHardwareContext(); //!< Deletes the owned device stream and device context
+};
+
+} // namespace test
+} // namespace gmx
+#endif
diff --git a/src/gromacs/ewald/tests/testhardwarecontexts.cpp b/src/gromacs/ewald/tests/testhardwarecontexts.cpp
index 661f0fa4bb..96f36f9810 100644
--- a/src/gromacs/ewald/tests/testhardwarecontexts.cpp
+++ b/src/gromacs/ewald/tests/testhardwarecontexts.cpp
@@ -60,18 +60,6 @@ namespace gmx
 namespace test
 {
 
-TestHardwareContext::~TestHardwareContext() = default;
-
-const char* codePathToString(CodePath codePath)
-{
-    switch (codePath)
-    {
-        case CodePath::CPU: return "CPU";
-        case CodePath::GPU: return "GPU";
-        default: GMX_THROW(NotImplementedError("This CodePath should support codePathToString"));
-    }
-}
-
 /* Implements the "construct on first use" idiom to avoid any static
  * initialization order fiasco.
  *
@@ -120,8 +108,6 @@ void PmeTestEnvironment::SetUp()
     for (int gpuIndex : getCompatibleGpus(hardwareInfo_->gpu_info))
     {
         const DeviceInformation* deviceInfo = getDeviceInfo(hardwareInfo_->gpu_info, gpuIndex);
-        GMX_RELEASE_ASSERT(deviceInfo != nullptr,
-                           "Device information should be provided for the GPU builds.");
         init_gpu(deviceInfo);
 
         char stmp[200] = {};
@@ -132,5 +118,10 @@ void PmeTestEnvironment::SetUp()
     }
 }
 
+void PmeTestEnvironment::TearDown()
+{
+    hardwareContexts_.clear(); // Destroys every TestHardwareContext held by the environment.
+}
+
 } // namespace test
 } // namespace gmx
diff --git a/src/gromacs/ewald/tests/testhardwarecontexts.h b/src/gromacs/ewald/tests/testhardwarecontexts.h
index 0af2343795..6a1450fe79 100644
--- a/src/gromacs/ewald/tests/testhardwarecontexts.h
+++ b/src/gromacs/ewald/tests/testhardwarecontexts.h
@@ -49,83 +49,17 @@
 #include <gtest/gtest.h>
 
 #include "gromacs/ewald/pme_gpu_program.h"
-#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/hardware/gpu_hw_info.h"
 #include "gromacs/utility/gmxassert.h"
 
+#include "testhardwarecontext.h"
+
 struct gmx_hw_info_t;
 
 namespace gmx
 {
 namespace test
 {
-//! Hardware code path being tested
-enum class CodePath
-{
-    CPU,
-    GPU
-};
-
-//! Return a string useful for human-readable messages describing a \c codePath.
-const char* codePathToString(CodePath codePath);
-
-/*! \internal \brief
- * A structure to describe a hardware context  that persists over the lifetime
- * of the test binary - an abstraction over PmeGpuProgram with a human-readable string.
- */
-struct TestHardwareContext
-{
-    //! Hardware path for the code being tested.
-    CodePath codePath_;
-    //! Readable description
-    std::string description_;
-    //! Device information pointer
-    const DeviceInformation* deviceInfo_;
-    //! Local copy of the device context pointer
-    std::unique_ptr<DeviceContext> deviceContext_;
-    //! Persistent compiled GPU kernels for PME.
-    PmeGpuProgramStorage program_;
-
-public:
-    //! Retuns the code path for this context.
-    CodePath getCodePath() const { return codePath_; }
-    //! Returns a human-readable context description line
-    std::string getDescription() const { return description_; }
-    //! Getter for the DeviceContext
-    const DeviceContext& deviceContext() const
-    {
-        GMX_RELEASE_ASSERT(deviceContext_ != nullptr,
-                           "Trying to get device context before it was initialized or in builds "
-                           "without GPU support.");
-        return *deviceContext_;
-    }
-    //! Returns the device info pointer
-    const DeviceInformation* getDeviceInfo() const { return deviceInfo_; }
-    //! Returns the persistent PME GPU kernels
-    const PmeGpuProgram* getPmeGpuProgram() const { return program_.get(); }
-    //! Constructs the context for CPU builds
-    TestHardwareContext(CodePath codePath, const char* description) :
-        codePath_(codePath),
-        description_(description)
-    {
-        GMX_RELEASE_ASSERT(codePath == CodePath::CPU,
-                           "A GPU code path should provide DeviceInformation to the "
-                           "TestHardwareContext constructor.");
-    }
-    //! Constructs the context for GPU builds
-    TestHardwareContext(CodePath codePath, const char* description, const DeviceInformation& deviceInfo) :
-        codePath_(codePath),
-        description_(description),
-        deviceInfo_(&deviceInfo)
-    {
-        GMX_RELEASE_ASSERT(codePath == CodePath::GPU,
-                           "TestHardwareContext tries to construct DeviceContext and PmeGpuProgram "
-                           "in CPU build.");
-        deviceContext_ = std::make_unique<DeviceContext>(deviceInfo);
-        program_       = buildPmeGpuProgram(*deviceContext_);
-    }
-    ~TestHardwareContext();
-};
 
 //! A container of handles to hardware contexts
 typedef std::vector<std::unique_ptr<TestHardwareContext>> TestHardwareContexts;
@@ -144,6 +78,8 @@ private:
 public:
     //! This is called by GTest framework once to query the hardware
     void SetUp() override;
+    //! This is called by GTest framework once to release the hardware
+    void TearDown() override;
     //! Get available hardware contexts.
     const TestHardwareContexts& getHardwareContexts() const { return hardwareContexts_; }
     //! Get available hardware information.
diff --git a/src/gromacs/gpu_utils/device_stream.cpp b/src/gromacs/gpu_utils/device_stream.cpp
index 7f05de6b32..bfbe049235 100644
--- a/src/gromacs/gpu_utils/device_stream.cpp
+++ b/src/gromacs/gpu_utils/device_stream.cpp
@@ -59,4 +59,4 @@ bool DeviceStream::isValid() const
     return false;
 }
 
-void DeviceStream::synchronize() const {}
+void DeviceStream::synchronize() const {}
diff --git a/src/gromacs/gpu_utils/device_stream_manager.cpp b/src/gromacs/gpu_utils/device_stream_manager.cpp
index 1c8228e590..8c7457a3d3 100644
--- a/src/gromacs/gpu_utils/device_stream_manager.cpp
+++ b/src/gromacs/gpu_utils/device_stream_manager.cpp
@@ -156,6 +156,24 @@ const DeviceStream& DeviceStreamManager::stream(DeviceStreamType streamToGet) co
     return impl_->streams_[streamToGet];
 }
 
+const DeviceStream& DeviceStreamManager::bondedStream(bool hasPPDomainDecomposition) const
+{
+    // Bonded forces reuse a non-bonded stream: non-local with PP DD, local otherwise.
+    if (!hasPPDomainDecomposition)
+    {
+        const DeviceStream& localStream = stream(DeviceStreamType::NonBondedLocal);
+        GMX_RELEASE_ASSERT(localStream.isValid(),
+                           "GPU non-bonded local stream should be valid in order to use GPU "
+                           "version of bonded forces without domain decomposition.");
+        return localStream;
+    }
+    const DeviceStream& nonLocalStream = stream(DeviceStreamType::NonBondedNonLocal);
+    GMX_RELEASE_ASSERT(nonLocalStream.isValid(),
+                       "GPU non-bonded non-local stream should be valid in order to use GPU "
+                       "version of bonded forces with domain decomposition.");
+    return nonLocalStream;
+}
+
 bool DeviceStreamManager::streamIsValid(DeviceStreamType streamToCheck) const
 {
     return impl_->streams_[streamToCheck].isValid();
diff --git a/src/gromacs/gpu_utils/device_stream_manager.h b/src/gromacs/gpu_utils/device_stream_manager.h
index 4cfa6161a4..4565d1ac0c 100644
--- a/src/gromacs/gpu_utils/device_stream_manager.h
+++ b/src/gromacs/gpu_utils/device_stream_manager.h
@@ -124,6 +124,12 @@ public:
      */
     const DeviceStream& stream(DeviceStreamType streamToGet) const;
 
+    /*! \brief Returns a handle to the GPU stream to compute bonded forces in.
+     *
+     * \param[in] hasPPDomainDecomposition Whether there is a particle-particle domain decomposition.
+     */
+    const DeviceStream& bondedStream(bool hasPPDomainDecomposition) const;
+
     /*! \brief Return whether the requested GPU stream is valid for use.
      *
      * \param[in] streamToCheck Which stream to check.
diff --git a/src/gromacs/gpu_utils/tests/device_stream_manager.cpp b/src/gromacs/gpu_utils/tests/device_stream_manager.cpp
index 1c0330e02a..e75c80bc07 100644
--- a/src/gromacs/gpu_utils/tests/device_stream_manager.cpp
+++ b/src/gromacs/gpu_utils/tests/device_stream_manager.cpp
@@ -171,7 +171,6 @@ TEST_F(DeviceStreamManagerTest, CorrectStreamsAreReturnedOnNonbondedDevice)
             expectValidStreams(&manager, { DeviceStreamType::Pme, DeviceStreamType::NonBondedLocal,
                                            DeviceStreamType::NonBondedNonLocal, DeviceStreamType::PmePpTransfer,
                                            DeviceStreamType::UpdateAndConstraints });
-            expectInvalidStreams(&manager, {});
         }
 
         {
diff --git a/src/gromacs/listed_forces/gpubonded_impl.cu b/src/gromacs/listed_forces/gpubonded_impl.cu
index 0d5367f698..faa775a0f7 100644
--- a/src/gromacs/listed_forces/gpubonded_impl.cu
+++ b/src/gromacs/listed_forces/gpubonded_impl.cu
@@ -51,6 +51,7 @@
 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/typecasts.cuh"
 #include "gromacs/mdtypes/enerdata.h"
@@ -71,6 +72,9 @@ GpuBonded::Impl::Impl(const gmx_ffparams_t& ffparams,
     deviceContext_(deviceContext),
     deviceStream_(deviceStream)
 {
+    GMX_RELEASE_ASSERT(deviceStream.isValid(),
+                       "Can't run GPU version of bonded forces in stream that is not valid.");
+
     wcycle_ = wcycle;
 
     allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), deviceContext_);
@@ -81,7 +85,7 @@ GpuBonded::Impl::Impl(const gmx_ffparams_t& ffparams,
                        deviceStream_, GpuApiCallBehavior::Sync, nullptr);
     vTot_.resize(F_NRE);
     allocateDeviceBuffer(&d_vTot_, F_NRE, deviceContext_);
-    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream);
+    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream_);
 
     kernelParams_.d_forceParams = d_forceParams_;
     kernelParams_.d_xq          = d_xq_;
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
index 562c1be500..76899bbd82 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
@@ -58,6 +58,7 @@
 
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"
@@ -121,8 +122,8 @@ void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer*             fRead
     // Make sure that the forces are ready on device before proceeding with the update.
     fReadyOnDevice->enqueueWaitEvent(deviceStream_);
 
-    // The integrate should save a copy of the current coordinates in d_xp_ and write updated once
-    // into d_x_. The d_xp_ is only needed by constraints.
+    // The integrate should save a copy of the current coordinates in d_xp_ and write updated
+    // ones into d_x_. The d_xp_ is only needed by constraints.
     integrator_->integrate(d_x_, d_xp_, d_v_, d_f_, dt, doTemperatureScaling, tcstat,
                            doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix);
     // Constraints need both coordinates before (d_x_) and after (d_xp_) update. However, after constraints
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp
index 9d341e1eae..f7f39bbee7 100644
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -67,6 +67,7 @@
 #include "gromacs/fileio/trxio.h"
 #include "gromacs/gmxlib/network.h"
 #include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/imd/imd.h"
 #include "gromacs/listed_forces/manage_threading.h"
@@ -353,6 +354,7 @@ void gmx::LegacySimulator::do_md()
 
     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 
+    // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
     if (useGpuForUpdate)
     {
         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
@@ -397,14 +399,17 @@ void gmx::LegacySimulator::do_md()
         {
             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
         }
-
-        GMX_RELEASE_ASSERT(fr->deviceContext != nullptr,
-                           "GPU device context should be initialized to use GPU update.");
-        GMX_RELEASE_ASSERT(stateGpu->getUpdateStream() != nullptr,
-                           "Update stream can not be nullptr when update is on a GPU.");
-        integrator = std::make_unique<UpdateConstrainGpu>(*ir, *top_global, *fr->deviceContext,
-                                                          *stateGpu->getUpdateStream(),
-                                                          stateGpu->xUpdatedOnDevice());
+        GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
+                           "Device stream manager should be initialized in order to use GPU "
+                           "update-constraints.");
+        GMX_RELEASE_ASSERT(
+                fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
+                "Update stream should be initialized in order to use GPU "
+                "update-constraints.");
+        integrator = std::make_unique<UpdateConstrainGpu>(
+                *ir, *top_global, fr->deviceStreamManager->context(),
+                fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
+                stateGpu->xUpdatedOnDevice());
 
         integrator->setPbc(PbcType::Xyz, state->box);
     }
@@ -864,21 +869,11 @@ void gmx::LegacySimulator::do_md()
                 if (havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange
                     && useGpuForNonbonded && is1D(*cr->dd))
                 {
+                    GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
+                                       "GPU device manager has to be initialized to use GPU "
+                                       "version of halo exchange.");
                     // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
-                    const DeviceStream* localStream =
-                            Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
-                    const DeviceStream* nonLocalStream = Nbnxm::gpu_get_command_stream(
-                            fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
-                    GMX_RELEASE_ASSERT(
-                            fr->deviceContext != nullptr,
-                            "GPU device context should be initialized to use GPU halo exchange.");
-                    GMX_RELEASE_ASSERT(localStream != nullptr,
-                                       "Local non-bonded stream can't be nullptr when using GPU "
-                                       "halo exchange.");
-                    GMX_RELEASE_ASSERT(nonLocalStream != nullptr,
-                                       "Non-local non-bonded stream can't be nullptr when using "
-                                       "GPU halo exchange.");
-                    constructGpuHaloExchange(mdlog, *cr, *fr->deviceContext, *localStream, *nonLocalStream);
+                    constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager);
                 }
             }
         }
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index 604ff0ed7b..43ac7d6164 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -74,6 +74,7 @@
 #include "gromacs/gmxlib/network.h"
 #include "gromacs/gmxlib/nrnb.h"
 #include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/hardware/cpuinfo.h"
 #include "gromacs/hardware/detecthardware.h"
@@ -1141,19 +1142,23 @@ int Mdrunner::mdrunner()
             EEL_PME(inputrec->coulombtype) && thisRankHasDuty(cr, DUTY_PME));
 
     // Get the device handles for the modules, nullptr when no task is assigned.
-    int                            deviceId      = -1;
-    DeviceInformation*             deviceInfo    = gpuTaskAssignments.initDevice(&deviceId);
-    std::unique_ptr<DeviceContext> deviceContext = nullptr;
-    if (deviceInfo != nullptr)
+    int                deviceId   = -1;
+    DeviceInformation* deviceInfo = gpuTaskAssignments.initDevice(&deviceId);
+
+    // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?)
+    bool useTiming = true;
+    if (GMX_GPU == GMX_GPU_CUDA)
     {
-        if (DOMAINDECOMP(cr) && thisRankHasDuty(cr, DUTY_PP))
-        {
-            dd_setup_dlb_resource_sharing(cr, deviceId);
-        }
-        deviceContext = std::make_unique<DeviceContext>(*deviceInfo);
+        /* WARNING: CUDA timings are incorrect with multiple streams.
+         *          This is the main reason why they are disabled by default.
+         */
+        // TODO: Consider turning on by default when we can detect nr of streams.
+        useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
+    }
+    else if (GMX_GPU == GMX_GPU_OPENCL)
+    {
+        useTiming = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
     }
-
-    // TODO Initialize GPU streams here.
 
     // TODO Currently this is always built, yet DD partition code
     // checks if it is built before using it. Probably it should
@@ -1190,6 +1195,19 @@ int Mdrunner::mdrunner()
     const bool printHostName = (cr->nnodes > 1);
     gpuTaskAssignments.reportGpuUsage(mdlog, printHostName, useGpuForBonded, pmeRunMode, useGpuForUpdate);
 
+    std::unique_ptr<DeviceStreamManager> deviceStreamManager = nullptr;
+
+    if (deviceInfo != nullptr)
+    {
+        if (DOMAINDECOMP(cr) && thisRankHasDuty(cr, DUTY_PP))
+        {
+            dd_setup_dlb_resource_sharing(cr, deviceId);
+        }
+        deviceStreamManager = std::make_unique<DeviceStreamManager>(
+                *deviceInfo, useGpuForPme, useGpuForNonbonded, havePPDomainDecomposition(cr),
+                useGpuForUpdate, useTiming);
+    }
+
     // If the user chose a task assignment, give them some hints
     // where appropriate.
     if (!userGpuTaskAssignment.empty())
@@ -1348,32 +1366,36 @@ int Mdrunner::mdrunner()
                       opt2fn("-tablep", filenames.size(), filenames.data()),
                       opt2fns("-tableb", filenames.size(), filenames.data()), pforce);
 
-        fr->deviceContext = deviceContext.get();
+        // Save a handle to device stream manager to use elsewhere in the code
+        // TODO: Forcerec is not a correct place to store it.
+        fr->deviceStreamManager = deviceStreamManager.get();
 
         if (devFlags.enableGpuPmePPComm && !thisRankHasDuty(cr, DUTY_PME))
         {
             GMX_RELEASE_ASSERT(
-                    deviceContext != nullptr,
-                    "Device context can not be nullptr when PME-PP direct communications object.");
+                    deviceStreamManager != nullptr,
+                    "GPU device stream manager should be valid in order to use PME-PP direct "
+                    "communications.");
+            GMX_RELEASE_ASSERT(
+                    deviceStreamManager->streamIsValid(DeviceStreamType::PmePpTransfer),
+                    "GPU PP-PME stream should be valid in order to use GPU PME-PP direct "
+                    "communications.");
             fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(
-                    cr->mpi_comm_mysim, cr->dd->pme_nodeid, *deviceContext);
+                    cr->mpi_comm_mysim, cr->dd->pme_nodeid, deviceStreamManager->context(),
+                    deviceStreamManager->stream(DeviceStreamType::PmePpTransfer));
         }
 
-        fr->nbv = Nbnxm::init_nb_verlet(mdlog, inputrec, fr, cr, *hwinfo, deviceInfo,
-                                        fr->deviceContext, &mtop, box, wcycle);
+        fr->nbv = Nbnxm::init_nb_verlet(mdlog, inputrec, fr, cr, *hwinfo, useGpuForNonbonded,
+                                        deviceStreamManager.get(), &mtop, box, wcycle);
+        // TODO: Move the logic below to a GPU bonded builder
         if (useGpuForBonded)
         {
-            auto stream = havePPDomainDecomposition(cr)
-                                  ? Nbnxm::gpu_get_command_stream(
-                                            fr->nbv->gpu_nbv, gmx::InteractionLocality::NonLocal)
-                                  : Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv,
-                                                                  gmx::InteractionLocality::Local);
-            GMX_RELEASE_ASSERT(
-                    fr->deviceContext != nullptr,
-                    "Device context can not be nullptr when computing bonded interactions on GPU.");
-            GMX_RELEASE_ASSERT(stream != nullptr,
-                               "Can'r run GPU version of bonded forces in nullptr stream.");
-            gpuBonded = std::make_unique<GpuBonded>(mtop.ffparams, *fr->deviceContext, *stream, wcycle);
+            GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
+                               "GPU device stream manager should be valid in order to use GPU "
+                               "version of bonded forces.");
+            gpuBonded = std::make_unique<GpuBonded>(
+                    mtop.ffparams, deviceStreamManager->context(),
+                    deviceStreamManager->bondedStream(havePPDomainDecomposition(cr)), wcycle);
             fr->gpuBonded = gpuBonded.get();
         }
 
@@ -1450,9 +1472,11 @@ int Mdrunner::mdrunner()
     if (thisRankHasPmeGpuTask)
     {
         GMX_RELEASE_ASSERT(
-                deviceContext != nullptr,
-                "Device context can not be nullptr when building PME GPU program object.");
-        pmeGpuProgram = buildPmeGpuProgram(*deviceContext);
+                (deviceStreamManager != nullptr),
+                "GPU device stream manager should be initialized in order to use GPU for PME.");
+        GMX_RELEASE_ASSERT((deviceInfo != nullptr),
+                           "GPU device should be initialized in order to use GPU for PME.");
+        pmeGpuProgram = buildPmeGpuProgram(deviceStreamManager->context());
     }
 
     /* Initiate PME if necessary,
@@ -1478,10 +1502,23 @@ int Mdrunner::mdrunner()
         {
             try
             {
+                // TODO: This should be in the builder.
+                GMX_RELEASE_ASSERT(!useGpuForPme || (deviceStreamManager != nullptr),
+                                   "Device stream manager should be valid in order to use GPU "
+                                   "version of PME.");
+                GMX_RELEASE_ASSERT(
+                        !useGpuForPme || deviceStreamManager->streamIsValid(DeviceStreamType::Pme),
+                        "GPU PME stream should be valid in order to use GPU version of PME.");
+
+                const DeviceContext* deviceContext =
+                        useGpuForPme ? &deviceStreamManager->context() : nullptr;
+                const DeviceStream* pmeStream =
+                        useGpuForPme ? &deviceStreamManager->stream(DeviceStreamType::Pme) : nullptr;
+
                 pmedata = gmx_pme_init(cr, getNumPmeDomains(cr->dd), inputrec, nChargePerturbed != 0,
                                        nTypePerturbed != 0, mdrunOptions.reproducible, ewaldcoeff_q,
                                        ewaldcoeff_lj, gmx_omp_nthreads_get(emntPME), pmeRunMode,
-                                       nullptr, deviceInfo, pmeGpuProgram.get(), mdlog);
+                                       nullptr, deviceContext, pmeStream, pmeGpuProgram.get(), mdlog);
             }
             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
         }
@@ -1581,24 +1618,13 @@ int Mdrunner::mdrunner()
             && ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME))
                 || runScheduleWork.simulationWork.useGpuBufferOps))
         {
-            const DeviceStream* pmeStream = pme_gpu_get_device_stream(fr->pmedata);
-            const DeviceStream* localStream =
-                    fr->nbv->gpu_nbv != nullptr
-                            ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local)
-                            : nullptr;
-            const DeviceStream* nonLocalStream =
-                    fr->nbv->gpu_nbv != nullptr
-                            ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal)
-                            : nullptr;
             GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)
                                                       ? GpuApiCallBehavior::Async
                                                       : GpuApiCallBehavior::Sync;
-            GMX_RELEASE_ASSERT(
-                    deviceContext != nullptr,
-                    "Device context can not be nullptr when building GPU propagator data object.");
+            GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
+                               "GPU device stream manager should be initialized to use GPU.");
             stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
-                    pmeStream, localStream, nonLocalStream, *deviceContext, transferKind,
-                    pme_gpu_get_block_size(fr->pmedata), wcycle);
+                    *deviceStreamManager, transferKind, pme_gpu_get_block_size(fr->pmedata), wcycle);
             fr->stateGpu = stateGpu.get();
         }
 
@@ -1634,7 +1660,7 @@ int Mdrunner::mdrunner()
         /* do PME only */
         walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
         gmx_pmeonly(pmedata, cr, &nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode,
-                    deviceContext.get());
+                    deviceStreamManager.get());
     }
 
     wallcycle_stop(wcycle, ewcRUN);
@@ -1648,6 +1674,7 @@ int Mdrunner::mdrunner()
     // clean up cycle counter
     wallcycle_destroy(wcycle);
 
+    deviceStreamManager.reset(nullptr);
     // Free PME data
     if (pmedata)
     {
@@ -1695,7 +1722,6 @@ int Mdrunner::mdrunner()
     }
 
     free_gpu(deviceInfo);
-    deviceContext.reset(nullptr);
     sfree(fcd);
 
     if (doMembed)
@@ -1732,7 +1758,7 @@ int Mdrunner::mdrunner()
     }
 #endif
     return rc;
-}
+}
 
 Mdrunner::~Mdrunner()
 {
diff --git a/src/gromacs/mdtypes/forcerec.h b/src/gromacs/mdtypes/forcerec.h
index f518692e7d..169cb1c153 100644
--- a/src/gromacs/mdtypes/forcerec.h
+++ b/src/gromacs/mdtypes/forcerec.h
@@ -59,6 +59,7 @@ struct t_QMMMrec;
 
 namespace gmx
 {
+class DeviceStreamManager;
 class GpuBonded;
 class ForceProviders;
 class StatePropagatorDataGpu;
@@ -284,6 +285,8 @@ struct t_forcerec
     // TODO: This is not supposed to be here. StatePropagatorDataGpu should be a part of
     //       general StatePropagatorData object that is passed around
     gmx::StatePropagatorDataGpu* stateGpu = nullptr;
+    // TODO: Should not be here. This is here only to pass the pointer around.
+    gmx::DeviceStreamManager* deviceStreamManager = nullptr;
 
     //! GPU device context
     DeviceContext* deviceContext = nullptr;
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h
index a4f77cbf16..bcc6dc5c38 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h
@@ -66,6 +66,7 @@ struct gmx_wallcycle;
 
 namespace gmx
 {
+class DeviceStreamManager;
 
 class StatePropagatorDataGpu
 {
@@ -86,35 +87,15 @@ public:
      * ops are offloaded. This feature is currently not available in OpenCL and
      * hence these streams are not set in these builds.
      *
-     * \note In CUDA, the update stream is created in the constructor as a temporary
-     *       solution, in place until the stream manager is introduced.
-     *       Note that this makes it impossible to construct this object in CUDA
-     *       builds executing on a host without any CUDA-capable device available.
-     *
-     * \note In CUDA, \p deviceContext is unused, hence always nullptr;
-     *       all stream arguments can also be nullptr in runs where the
-     *       respective streams are not required.
-     *       In OpenCL, \p deviceContext needs to be a valid device context.
-     *       In OpenCL runs StatePropagatorDataGpu is currently only used
-     *       with PME offload, and only on ranks with PME duty. Hence, the
-     *       \p pmeStream argument needs to be a valid OpenCL queue object
-     *       which must have been created in \p deviceContext.
-     *
-     *  \param[in] pmeStream       Device PME stream, nullptr allowed.
-     *  \param[in] localStream     Device NBNXM local stream, nullptr allowed.
-     *  \param[in] nonLocalStream  Device NBNXM non-local stream, nullptr allowed.
-     *  \param[in] deviceContext   Device context, nullptr allowed.
-     *  \param[in] transferKind    H2D/D2H transfer call behavior (synchronous or not).
+     *  \param[in] deviceStreamManager         Object that owns the DeviceContext and DeviceStreams.
+     *  \param[in] transferKind                H2D/D2H transfer call behavior (synchronous or not).
      *  \param[in] allocationBlockSizeDivisor  Deterines padding size for coordinates buffer.
-     *  \param[in] wcycle          Wall cycle counter data.
+     *  \param[in] wcycle                      Wall cycle counter data.
      */
-    StatePropagatorDataGpu(const DeviceStream*  pmeStream,
-                           const DeviceStream*  localStream,
-                           const DeviceStream*  nonLocalStream,
-                           const DeviceContext& deviceContext,
-                           GpuApiCallBehavior   transferKind,
-                           int                  allocationBlockSizeDivisor,
-                           gmx_wallcycle*       wcycle);
+    StatePropagatorDataGpu(const DeviceStreamManager& deviceStreamManager,
+                           GpuApiCallBehavior         transferKind,
+                           int                        allocationBlockSizeDivisor,
+                           gmx_wallcycle*             wcycle);
 
     /*! \brief Constructor to use in PME-only rank and in tests.
      *
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
index 68c884f99b..269b6eb377 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
@@ -54,10 +54,7 @@ class StatePropagatorDataGpu::Impl
 {
 };
 
-StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream       */,
-                                               const DeviceStream* /* localStream     */,
-                                               const DeviceStream* /* nonLocalStream  */,
-                                               const DeviceContext& /* deviceContext   */,
+StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStreamManager& /* deviceStreamManager */,
                                                GpuApiCallBehavior /* transferKind    */,
                                                int /* allocationBlockSizeDivisor */,
                                                gmx_wallcycle* /*   wcycle */) :
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
index b057692547..fd9ff197ad 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
@@ -85,35 +85,15 @@ public:
      * ops are offloaded. This feature is currently not available in OpenCL and
      * hence these streams are not set in these builds.
      *
-     * \note In CUDA, the update stream is created in the constructor as a temporary
-     *       solution, in place until the stream manager is introduced.
-     *       Note that this makes it impossible to construct this object in CUDA
-     *       builds executing on a host without any CUDA-capable device available.
-     *
-     * \note In CUDA, \p deviceContext is unused, hence always nullptr;
-     *       all stream arguments can also be nullptr in runs where the
-     *       respective streams are not required.
-     *       In OpenCL, \p deviceContext needs to be a valid device context.
-     *       In OpenCL runs StatePropagatorDataGpu is currently only used
-     *       with PME offload, and only on ranks with PME duty. Hence, the
-     *       \p pmeStream argument needs to be a valid OpenCL queue object
-     *       which must have been created in \p deviceContext.
-     *
-     *  \param[in] pmeStream       Device PME stream, nullptr allowed.
-     *  \param[in] localStream     Device NBNXM local stream, nullptr allowed.
-     *  \param[in] nonLocalStream  Device NBNXM non-local stream, nullptr allowed.
-     *  \param[in] deviceContext   Device context, nullptr allowed.
-     *  \param[in] transferKind    H2D/D2H transfer call behavior (synchronous or not).
+     *  \param[in] deviceStreamManager         Object that owns the DeviceContext and DeviceStreams.
+     *  \param[in] transferKind                H2D/D2H transfer call behavior (synchronous or not).
      *  \param[in] allocationBlockSizeDivisor  Determines the padding size for coordinates buffer.
-     *  \param[in] wcycle          Wall cycle counter data.
+     *  \param[in] wcycle                      Wall cycle counter data.
      */
-    Impl(const DeviceStream*  pmeStream,
-         const DeviceStream*  localStream,
-         const DeviceStream*  nonLocalStream,
-         const DeviceContext& deviceContext,
-         GpuApiCallBehavior   transferKind,
-         int                  allocationBlockSizeDivisor,
-         gmx_wallcycle*       wcycle);
+    Impl(const DeviceStreamManager& deviceStreamManager,
+         GpuApiCallBehavior         transferKind,
+         int                        allocationBlockSizeDivisor,
+         gmx_wallcycle*             wcycle);
 
     /*! \brief Constructor to use in PME-only rank and in tests.
      *
@@ -346,9 +326,6 @@ private:
     //! GPU Update-constreaints stream.
     const DeviceStream* updateStream_;
 
-    //! An owning pointer to the update stream, in case we manage its lifetime here. Temporary.
-    DeviceStream updateStreamOwn_;
-
     // Streams to use for coordinates H2D and D2H copies (one event for each atom locality)
     EnumerationArray<AtomLocality, const DeviceStream*> xCopyStreams_ = { { nullptr } };
     // Streams to use for velocities H2D and D2H copies (one event for each atom locality)
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
index e60e9fa73b..bf927f2da2 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -46,14 +46,9 @@
 
 #if GMX_GPU != GMX_GPU_NONE
 
-#    if GMX_GPU == GMX_GPU_CUDA
-#        include "gromacs/gpu_utils/cudautils.cuh"
-#    endif
+#    include "gromacs/gpu_utils/device_stream_manager.h"
 #    include "gromacs/gpu_utils/devicebuffer.h"
 #    include "gromacs/gpu_utils/gputraits.h"
-#    if GMX_GPU == GMX_GPU_OPENCL
-#        include "gromacs/gpu_utils/oclutils.h"
-#    endif
 #    include "gromacs/math/vectypes.h"
 #    include "gromacs/mdtypes/state_propagator_data_gpu.h"
 #    include "gromacs/timing/wallcycle.h"
@@ -65,55 +60,31 @@
 namespace gmx
 {
 
-StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
-                                   const DeviceStream*  localStream,
-                                   const DeviceStream*  nonLocalStream,
-                                   const DeviceContext& deviceContext,
-                                   GpuApiCallBehavior   transferKind,
-                                   int                  allocationBlockSizeDivisor,
-                                   gmx_wallcycle*       wcycle) :
-    deviceContext_(deviceContext),
+StatePropagatorDataGpu::Impl::Impl(const DeviceStreamManager& deviceStreamManager,
+                                   GpuApiCallBehavior         transferKind,
+                                   int                        allocationBlockSizeDivisor,
+                                   gmx_wallcycle*             wcycle) :
+    deviceContext_(deviceStreamManager.context()),
     transferKind_(transferKind),
     allocationBlockSizeDivisor_(allocationBlockSizeDivisor),
     wcycle_(wcycle)
 {
-    static_assert(GMX_GPU != GMX_GPU_NONE,
-                  "This object should only be constructed on the GPU code-paths.");
+    static_assert(
+            GMX_GPU != GMX_GPU_NONE,
+            "GPU state propagator data object should only be constructed on the GPU code-paths.");
 
-    // TODO: Refactor when the StreamManager is introduced.
+    // We need to keep local copies for re-initialization.
+    pmeStream_      = &deviceStreamManager.stream(DeviceStreamType::Pme);
+    localStream_    = &deviceStreamManager.stream(DeviceStreamType::NonBondedLocal);
+    nonLocalStream_ = &deviceStreamManager.stream(DeviceStreamType::NonBondedNonLocal);
+    // In OpenCL the PME stream doubles as the update stream (used for H2D coordinate transfer)
     if (GMX_GPU == GMX_GPU_OPENCL)
     {
-        GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set in OpenCL builds.");
-
-        // The update stream is set to the PME stream in OpenCL, since PME stream is the only stream created in the PME context.
-        pmeStream_    = pmeStream;
-        updateStream_ = pmeStream;
-        GMX_UNUSED_VALUE(localStream);
-        GMX_UNUSED_VALUE(nonLocalStream);
+        updateStream_ = &deviceStreamManager.stream(DeviceStreamType::Pme);
     }
-
-    if (GMX_GPU == GMX_GPU_CUDA)
+    else
     {
-        if (pmeStream != nullptr)
-        {
-            pmeStream_ = pmeStream;
-        }
-        if (localStream != nullptr)
-        {
-            localStream_ = localStream;
-        }
-        if (nonLocalStream != nullptr)
-        {
-            nonLocalStream_ = nonLocalStream;
-        }
-
-        // TODO: The update stream should be created only when it is needed.
-#    if (GMX_GPU == GMX_GPU_CUDA)
-        // In CUDA we only need priority to create stream.
-        // (note that this will be moved from here in the follow-up patch)
-        updateStreamOwn_.init(deviceContext, DeviceStreamPriority::Normal, false);
-        updateStream_ = &updateStreamOwn_;
-#    endif
+        updateStream_ = &deviceStreamManager.stream(DeviceStreamType::UpdateAndConstraints);
     }
 
     // Map the atom locality to the stream that will be used for coordinates,
@@ -142,10 +113,11 @@ StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
     allocationBlockSizeDivisor_(allocationBlockSizeDivisor),
     wcycle_(wcycle)
 {
-    static_assert(GMX_GPU != GMX_GPU_NONE,
-                  "This object should only be constructed on the GPU code-paths.");
+    static_assert(
+            GMX_GPU != GMX_GPU_NONE,
+            "GPU state propagator data object should only be constructed on the GPU code-paths.");
 
-    GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set.");
+    GMX_ASSERT(pmeStream != nullptr && pmeStream->isValid(), "GPU PME stream should be valid.");
     pmeStream_      = pmeStream;
     localStream_    = pmeStream; // For clearing the force buffer
     nonLocalStream_ = nullptr;
@@ -256,8 +228,7 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<RVec>
 
     GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");
 
-    GMX_ASSERT(deviceStream.stream() != nullptr,
-               "No stream is valid for copying with given atom locality.");
+    GMX_ASSERT(deviceStream.isValid(), "No stream is valid for copying with given atom locality.");
     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
 
@@ -291,8 +262,7 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_dat
 
     GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");
 
-    GMX_ASSERT(deviceStream.stream() != nullptr,
-               "No stream is valid for copying with given atom locality.");
+    GMX_ASSERT(deviceStream.isValid(), "No stream is valid for copying with given atom locality.");
     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
 
@@ -546,14 +516,11 @@ int StatePropagatorDataGpu::Impl::numAtomsAll()
 }
 
 
-StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream*  pmeStream,
-                                               const DeviceStream*  localStream,
-                                               const DeviceStream*  nonLocalStream,
-                                               const DeviceContext& deviceContext,
-                                               GpuApiCallBehavior   transferKind,
-                                               int                  allocationBlockSizeDivisor,
-                                               gmx_wallcycle*       wcycle) :
-    impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, allocationBlockSizeDivisor, wcycle))
+StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStreamManager& deviceStreamManager,
+                                               GpuApiCallBehavior         transferKind,
+                                               int            allocationBlockSizeDivisor,
+                                               gmx_wallcycle* wcycle) :
+    impl_(new Impl(deviceStreamManager, transferKind, allocationBlockSizeDivisor, wcycle))
 {
 }
 
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index c015326e8d..95aca5ba1d 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -366,7 +366,7 @@ static inline int calc_shmem_required_nonbonded(const int               num_thre
  */
 void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu* nb, const InteractionLocality interactionLocality)
 {
-    const DeviceStream& deviceStream = nb->deviceStreams[interactionLocality];
+    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
 
     /* When we get here all misc operations issued in the local stream as well as
        the local xq H2D are done,
@@ -405,7 +405,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
     cu_atomdata_t*      adat         = nb->atdat;
     cu_plist_t*         plist        = nb->plist[iloc];
     cu_timers_t*        t            = nb->timers;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
 
@@ -485,7 +485,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In
     cu_nbparam_t*       nbp          = nb->nbparam;
     cu_plist_t*         plist        = nb->plist[iloc];
     cu_timers_t*        t            = nb->timers;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
 
@@ -598,7 +598,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     cu_nbparam_t*       nbp          = nb->nbparam;
     cu_plist_t*         plist        = nb->plist[iloc];
     cu_timers_t*        t            = nb->timers;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
 
@@ -732,7 +732,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     cu_atomdata_t*      adat         = nb->atdat;
     cu_timers_t*        t            = nb->timers;
     bool                bDoTime      = nb->bDoTime;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     /* don't launch non-local copy-back if there was no non-local work to do */
     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
@@ -836,7 +836,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
     const int                  numAtomsPerCell = grid.numAtomsPerCell();
     Nbnxm::InteractionLocality interactionLoc  = gpuAtomToInteractionLocality(locality);
 
-    const DeviceStream& deviceStream = nb->deviceStreams[interactionLoc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLoc];
 
     int numAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
     // avoid empty kernel launch, skip to inserting stream dependency
@@ -901,7 +901,7 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLo
     GMX_ASSERT(totalForcesDevice, "Need a valid totalForcesDevice pointer");
 
     const InteractionLocality iLocality    = gpuAtomToInteractionLocality(atomLocality);
-    const DeviceStream&       deviceStream = nb->deviceStreams[iLocality];
+    const DeviceStream&       deviceStream = *nb->deviceStreams[iLocality];
     cu_atomdata_t*            adat         = nb->atdat;
 
     size_t gmx_used_in_debug numDependency = static_cast<size_t>((useGpuFPmeReduction == true))
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index a76880b17e..6579d41004 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -51,7 +51,7 @@
 
 // TODO Remove this comment when the above order issue is resolved
 #include "gromacs/gpu_utils/cudautils.cuh"
-#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/pmalloc_cuda.h"
@@ -413,16 +413,16 @@ static void cuda_init_const(NbnxmGpu*                       nb,
     nbnxn_cuda_clear_e_fshift(nb);
 }
 
-NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
-                   const interaction_const_t* ic,
-                   const PairlistParams&      listParams,
-                   const nbnxn_atomdata_t*    nbat,
-                   bool                       bLocalAndNonlocal)
+NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
+                   const interaction_const_t*      ic,
+                   const PairlistParams&           listParams,
+                   const nbnxn_atomdata_t*         nbat,
+                   bool                            bLocalAndNonlocal)
 {
     cudaError_t stat;
 
     auto nb            = new NbnxmGpu();
-    nb->deviceContext_ = &deviceContext;
+    nb->deviceContext_ = &deviceStreamManager.context();
     snew(nb->atdat, 1);
     snew(nb->nbparam, 1);
     snew(nb->plist[InteractionLocality::Local], 1);
@@ -444,8 +444,10 @@ NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
     init_plist(nb->plist[InteractionLocality::Local]);
 
     /* local/non-local GPU streams */
-    nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceContext_,
-                                                       DeviceStreamPriority::Normal, nb->bDoTime);
+    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
+                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
+    nb->deviceStreams[InteractionLocality::Local] =
+            &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
     if (nb->bUseTwoStreams)
     {
         init_plist(nb->plist[InteractionLocality::NonLocal]);
@@ -454,8 +456,12 @@ NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
          * priorities, because we are querying the priority range which in this
          * case will be a single value.
          */
-        nb->deviceStreams[InteractionLocality::NonLocal].init(
-                *nb->deviceContext_, DeviceStreamPriority::High, nb->bDoTime);
+        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
+                           "Non-local non-bonded stream should be initialized to use GPU for "
+                           "non-bonded with domain decomposition.");
+        // Borrow the stream from the manager, which owns it; only the pointer is stored here.
+        nb->deviceStreams[InteractionLocality::NonLocal] =
+                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
     }
 
     /* init events for sychronization (timing disabled for performance reasons!) */
@@ -504,7 +510,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
 {
     char                sbuf[STRLEN];
     bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
     cu_plist_t*         d_plist      = nb->plist[iloc];
 
     if (d_plist->na_c < 0)
@@ -561,7 +567,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
 void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
 {
     cu_atomdata_t* adat = nb->atdat;
-    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local].stream();
+    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
 
     /* only if we have a dynamic box */
     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
@@ -576,7 +582,7 @@ static void nbnxn_cuda_clear_f(NbnxmGpu* nb, int natoms_clear)
 {
     cudaError_t    stat;
     cu_atomdata_t* adat = nb->atdat;
-    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local].stream();
+    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
 
     stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
     CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
@@ -587,7 +593,7 @@ static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb)
 {
     cudaError_t    stat;
     cu_atomdata_t* adat = nb->atdat;
-    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local].stream();
+    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
 
     stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
     CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
@@ -616,7 +622,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
     bool                bDoTime      = nb->bDoTime;
     cu_timers_t*        timers       = nb->timers;
     cu_atomdata_t*      d_atdat      = nb->atdat;
-    const DeviceStream& deviceStream = nb->deviceStreams[InteractionLocality::Local];
+    const DeviceStream& deviceStream = *nb->deviceStreams[InteractionLocality::Local];
 
     natoms    = nbat->numAtoms();
     realloced = false;
@@ -806,13 +812,6 @@ gmx_bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
     return ((nb->nbparam->eeltype == eelCuEWALD_ANA) || (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
 }
 
-const DeviceStream* gpu_get_command_stream(NbnxmGpu* nb, const InteractionLocality iloc)
-{
-    assert(nb);
-
-    return &nb->deviceStreams[iloc];
-}
-
 void* gpu_get_xq(NbnxmGpu* nb)
 {
     assert(nb);
@@ -838,7 +837,7 @@ DeviceBuffer<gmx::RVec> gpu_get_fshift(NbnxmGpu* nb)
 /* TODO  Remove explicit pinning from host arrays from here and manage in a more natural way*/
 void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
 {
-    const DeviceStream& deviceStream  = gpu_nbv->deviceStreams[InteractionLocality::Local];
+    const DeviceStream& deviceStream  = *gpu_nbv->deviceStreams[InteractionLocality::Local];
     bool                bDoTime       = gpu_nbv->bDoTime;
     const int           maxNumColumns = gridSet.numColumnsMax();
 
@@ -929,7 +928,7 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int*                  cell,
                                     GpuEventSynchronizer* const localReductionDone)
 {
 
-    const DeviceStream& deviceStream = gpu_nbv->deviceStreams[InteractionLocality::Local];
+    const DeviceStream& deviceStream = *gpu_nbv->deviceStreams[InteractionLocality::Local];
 
     GMX_ASSERT(localReductionDone, "localReductionDone should be a valid pointer");
     gpu_nbv->localFReductionDone = localReductionDone;
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
index de5241a5fe..68d5da81c4 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -306,7 +306,7 @@ struct NbnxmGpu
     /*! \brief staging area where fshift/energies get downloaded */
     nb_staging_t nbst;
     /*! \brief local and non-local GPU streams */
-    gmx::EnumerationArray<Nbnxm::InteractionLocality, DeviceStream> deviceStreams;
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;
 
     /*! \brief Events used for synchronization */
     /*! \{ */
diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h
index f5b3d813da..01f257e54f 100644
--- a/src/gromacs/nbnxm/gpu_common.h
+++ b/src/gromacs/nbnxm/gpu_common.h
@@ -399,7 +399,7 @@ bool gpu_try_finish_task(NbnxmGpu*                nb,
             // GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
 
-            if (!haveStreamTasksCompleted(nb->deviceStreams[iLocality]))
+            if (!haveStreamTasksCompleted(*nb->deviceStreams[iLocality]))
             {
                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
 
@@ -412,7 +412,7 @@ bool gpu_try_finish_task(NbnxmGpu*                nb,
         }
         else if (haveResultToWaitFor)
         {
-            nb->deviceStreams[iLocality].synchronize();
+            nb->deviceStreams[iLocality]->synchronize();
         }
 
         // TODO: this needs to be moved later because conditional wait could brake timing
diff --git a/src/gromacs/nbnxm/gpu_data_mgmt.h b/src/gromacs/nbnxm/gpu_data_mgmt.h
index 8e114d1c65..21fc8174b5 100644
--- a/src/gromacs/nbnxm/gpu_data_mgmt.h
+++ b/src/gromacs/nbnxm/gpu_data_mgmt.h
@@ -50,9 +50,6 @@
 #include "gromacs/gpu_utils/gpu_macros.h"
 #include "gromacs/mdtypes/locality.h"
 
-class DeviceContext;
-class DeviceStream;
-
 struct NbnxmGpu;
 struct gmx_gpu_info_t;
 struct DeviceInformation;
@@ -62,12 +59,19 @@ struct NbnxnPairlistGpu;
 struct PairlistParams;
 struct interaction_const_t;
 
+class DeviceStream;
+
+namespace gmx
+{
+class DeviceStreamManager;
+}
+
 namespace Nbnxm
 {
 
 /** Initializes the data structures related to GPU nonbonded calculations. */
 GPU_FUNC_QUALIFIER
-NbnxmGpu* gpu_init(const DeviceContext gmx_unused& deviceContext,
+NbnxmGpu* gpu_init(const gmx::DeviceStreamManager gmx_unused& deviceStreamManager,
                    const interaction_const_t gmx_unused* ic,
                    const PairlistParams gmx_unused& listParams,
                    const nbnxn_atomdata_t gmx_unused* nbat,
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index a15f646ed9..d5ced753f7 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -120,7 +120,6 @@
 #include "gromacs/utility/enumerationhelpers.h"
 #include "gromacs/utility/real.h"
 
-class DeviceContext;
 struct DeviceInformation;
 struct gmx_domdec_zones_t;
 struct gmx_enerdata_t;
@@ -144,6 +143,7 @@ class GpuEventSynchronizer;
 
 namespace gmx
 {
+class DeviceStreamManager;
 class ForceWithShiftForces;
 class GpuBonded;
 template<typename>
@@ -404,16 +404,16 @@ namespace Nbnxm
 {
 
 /*! \brief Creates an Nbnxm object */
-std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger&     mdlog,
-                                                   const t_inputrec*        ir,
-                                                   const t_forcerec*        fr,
-                                                   const t_commrec*         cr,
-                                                   const gmx_hw_info_t&     hardwareInfo,
-                                                   const DeviceInformation* deviceInfo,
-                                                   const DeviceContext*     deviceContext,
-                                                   const gmx_mtop_t*        mtop,
-                                                   matrix                   box,
-                                                   gmx_wallcycle*           wcycle);
+std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger& mdlog,
+                                                   const t_inputrec*    ir,
+                                                   const t_forcerec*    fr,
+                                                   const t_commrec*     cr,
+                                                   const gmx_hw_info_t& hardwareInfo,
+                                                   bool                 useGpuForNonbonded,
+                                                   const gmx::DeviceStreamManager* deviceStreamManager,
+                                                   const gmx_mtop_t*               mtop,
+                                                   matrix                          box,
+                                                   gmx_wallcycle*                  wcycle);
 
 } // namespace Nbnxm
 
diff --git a/src/gromacs/nbnxm/nbnxm_setup.cpp b/src/gromacs/nbnxm/nbnxm_setup.cpp
index d65c59c91d..ea90c12b0c 100644
--- a/src/gromacs/nbnxm/nbnxm_setup.cpp
+++ b/src/gromacs/nbnxm/nbnxm_setup.cpp
@@ -358,25 +358,24 @@ static int getMinimumIlistCountForGpuBalancing(NbnxmGpu* nbnxmGpu)
     return minimumIlistCount;
 }
 
-std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger&     mdlog,
-                                                   const t_inputrec*        ir,
-                                                   const t_forcerec*        fr,
-                                                   const t_commrec*         cr,
-                                                   const gmx_hw_info_t&     hardwareInfo,
-                                                   const DeviceInformation* deviceInfo,
-                                                   const DeviceContext*     deviceContext,
-                                                   const gmx_mtop_t*        mtop,
-                                                   matrix                   box,
-                                                   gmx_wallcycle*           wcycle)
+std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger& mdlog,
+                                                   const t_inputrec*    ir,
+                                                   const t_forcerec*    fr,
+                                                   const t_commrec*     cr,
+                                                   const gmx_hw_info_t& hardwareInfo,
+                                                   const bool           useGpuForNonbonded,
+                                                   const gmx::DeviceStreamManager* deviceStreamManager,
+                                                   const gmx_mtop_t*               mtop,
+                                                   matrix                          box,
+                                                   gmx_wallcycle*                  wcycle)
 {
     const bool emulateGpu = (getenv("GMX_EMULATE_GPU") != nullptr);
-    const bool useGpu     = deviceInfo != nullptr;
 
-    GMX_RELEASE_ASSERT(!(emulateGpu && useGpu),
+    GMX_RELEASE_ASSERT(!(emulateGpu && useGpuForNonbonded),
                        "When GPU emulation is active, there cannot be a GPU assignment");
 
     NonbondedResource nonbondedResource;
-    if (useGpu)
+    if (useGpuForNonbonded)
     {
         nonbondedResource = NonbondedResource::Gpu;
     }
@@ -425,7 +424,8 @@ std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger&     mdlo
         enbnxninitcombrule = enbnxninitcombruleNONE;
     }
 
-    auto pinPolicy = (useGpu ? gmx::PinningPolicy::PinnedIfSupported : gmx::PinningPolicy::CannotBePinned);
+    auto pinPolicy = (useGpuForNonbonded ? gmx::PinningPolicy::PinnedIfSupported
+                                         : gmx::PinningPolicy::CannotBePinned);
 
     auto nbat = std::make_unique<nbnxn_atomdata_t>(pinPolicy);
 
@@ -440,18 +440,18 @@ std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger&     mdlo
     }
     nbnxn_atomdata_init(mdlog, nbat.get(), kernelSetup.kernelType, enbnxninitcombrule, fr->ntype,
                         fr->nbfp, mimimumNumEnergyGroupNonbonded,
-                        (useGpu || emulateGpu) ? 1 : gmx_omp_nthreads_get(emntNonbonded));
+                        (useGpuForNonbonded || emulateGpu) ? 1 : gmx_omp_nthreads_get(emntNonbonded));
 
     NbnxmGpu* gpu_nbv                          = nullptr;
     int       minimumIlistCountForGpuBalancing = 0;
-    if (useGpu)
+    if (useGpuForNonbonded)
     {
-        GMX_RELEASE_ASSERT(
-                deviceContext != nullptr,
-                "Device context can not be nullptr when to use GPU for non-bonded forces.");
         /* init the NxN GPU data; the last argument tells whether we'll have
          * both local and non-local NB calculation on GPU */
-        gpu_nbv = gpu_init(*deviceContext, fr->ic, pairlistParams, nbat.get(), haveMultipleDomains);
+        GMX_RELEASE_ASSERT(
+                (deviceStreamManager != nullptr),
+                "Device stream manager should be initialized in order to use GPU for non-bonded.");
+        gpu_nbv = gpu_init(*deviceStreamManager, fr->ic, pairlistParams, nbat.get(), haveMultipleDomains);
 
         minimumIlistCountForGpuBalancing = getMinimumIlistCountForGpuBalancing(gpu_nbv);
     }
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
index ca6d9e4b19..a35a188400 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -487,7 +487,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
     cl_atomdata_t*      adat         = nb->atdat;
     cl_plist_t*         plist        = nb->plist[iloc];
     cl_timers_t*        t            = nb->timers;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
 
@@ -587,7 +587,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb
     cl_nbparam_t*       nbp          = nb->nbparam;
     cl_plist_t*         plist        = nb->plist[iloc];
     cl_timers_t*        t            = nb->timers;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
 
@@ -725,7 +725,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     cl_nbparam_t*       nbp          = nb->nbparam;
     cl_plist_t*         plist        = nb->plist[iloc];
     cl_timers_t*        t            = nb->timers;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
     bool                bDoTime      = nb->bDoTime;
 
     if (plist->haveFreshList)
@@ -862,7 +862,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     cl_atomdata_t*      adat         = nb->atdat;
     cl_timers_t*        t            = nb->timers;
     bool                bDoTime      = nb->bDoTime;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     /* don't launch non-local copy-back if there was no non-local work to do */
     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
index f11aa2d807..bc913e0e24 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
@@ -52,6 +52,7 @@
 
 #include <cmath>
 
+#include "gromacs/gpu_utils/device_stream_manager.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/gpu_utils/oclutils.h"
 #include "gromacs/hardware/gpu_hw_info.h"
@@ -485,7 +486,7 @@ static void nbnxn_ocl_clear_e_fshift(NbnxmGpu* nb)
 
     cl_int           cl_error;
     cl_atomdata_t*   adat = nb->atdat;
-    cl_command_queue ls   = nb->deviceStreams[InteractionLocality::Local].stream();
+    cl_command_queue ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
 
     size_t local_work_size[3]  = { 1, 1, 1 };
     size_t global_work_size[3] = { 1, 1, 1 };
@@ -555,16 +556,16 @@ static void nbnxn_ocl_init_const(cl_atomdata_t*                  atomData,
 
 
 //! This function is documented in the header file
-NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
-                   const interaction_const_t* ic,
-                   const PairlistParams&      listParams,
-                   const nbnxn_atomdata_t*    nbat,
-                   const bool                 bLocalAndNonlocal)
+NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
+                   const interaction_const_t*      ic,
+                   const PairlistParams&           listParams,
+                   const nbnxn_atomdata_t*         nbat,
+                   const bool                      bLocalAndNonlocal)
 {
     GMX_ASSERT(ic, "Need a valid interaction constants object");
 
     auto nb            = new NbnxmGpu();
-    nb->deviceContext_ = &deviceContext;
+    nb->deviceContext_ = &deviceStreamManager.context();
     snew(nb->atdat, 1);
     snew(nb->nbparam, 1);
     snew(nb->plist[InteractionLocality::Local], 1);
@@ -578,6 +579,7 @@ NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
     nb->timers = new cl_timers_t();
     snew(nb->timings, 1);
 
+    /* allocate the device runtime data structure */
     nb->dev_rundata = new gmx_device_runtime_data_t();
 
     /* init nbst */
@@ -591,15 +593,20 @@ NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
     nb->bDoTime = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
 
     /* local/non-local GPU streams */
-    nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceContext_,
-                                                       DeviceStreamPriority::Normal, nb->bDoTime);
+    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
+                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
+    nb->deviceStreams[InteractionLocality::Local] =
+            &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
 
     if (nb->bUseTwoStreams)
     {
         init_plist(nb->plist[InteractionLocality::NonLocal]);
 
-        nb->deviceStreams[InteractionLocality::NonLocal].init(
-                *nb->deviceContext_, DeviceStreamPriority::High, nb->bDoTime);
+        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
+                           "Non-local non-bonded stream should be initialized to use GPU for "
+                           "non-bonded with domain decomposition.");
+        nb->deviceStreams[InteractionLocality::NonLocal] =
+                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
     }
 
     if (nb->bDoTime)
@@ -647,7 +654,7 @@ static void nbnxn_ocl_clear_f(NbnxmGpu* nb, int natoms_clear)
     cl_int gmx_used_in_debug cl_error;
 
     cl_atomdata_t*   atomData = nb->atdat;
-    cl_command_queue ls       = nb->deviceStreams[InteractionLocality::Local].stream();
+    cl_command_queue ls       = nb->deviceStreams[InteractionLocality::Local]->stream();
     cl_float         value    = 0.0F;
 
     cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float), 0,
@@ -669,7 +676,7 @@ void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
 
     /* kick off buffer clearing kernel to ensure concurrency with constraints/update */
     cl_int gmx_unused cl_error;
-    cl_error = clFlush(nb->deviceStreams[InteractionLocality::Local].stream());
+    cl_error = clFlush(nb->deviceStreams[InteractionLocality::Local]->stream());
     GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
 }
 
@@ -681,7 +688,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
     // because getLastRangeTime() gets skipped with empty lists later
     // which leads to the counter not being reset.
     bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
     cl_plist_t*         d_plist      = nb->plist[iloc];
 
     if (d_plist->na_c < 0)
@@ -740,7 +747,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
 void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
 {
     cl_atomdata_t*   adat = nb->atdat;
-    cl_command_queue ls   = nb->deviceStreams[InteractionLocality::Local].stream();
+    cl_command_queue ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
 
     /* only if we have a dynamic box */
     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
@@ -760,7 +767,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
     bool                bDoTime      = nb->bDoTime;
     cl_timers_t*        timers       = nb->timers;
     cl_atomdata_t*      d_atdat      = nb->atdat;
-    const DeviceStream& deviceStream = nb->deviceStreams[InteractionLocality::Local];
+    const DeviceStream& deviceStream = *nb->deviceStreams[InteractionLocality::Local];
 
     natoms    = nbat->numAtoms();
     realloced = false;
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
index 6702907523..cda2294783 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
@@ -364,7 +364,7 @@ struct NbnxmGpu
     nb_staging_t nbst;
 
     //! local and non-local GPU queues
-    gmx::EnumerationArray<Nbnxm::InteractionLocality, DeviceStream> deviceStreams;
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;
 
     /*! \brief Events used for synchronization */
     /*! \{ */
-- 
2.22.0