From 6975fbfd22030a08bbe076151a3de69894a4de2f Mon Sep 17 00:00:00 2001
From: Artem Zhmurov <zhmurov@gmail.com>
Date: Sat, 1 Feb 2020 16:40:13 +0100
Subject: [PATCH] Take over management of OpenCL context from PME and NBNXM

This patch set creates the DeviceContext in the runner and passes it to
the consumers (PME and NBNXM). This removes unnecessary duplication of
management code and makes the device buffers in the two modules compatible.

Fixes #2522
Fixes #3315
Refs #3311

Change-Id: I10358cfaced5b5c7dbdddf95679c9a9703f3a2c0
---
 src/gromacs/domdec/domdec.cpp                 |  8 ++-
 src/gromacs/domdec/domdec.h                   |  8 ++-
 src/gromacs/domdec/gpuhaloexchange.h          |  9 +++-
 src/gromacs/domdec/gpuhaloexchange_impl.cpp   |  1 +
 src/gromacs/domdec/gpuhaloexchange_impl.cu    | 26 +++++-----
 src/gromacs/domdec/gpuhaloexchange_impl.cuh   | 12 +++--
 src/gromacs/ewald/pme.h                       |  7 ---
 src/gromacs/ewald/pme_gpu.cpp                 |  8 ---
 src/gromacs/ewald/pme_gpu_internal.cpp        |  8 ---
 src/gromacs/ewald/pme_gpu_internal.h          |  7 ---
 src/gromacs/ewald/pme_gpu_program.cpp         | 11 ++---
 src/gromacs/ewald/pme_gpu_program.h           |  7 ++-
 src/gromacs/ewald/pme_gpu_program_impl.cpp    |  4 +-
 src/gromacs/ewald/pme_gpu_program_impl.cu     |  5 +-
 src/gromacs/ewald/pme_gpu_program_impl.h      |  7 ++-
 .../ewald/pme_gpu_program_impl_ocl.cpp        |  4 +-
 src/gromacs/ewald/pme_only.cpp                | 11 +++--
 src/gromacs/ewald/pme_only.h                  |  4 +-
 src/gromacs/ewald/pme_pp_comm_gpu.h           |  6 ++-
 src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp    | 21 ++++----
 src/gromacs/ewald/pme_pp_comm_gpu_impl.cu     | 11 ++++-
 src/gromacs/ewald/pme_pp_comm_gpu_impl.h      |  7 ++-
 src/gromacs/ewald/tests/pmegathertest.cpp     |  4 +-
 .../ewald/tests/pmesplinespreadtest.cpp       |  4 +-
 src/gromacs/ewald/tests/pmetestcommon.cpp     |  9 ++--
 src/gromacs/ewald/tests/pmetestcommon.h       |  3 +-
 .../ewald/tests/testhardwarecontexts.cpp      |  6 ++-
 .../ewald/tests/testhardwarecontexts.h        | 29 ++++++++---
 src/gromacs/gpu_utils/device_context.h        | 14 ++----
 src/gromacs/gpu_utils/device_context_ocl.cpp  | 12 +----
 src/gromacs/gpu_utils/device_context_ocl.h    | 13 ++---
 src/gromacs/gpu_utils/oclutils.h              |  5 +-
 src/gromacs/listed_forces/gpubonded.h         |  6 ++-
 src/gromacs/listed_forces/gpubonded_impl.cpp  |  5 +-
 src/gromacs/listed_forces/gpubonded_impl.cu   | 14 ++++--
 src/gromacs/listed_forces/gpubonded_impl.h    |  6 +--
 src/gromacs/mdlib/forcerec.h                  |  1 -
 src/gromacs/mdlib/leapfrog_gpu.cuh            |  2 +-
 src/gromacs/mdlib/lincs_gpu.cuh               |  2 +-
 src/gromacs/mdlib/settle_gpu.cuh              |  2 +-
 src/gromacs/mdlib/update_constrain_gpu.h      |  3 ++
 .../mdlib/update_constrain_gpu_impl.cpp       |  1 +
 .../mdlib/update_constrain_gpu_impl.cu        |  6 ++-
 src/gromacs/mdlib/update_constrain_gpu_impl.h | 11 +++--
 src/gromacs/mdrun/md.cpp                      | 14 ++++--
 src/gromacs/mdrun/runner.cpp                  | 49 +++++++++++++++----
 src/gromacs/mdtypes/forcerec.h                |  4 ++
 .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu        |  3 +-
 src/gromacs/nbnxm/gpu_data_mgmt.h             |  3 ++
 src/gromacs/nbnxm/nbnxm.h                     |  2 +
 src/gromacs/nbnxm/nbnxm_setup.cpp             |  7 ++-
 .../nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp      | 39 +++++++--------
 .../nbnxm/opencl/nbnxm_ocl_jit_support.cpp    |  8 +--
 53 files changed, 289 insertions(+), 190 deletions(-)

diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp
index e42765cc8f..e020a1405b 100644
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -3200,7 +3200,11 @@ gmx_bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const g
     return bCutoffAllowed;
 }
 
-void constructGpuHaloExchange(const gmx::MDLogger& mdlog, const t_commrec& cr, void* streamLocal, void* streamNonLocal)
+void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
+                              const t_commrec&     cr,
+                              const DeviceContext& deviceContext,
+                              void*                streamLocal,
+                              void*                streamNonLocal)
 {
 
     int gpuHaloExchangeSize = 0;
@@ -3224,7 +3228,7 @@ void constructGpuHaloExchange(const gmx::MDLogger& mdlog, const t_commrec& cr, v
         for (int pulse = pulseStart; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
         {
             cr.dd->gpuHaloExchange.push_back(std::make_unique<gmx::GpuHaloExchange>(
-                    cr.dd, cr.mpi_comm_mysim, streamLocal, streamNonLocal, pulse));
+                    cr.dd, cr.mpi_comm_mysim, deviceContext, streamLocal, streamNonLocal, pulse));
         }
     }
 }
diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h
index 51aba44e5c..0a7aa3202e 100644
--- a/src/gromacs/domdec/domdec.h
+++ b/src/gromacs/domdec/domdec.h
@@ -85,6 +85,7 @@ struct t_nrnb;
 struct gmx_wallcycle;
 enum class PbcType : int;
 class t_state;
+class DeviceContext;
 class GpuEventSynchronizer;
 
 namespace gmx
@@ -314,10 +315,15 @@ void dd_bonded_cg_distance(const gmx::MDLogger& mdlog,
 /*! \brief Construct the GPU halo exchange object(s)
  * \param[in] mdlog          The logger object
  * \param[in] cr             The commrec object
+ * \param[in] deviceContext  GPU device context
  * \param[in] streamLocal    The local GPU stream
  * \param[in] streamNonLocal The non-local GPU stream
  */
-void constructGpuHaloExchange(const gmx::MDLogger& mdlog, const t_commrec& cr, void* streamLocal, void* streamNonLocal);
+void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
+                              const t_commrec&     cr,
+                              const DeviceContext& deviceContext,
+                              void*                streamLocal,
+                              void*                streamNonLocal);
 
 /*! \brief
  * (Re-) Initialization for GPU halo exchange
diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h
index dc65cb93d3..851e3d1983 100644
--- a/src/gromacs/domdec/gpuhaloexchange.h
+++ b/src/gromacs/domdec/gpuhaloexchange.h
@@ -49,6 +49,7 @@
 #include "gromacs/utility/gmxmpi.h"
 
 struct gmx_domdec_t;
+class DeviceContext;
 class GpuEventSynchronizer;
 
 namespace gmx
@@ -80,11 +81,17 @@ public:
      *
      * \param [inout] dd                       domdec structure
      * \param [in]    mpi_comm_mysim           communicator used for simulation
+     * \param [in]    deviceContext            GPU device context
      * \param [in]    streamLocal              local NB CUDA stream.
      * \param [in]    streamNonLocal           non-local NB CUDA stream.
      * \param [in]    pulse                    the communication pulse for this instance
      */
-    GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* streamLocal, void* streamNonLocal, int pulse);
+    GpuHaloExchange(gmx_domdec_t*        dd,
+                    MPI_Comm             mpi_comm_mysim,
+                    const DeviceContext& deviceContext,
+                    void*                streamLocal,
+                    void*                streamNonLocal,
+                    int                  pulse);
     ~GpuHaloExchange();
 
     /*! \brief
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
index 1ce9a9d93e..c8ca5df8c2 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
@@ -62,6 +62,7 @@ class GpuHaloExchange::Impl
 /*!\brief Constructor stub. */
 GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */,
                                  MPI_Comm /* mpi_comm_mysim */,
+                                 const DeviceContext& /* deviceContext */,
                                  void* /*streamLocal */,
                                  void* /*streamNonLocal */,
                                  int /*pulse */) :
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu
index 92a1d9f3d5..4a44beb3e6 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cu
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu
@@ -54,6 +54,7 @@
 #include "gromacs/domdec/domdec_struct.h"
 #include "gromacs/domdec/gpuhaloexchange.h"
 #include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/typecasts.cuh"
@@ -415,11 +416,12 @@ GpuEventSynchronizer* GpuHaloExchange::Impl::getForcesReadyOnDeviceEvent()
 }
 
 /*! \brief Create Domdec GPU object */
-GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd,
-                            MPI_Comm      mpi_comm_mysim,
-                            void*         localStream,
-                            void*         nonLocalStream,
-                            int           pulse) :
+GpuHaloExchange::Impl::Impl(gmx_domdec_t*        dd,
+                            MPI_Comm             mpi_comm_mysim,
+                            const DeviceContext& deviceContext,
+                            void*                localStream,
+                            void*                nonLocalStream,
+                            int                  pulse) :
     dd_(dd),
     sendRankX_(dd->neighbor[0][1]),
     recvRankX_(dd->neighbor[0][0]),
@@ -428,6 +430,7 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd,
     usePBC_(dd->ci[dd->dim[0]] == 0),
     haloDataTransferLaunched_(new GpuEventSynchronizer()),
     mpi_comm_mysim_(mpi_comm_mysim),
+    deviceContext_(deviceContext),
     localStream_(*static_cast<cudaStream_t*>(localStream)),
     nonLocalStream_(*static_cast<cudaStream_t*>(nonLocalStream)),
     pulse_(pulse)
@@ -460,12 +463,13 @@ GpuHaloExchange::Impl::~Impl()
     delete haloDataTransferLaunched_;
 }
 
-GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd,
-                                 MPI_Comm      mpi_comm_mysim,
-                                 void*         localStream,
-                                 void*         nonLocalStream,
-                                 int           pulse) :
-    impl_(new Impl(dd, mpi_comm_mysim, localStream, nonLocalStream, pulse))
+GpuHaloExchange::GpuHaloExchange(gmx_domdec_t*        dd,
+                                 MPI_Comm             mpi_comm_mysim,
+                                 const DeviceContext& deviceContext,
+                                 void*                localStream,
+                                 void*                nonLocalStream,
+                                 int                  pulse) :
+    impl_(new Impl(dd, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse))
 {
 }
 
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh
index a8d2f9204c..ba22bc5262 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh
@@ -71,11 +71,17 @@ public:
      *
      * \param [inout] dd                       domdec structure
      * \param [in]    mpi_comm_mysim           communicator used for simulation
+     * \param [in]    deviceContext            GPU device context
      * \param [in]    localStream              local NB CUDA stream
      * \param [in]    nonLocalStream           non-local NB CUDA stream
      * \param [in]    pulse                    the communication pulse for this instance
      */
-    Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream, int pulse);
+    Impl(gmx_domdec_t*        dd,
+         MPI_Comm             mpi_comm_mysim,
+         const DeviceContext& deviceContext,
+         void*                localStream,
+         void*                nonLocalStream,
+         int                  pulse);
     ~Impl();
 
     /*! \brief
@@ -176,8 +182,8 @@ private:
     GpuEventSynchronizer* haloDataTransferLaunched_ = nullptr;
     //! MPI communicator used for simulation
     MPI_Comm mpi_comm_mysim_;
-    //! Dummy GPU context object
-    const DeviceContext deviceContext_;
+    //! GPU context object
+    const DeviceContext& deviceContext_;
     //! CUDA stream for local non-bonded calculations
     cudaStream_t localStream_ = nullptr;
     //! CUDA stream for non-local non-bonded calculations
diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h
index 40a34682c0..1c3cb9b774 100644
--- a/src/gromacs/ewald/pme.h
+++ b/src/gromacs/ewald/pme.h
@@ -436,13 +436,6 @@ GPU_FUNC_QUALIFIER void* pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT
 GPU_FUNC_QUALIFIER void* pme_gpu_get_device_stream(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
         GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
-/*! \brief Returns the pointer to the GPU context.
- *  \param[in] pme            The PME data structure.
- *  \returns                  Pointer to GPU context object.
- */
-GPU_FUNC_QUALIFIER const DeviceContext* pme_gpu_get_device_context(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
-        GPU_FUNC_TERM_WITH_RETURN(nullptr);
-
 /*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion
  * \param[in] pme            The PME data structure.
  * \returns                  Pointer to sychronizer
diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp
index b4cec47135..4c4ed4851d 100644
--- a/src/gromacs/ewald/pme_gpu.cpp
+++ b/src/gromacs/ewald/pme_gpu.cpp
@@ -442,14 +442,6 @@ void* pme_gpu_get_device_stream(const gmx_pme_t* pme)
     return pme_gpu_get_stream(pme->gpu);
 }
 
-const DeviceContext* pme_gpu_get_device_context(const gmx_pme_t* pme)
-{
-    GMX_RELEASE_ASSERT(pme, "GPU context requested from PME before PME was constructed.");
-    GMX_RELEASE_ASSERT(pme_gpu_active(pme),
-                       "GPU context requested from PME, but PME is running on the CPU.");
-    return pme_gpu_get_context(pme->gpu);
-}
-
 GpuEventSynchronizer* pme_gpu_get_f_ready_synchronizer(const gmx_pme_t* pme)
 {
     if (!pme || !pme_gpu_active(pme))
diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp
index bd3b25e1cd..dd62e8c4cd 100644
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -1527,14 +1527,6 @@ void* pme_gpu_get_stream(const PmeGpu* pmeGpu)
     }
 }
 
-const DeviceContext* pme_gpu_get_context(const PmeGpu* pmeGpu)
-{
-    GMX_RELEASE_ASSERT(
-            pmeGpu,
-            "GPU context object was requested, but PME GPU object was not (yet) initialized.");
-    return &pmeGpu->archSpecific->deviceContext_;
-}
-
 GpuEventSynchronizer* pme_gpu_get_forces_ready_synchronizer(const PmeGpu* pmeGpu)
 {
     if (pmeGpu && pmeGpu->kernelParams)
diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h
index a9dc9677ce..67a1bc3d1c 100644
--- a/src/gromacs/ewald/pme_gpu_internal.h
+++ b/src/gromacs/ewald/pme_gpu_internal.h
@@ -408,13 +408,6 @@ GPU_FUNC_QUALIFIER void* pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_A
 GPU_FUNC_QUALIFIER void* pme_gpu_get_stream(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
         GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
-/*! \brief Return pointer to GPU context (for OpenCL builds).
- * \param[in] pmeGpu         The PME GPU structure.
- * \returns                  Pointer to context object.
- */
-GPU_FUNC_QUALIFIER const DeviceContext* pme_gpu_get_context(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
-        GPU_FUNC_TERM_WITH_RETURN(nullptr);
-
 /*! \brief Return pointer to the sync object triggered after the PME force calculation completion
  * \param[in] pmeGpu         The PME GPU structure.
  * \returns                  Pointer to sync object
diff --git a/src/gromacs/ewald/pme_gpu_program.cpp b/src/gromacs/ewald/pme_gpu_program.cpp
index 6b34a41c4c..23981a661b 100644
--- a/src/gromacs/ewald/pme_gpu_program.cpp
+++ b/src/gromacs/ewald/pme_gpu_program.cpp
@@ -53,17 +53,14 @@
 
 #include "pme_gpu_program_impl.h"
 
-PmeGpuProgram::PmeGpuProgram(const DeviceInformation& deviceInfo) :
-    impl_(std::make_unique<PmeGpuProgramImpl>(deviceInfo))
+PmeGpuProgram::PmeGpuProgram(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext) :
+    impl_(std::make_unique<PmeGpuProgramImpl>(deviceInfo, deviceContext))
 {
 }
 
 PmeGpuProgram::~PmeGpuProgram() = default;
 
-PmeGpuProgramStorage buildPmeGpuProgram(const DeviceInformation* deviceInfo)
+PmeGpuProgramStorage buildPmeGpuProgram(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext)
 {
-    GMX_RELEASE_ASSERT(
-            deviceInfo != nullptr,
-            "Device information can not be nullptr when building PME GPU program object.");
-    return std::make_unique<PmeGpuProgram>(*deviceInfo);
+    return std::make_unique<PmeGpuProgram>(deviceInfo, deviceContext);
 }
diff --git a/src/gromacs/ewald/pme_gpu_program.h b/src/gromacs/ewald/pme_gpu_program.h
index 32c33442eb..d4dbdf449d 100644
--- a/src/gromacs/ewald/pme_gpu_program.h
+++ b/src/gromacs/ewald/pme_gpu_program.h
@@ -49,13 +49,15 @@
 
 #include <memory>
 
+class DeviceContext;
+
 struct PmeGpuProgramImpl;
 struct DeviceInformation;
 
 class PmeGpuProgram
 {
 public:
-    explicit PmeGpuProgram(const DeviceInformation& deviceInfo);
+    explicit PmeGpuProgram(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext);
     ~PmeGpuProgram();
 
     // TODO: design getters for information inside, if needed for PME, and make this private?
@@ -69,6 +71,7 @@ using PmeGpuProgramStorage = std::unique_ptr<PmeGpuProgram>;
 /*! \brief
  * Factory function used to build persistent PME GPU program for the device at once.
  */
-PmeGpuProgramStorage buildPmeGpuProgram(const DeviceInformation* /*deviceInfo*/);
+PmeGpuProgramStorage buildPmeGpuProgram(const DeviceInformation& /*deviceInfo*/,
+                                        const DeviceContext& /* deviceContext */);
 
 #endif
diff --git a/src/gromacs/ewald/pme_gpu_program_impl.cpp b/src/gromacs/ewald/pme_gpu_program_impl.cpp
index af57c03e9f..ccaffa5acd 100644
--- a/src/gromacs/ewald/pme_gpu_program_impl.cpp
+++ b/src/gromacs/ewald/pme_gpu_program_impl.cpp
@@ -45,7 +45,9 @@
 
 #include "pme_gpu_program_impl.h"
 
-PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation& /* deviceInfo */) :
+PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation& /* deviceInfo */,
+                                     const DeviceContext& deviceContext) :
+    deviceContext_(deviceContext),
     warpSize(0),
     spreadWorkGroupSize(0),
     gatherWorkGroupSize(0),
diff --git a/src/gromacs/ewald/pme_gpu_program_impl.cu b/src/gromacs/ewald/pme_gpu_program_impl.cu
index d17e18f50c..53bf2f0d1e 100644
--- a/src/gromacs/ewald/pme_gpu_program_impl.cu
+++ b/src/gromacs/ewald/pme_gpu_program_impl.cu
@@ -98,8 +98,9 @@ extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, false, true
 extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, true, false>(const PmeGpuCudaKernelParams);
 extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, false, false>(const PmeGpuCudaKernelParams);
 
-PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation& deviceInfo) :
-    deviceContext_(deviceInfo)
+PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation& /* deviceInfo */,
+                                     const DeviceContext& deviceContext) :
+    deviceContext_(deviceContext)
 {
     // kernel parameters
     warpSize              = warp_size;
diff --git a/src/gromacs/ewald/pme_gpu_program_impl.h b/src/gromacs/ewald/pme_gpu_program_impl.h
index 1de5014821..cb1471abf1 100644
--- a/src/gromacs/ewald/pme_gpu_program_impl.h
+++ b/src/gromacs/ewald/pme_gpu_program_impl.h
@@ -48,6 +48,7 @@
 #include "gromacs/gpu_utils/gputraits.h"
 #include "gromacs/utility/classhelpers.h"
 
+class DeviceContext;
 struct DeviceInformation;
 
 /*! \internal
@@ -75,10 +76,8 @@ struct PmeGpuProgramImpl
     /*! \brief
      * This is a handle to the GPU context, which is just a dummy in CUDA,
      * but is created/destroyed by this class in OpenCL.
-     * TODO: Later we want to be able to own the context at a higher level and not here,
-     * but this class would still need the non-owning context handle to build the kernels.
      */
-    DeviceContext deviceContext_;
+    const DeviceContext& deviceContext_;
 
     //! Conveniently all the PME kernels use the same single argument type
 #if GMX_GPU == GMX_GPU_CUDA
@@ -147,7 +146,7 @@ struct PmeGpuProgramImpl
 
     PmeGpuProgramImpl() = delete;
     //! Constructor for the given device
-    explicit PmeGpuProgramImpl(const DeviceInformation& deviceInfo);
+    explicit PmeGpuProgramImpl(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext);
     ~PmeGpuProgramImpl();
     GMX_DISALLOW_COPY_AND_ASSIGN(PmeGpuProgramImpl);
 
diff --git a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp
index 4071beebdb..1fa443ee4e 100644
--- a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp
+++ b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp
@@ -53,8 +53,8 @@
 #include "pme_gpu_types_host.h"
 #include "pme_grid.h"
 
-PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation& deviceInfo) :
-    deviceContext_(deviceInfo)
+PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext) :
+    deviceContext_(deviceContext)
 {
     // kernel parameters
     warpSize = gmx::ocl::getDeviceWarpSize(deviceContext_.context(), deviceInfo.oclDeviceId);
diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp
index 2ee17b3267..845b1a33ec 100644
--- a/src/gromacs/ewald/pme_only.cpp
+++ b/src/gromacs/ewald/pme_only.cpp
@@ -603,7 +603,8 @@ int gmx_pmeonly(struct gmx_pme_t*         pme,
                 gmx_wallcycle*            wcycle,
                 gmx_walltime_accounting_t walltime_accounting,
                 t_inputrec*               ir,
-                PmeRunMode                runMode)
+                PmeRunMode                runMode,
+                const DeviceContext*      deviceContext)
 {
     int     ret;
     int     natoms = 0;
@@ -628,8 +629,7 @@ int gmx_pmeonly(struct gmx_pme_t*         pme,
     const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed);
     if (useGpuForPme)
     {
-        const void*          commandStream = pme_gpu_get_device_stream(pme);
-        const DeviceContext& deviceContext = *pme_gpu_get_device_context(pme);
+        const void* commandStream = pme_gpu_get_device_stream(pme);
 
         changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy());
         changePinningPolicy(&pme_pp->x, pme_get_pinning_policy());
@@ -640,10 +640,13 @@ int gmx_pmeonly(struct gmx_pme_t*         pme,
             pme_pp->pmeForceSenderGpu = std::make_unique<gmx::PmeForceSenderGpu>(
                     commandStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
         }
+        GMX_RELEASE_ASSERT(
+                deviceContext != nullptr,
+                "Device context can not be nullptr when building GPU propagator data object.");
         // TODO: Special PME-only constructor is used here. There is no mechanism to prevent from using the other constructor here.
         //       This should be made safer.
         stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
-                commandStream, deviceContext, GpuApiCallBehavior::Async,
+                commandStream, *deviceContext, GpuApiCallBehavior::Async,
                 pme_gpu_get_padding_size(pme), wcycle);
     }
 
diff --git a/src/gromacs/ewald/pme_only.h b/src/gromacs/ewald/pme_only.h
index 0ed37f1e2e..18edbb9b43 100644
--- a/src/gromacs/ewald/pme_only.h
+++ b/src/gromacs/ewald/pme_only.h
@@ -55,6 +55,7 @@ struct t_nrnb;
 struct gmx_pme_t;
 struct gmx_wallcycle;
 
+class DeviceContext;
 enum class PmeRunMode;
 
 /*! \brief Called on the nodes that do PME exclusively */
@@ -64,6 +65,7 @@ int gmx_pmeonly(gmx_pme_t*                pme,
                 gmx_wallcycle*            wcycle,
                 gmx_walltime_accounting_t walltime_accounting,
                 t_inputrec*               ir,
-                PmeRunMode                runMode);
+                PmeRunMode                runMode,
+                const DeviceContext*      deviceContext);
 
 #endif
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu.h b/src/gromacs/ewald/pme_pp_comm_gpu.h
index e9d8c4ff69..ea750cc17c 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu.h
+++ b/src/gromacs/ewald/pme_pp_comm_gpu.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -45,6 +45,7 @@
 #include "gromacs/utility/classhelpers.h"
 #include "gromacs/utility/gmxmpi.h"
 
+class DeviceContext;
 class GpuEventSynchronizer;
 
 namespace gmx
@@ -61,8 +62,9 @@ public:
     /*! \brief Creates PME-PP GPU communication object
      * \param[in] comm            Communicator used for simulation
      * \param[in] pmeRank         Rank of PME task
+     * \param[in] deviceContext   GPU context.
      */
-    PmePpCommGpu(MPI_Comm comm, int pmeRank);
+    PmePpCommGpu(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext);
     ~PmePpCommGpu();
 
     /*! \brief Perform steps required when buffer size changes
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
index 0b59ff9212..b8befc5311 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -62,7 +62,8 @@ class PmePpCommGpu::Impl
 };
 
 /*!\brief Constructor stub. */
-PmePpCommGpu::PmePpCommGpu(MPI_Comm gmx_unused comm, int gmx_unused pmeRank) : impl_(nullptr)
+PmePpCommGpu::PmePpCommGpu(MPI_Comm /* comm */, int /* pmeRank */, const DeviceContext& /* deviceContext */) :
+    impl_(nullptr)
 {
     GMX_ASSERT(false,
                "A CPU stub for PME-PP GPU communication was called instead of the correct "
@@ -72,26 +73,26 @@ PmePpCommGpu::PmePpCommGpu(MPI_Comm gmx_unused comm, int gmx_unused pmeRank) : i
 PmePpCommGpu::~PmePpCommGpu() = default;
 
 /*!\brief init PME-PP GPU communication stub */
-void PmePpCommGpu::reinit(int gmx_unused size)
+void PmePpCommGpu::reinit(int /* size */)
 {
     GMX_ASSERT(false,
                "A CPU stub for PME-PP GPU communication initialization was called instead of the "
                "correct implementation.");
 }
 
-void PmePpCommGpu::receiveForceFromPmeCudaDirect(void gmx_unused* recvPtr,
-                                                 int gmx_unused recvSize,
-                                                 bool gmx_unused receivePmeForceToGpu)
+void PmePpCommGpu::receiveForceFromPmeCudaDirect(void* /* recvPtr */,
+                                                 int /* recvSize */,
+                                                 bool /* receivePmeForceToGpu */)
 {
     GMX_ASSERT(false,
                "A CPU stub for PME-PP GPU communication was called instead of the correct "
                "implementation.");
 }
 
-void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void gmx_unused* sendPtr,
-                                                  int gmx_unused sendSize,
-                                                  bool gmx_unused sendPmeCoordinatesFromGpu,
-                                                  GpuEventSynchronizer gmx_unused* coordinatesOnDeviceEvent)
+void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void* /* sendPtr */,
+                                                  int /* sendSize */,
+                                                  bool /* sendPmeCoordinatesFromGpu */,
+                                                  GpuEventSynchronizer* /* coordinatesOnDeviceEvent */)
 {
     GMX_ASSERT(false,
                "A CPU stub for PME-PP GPU communication was called instead of the correct "
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
index 29cb73d0ca..e6e5bacd16 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
@@ -48,6 +48,7 @@
 #include "config.h"
 
 #include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/utility/gmxmpi.h"
@@ -55,7 +56,10 @@
 namespace gmx
 {
 
-PmePpCommGpu::Impl::Impl(MPI_Comm comm, int pmeRank) : comm_(comm), pmeRank_(pmeRank)
+PmePpCommGpu::Impl::Impl(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext) :
+    comm_(comm),
+    pmeRank_(pmeRank),
+    deviceContext_(deviceContext)
 {
     GMX_RELEASE_ASSERT(
             GMX_THREAD_MPI,
@@ -152,7 +156,10 @@ void* PmePpCommGpu::Impl::getForcesReadySynchronizer()
     return static_cast<void*>(&forcesReadySynchronizer_);
 }
 
-PmePpCommGpu::PmePpCommGpu(MPI_Comm comm, int pmeRank) : impl_(new Impl(comm, pmeRank)) {}
+PmePpCommGpu::PmePpCommGpu(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext) :
+    impl_(new Impl(comm, pmeRank, deviceContext))
+{
+}
 
 PmePpCommGpu::~PmePpCommGpu() = default;
 
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
index 5565bea370..c791ea5b40 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -59,8 +59,9 @@ public:
     /*! \brief Creates PME-PP GPU communication object.
      * \param[in] comm            Communicator used for simulation
      * \param[in] pmeRank         Rank of PME task
+     * \param[in] deviceContext   GPU context.
      */
-    Impl(MPI_Comm comm, int pmeRank);
+    Impl(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext);
     ~Impl();
 
     /*! \brief Perform steps required when buffer size changes
@@ -115,6 +116,8 @@ public:
     void* getForcesReadySynchronizer();
 
 private:
+    //! Device context object
+    const DeviceContext& deviceContext_;
     //! CUDA stream used for the communication operations in this class
     cudaStream_t pmePpCommStream_ = nullptr;
     //! Remote location of PME coordinate data buffer
diff --git a/src/gromacs/ewald/tests/pmegathertest.cpp b/src/gromacs/ewald/tests/pmegathertest.cpp
index 08d2716e20..59efd56fac 100644
--- a/src/gromacs/ewald/tests/pmegathertest.cpp
+++ b/src/gromacs/ewald/tests/pmegathertest.cpp
@@ -300,7 +300,9 @@ public:
             PmeSafePointer pmeSafe = pmeInitWrapper(&inputRec, codePath, context->getDeviceInfo(),
                                                     context->getPmeGpuProgram(), box);
             std::unique_ptr<StatePropagatorDataGpu> stateGpu =
-                    (codePath == CodePath::GPU) ? makeStatePropagatorDataGpu(*pmeSafe.get()) : nullptr;
+                    (codePath == CodePath::GPU)
+                            ? makeStatePropagatorDataGpu(*pmeSafe.get(), context->deviceContext())
+                            : nullptr;
 
             pmeInitAtoms(pmeSafe.get(), stateGpu.get(), codePath, inputAtomData.coordinates,
                          inputAtomData.charges);
diff --git a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
index fba028fa8c..8f2935b9f3 100644
--- a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
+++ b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
@@ -152,7 +152,9 @@ public:
                 PmeSafePointer pmeSafe = pmeInitWrapper(&inputRec, codePath, context->getDeviceInfo(),
                                                         context->getPmeGpuProgram(), box);
                 std::unique_ptr<StatePropagatorDataGpu> stateGpu =
-                        (codePath == CodePath::GPU) ? makeStatePropagatorDataGpu(*pmeSafe.get()) : nullptr;
+                        (codePath == CodePath::GPU)
+                                ? makeStatePropagatorDataGpu(*pmeSafe.get(), context->deviceContext())
+                                : nullptr;
 
                 pmeInitAtoms(pmeSafe.get(), stateGpu.get(), codePath, coordinates, charges);
 
diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp
index 81edf195fe..787f3e9f42 100644
--- a/src/gromacs/ewald/tests/pmetestcommon.cpp
+++ b/src/gromacs/ewald/tests/pmetestcommon.cpp
@@ -160,14 +160,15 @@ PmeSafePointer pmeInitEmpty(const t_inputrec*        inputRec,
 }
 
 //! Make a GPU state-propagator manager
-std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t& pme)
+std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t&     pme,
+                                                                   const DeviceContext& deviceContext)
 {
     // TODO: Pin the host buffer and use async memory copies
     // TODO: Special constructor for PME-only rank / PME-tests is used here. There should be a mechanism to
     //       restrict one from using other constructor here.
-    return std::make_unique<StatePropagatorDataGpu>(
-            pme_gpu_get_device_stream(&pme), *pme_gpu_get_device_context(&pme),
-            GpuApiCallBehavior::Sync, pme_gpu_get_padding_size(&pme), nullptr);
+    return std::make_unique<StatePropagatorDataGpu>(pme_gpu_get_device_stream(&pme), deviceContext,
+                                                    GpuApiCallBehavior::Sync,
+                                                    pme_gpu_get_padding_size(&pme), nullptr);
 }
 
 //! PME initialization with atom data
diff --git a/src/gromacs/ewald/tests/pmetestcommon.h b/src/gromacs/ewald/tests/pmetestcommon.h
index ed919e80f8..c67f78bacf 100644
--- a/src/gromacs/ewald/tests/pmetestcommon.h
+++ b/src/gromacs/ewald/tests/pmetestcommon.h
@@ -135,7 +135,8 @@ PmeSafePointer pmeInitEmpty(const t_inputrec*        inputRec,
                             real             ewaldCoeff_q  = 0.0F,
                             real             ewaldCoeff_lj = 0.0F);
 //! Make a GPU state-propagator manager
-std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t& pme);
+std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t&     pme,
+                                                                   const DeviceContext& deviceContext);
 //! PME initialization with atom data and system box
 void pmeInitAtoms(gmx_pme_t*               pme,
                   StatePropagatorDataGpu*  stateGpu,
diff --git a/src/gromacs/ewald/tests/testhardwarecontexts.cpp b/src/gromacs/ewald/tests/testhardwarecontexts.cpp
index 9747d0376e..661f0fa4bb 100644
--- a/src/gromacs/ewald/tests/testhardwarecontexts.cpp
+++ b/src/gromacs/ewald/tests/testhardwarecontexts.cpp
@@ -108,7 +108,7 @@ static gmx_hw_info_t* hardwareInit()
 
 void PmeTestEnvironment::SetUp()
 {
-    hardwareContexts_.emplace_back(std::make_unique<TestHardwareContext>(CodePath::CPU, "(CPU) ", nullptr));
+    hardwareContexts_.emplace_back(std::make_unique<TestHardwareContext>(CodePath::CPU, "(CPU) "));
 
     hardwareInfo_ = hardwareInit();
     if (!pme_gpu_supports_build(nullptr) || !pme_gpu_supports_hardware(*hardwareInfo_, nullptr))
@@ -120,13 +120,15 @@ void PmeTestEnvironment::SetUp()
     for (int gpuIndex : getCompatibleGpus(hardwareInfo_->gpu_info))
     {
         const DeviceInformation* deviceInfo = getDeviceInfo(hardwareInfo_->gpu_info, gpuIndex);
+        GMX_RELEASE_ASSERT(deviceInfo != nullptr,
+                           "Device information should be provided for the GPU builds.");
         init_gpu(deviceInfo);
 
         char stmp[200] = {};
         get_gpu_device_info_string(stmp, hardwareInfo_->gpu_info, gpuIndex);
         std::string description = "(GPU " + std::string(stmp) + ") ";
         hardwareContexts_.emplace_back(std::make_unique<TestHardwareContext>(
-                CodePath::GPU, description.c_str(), deviceInfo));
+                CodePath::GPU, description.c_str(), *deviceInfo));
     }
 }
 
diff --git a/src/gromacs/ewald/tests/testhardwarecontexts.h b/src/gromacs/ewald/tests/testhardwarecontexts.h
index 9846cbbb07..03df38671c 100644
--- a/src/gromacs/ewald/tests/testhardwarecontexts.h
+++ b/src/gromacs/ewald/tests/testhardwarecontexts.h
@@ -49,6 +49,7 @@
 #include <gtest/gtest.h>
 
 #include "gromacs/ewald/pme_gpu_program.h"
+#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/hardware/gpu_hw_info.h"
 #include "gromacs/utility/gmxassert.h"
 
@@ -80,6 +81,8 @@ struct TestHardwareContext
     std::string description_;
     //! Device information pointer
     const DeviceInformation* deviceInfo_;
+    //! Device context object
+    DeviceContext deviceContext_;
     //! Persistent compiled GPU kernels for PME.
     PmeGpuProgramStorage program_;
 
@@ -88,20 +91,40 @@ public:
     CodePath getCodePath() const { return codePath_; }
     //! Returns a human-readable context description line
     std::string getDescription() const { return description_; }
+    //! Getter for the DeviceContext
+    const DeviceContext& deviceContext() const { return deviceContext_; }
     //! Returns the device info pointer
     const DeviceInformation* getDeviceInfo() const { return deviceInfo_; }
     //! Returns the persistent PME GPU kernels
     const PmeGpuProgram* getPmeGpuProgram() const { return program_.get(); }
-    //! Constructs the context
-    TestHardwareContext(CodePath codePath, const char* description, const DeviceInformation* deviceInfo) :
+    /*! \brief Constructs the context for the CPU code path.
+     *
+     * No device is associated with this context, so the device information
+     * pointer is set to \c nullptr to keep getDeviceInfo() well-defined.
+     */
+    TestHardwareContext(CodePath codePath, const char* description) :
+        codePath_(codePath),
+        description_(description),
+        deviceInfo_(nullptr)
+    {
+        GMX_RELEASE_ASSERT(codePath == CodePath::CPU,
+                           "A GPU code path should provide DeviceInformation to the "
+                           "TestHardwareContext constructor.");
+    }
+    /*! \brief Constructs the context for the GPU code path.
+     *
+     * Creates the device context for \p deviceInfo and builds the PME GPU program with it.
+     */
+    TestHardwareContext(CodePath codePath, const char* description, const DeviceInformation& deviceInfo) :
         codePath_(codePath),
         description_(description),
-        deviceInfo_(deviceInfo)
+        deviceInfo_(&deviceInfo),
+        deviceContext_(deviceInfo),
+        program_(buildPmeGpuProgram(deviceInfo, deviceContext_))
     {
-        if (codePath == CodePath::GPU)
-        {
-            program_ = buildPmeGpuProgram(deviceInfo_);
-        }
+        GMX_RELEASE_ASSERT(codePath == CodePath::GPU,
+                           "TestHardwareContext tries to construct DeviceContext and PmeGpuProgram "
+                           "in CPU build.");
     }
     ~TestHardwareContext();
 };
diff --git a/src/gromacs/gpu_utils/device_context.h b/src/gromacs/gpu_utils/device_context.h
index d192b5543f..84fc076708 100644
--- a/src/gromacs/gpu_utils/device_context.h
+++ b/src/gromacs/gpu_utils/device_context.h
@@ -61,18 +61,10 @@ struct DeviceInformation;
 class DeviceContext
 {
 public:
-    //! Default constructor. In OpenCL leaves context \c nullptr.
+    //! Default constructor.
     DeviceContext() {}
-    /*! \brief Second stage of construction. Creates the \c cl_context in OpenCL, does nothing in CUDA.
-     *
-     * \param[in] deviceInfo Platform-specific device information.
-     */
-    void init(const DeviceInformation& /*deviceInfo*/) {}
-    /*! \brief Construct the object and call \c init(...) .
-     *
-     * \param[in] deviceInfo Platform-specific device information.
-     */
-    DeviceContext(const DeviceInformation& deviceInfo) { init(deviceInfo); }
+    //! Constructor; the device information argument is ignored by this implementation.
+    DeviceContext(const DeviceInformation& /* deviceInfo */) {}
     //! Destructor
     ~DeviceContext() = default;
 
diff --git a/src/gromacs/gpu_utils/device_context_ocl.cpp b/src/gromacs/gpu_utils/device_context_ocl.cpp
index 6f86b17444..1cd6623903 100644
--- a/src/gromacs/gpu_utils/device_context_ocl.cpp
+++ b/src/gromacs/gpu_utils/device_context_ocl.cpp
@@ -60,12 +60,7 @@
 #define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL 0x4
 /**@}*/
 
-DeviceContext::DeviceContext()
-{
-    context_ = nullptr;
-}
-
-void DeviceContext::init(const DeviceInformation& deviceInfo)
+DeviceContext::DeviceContext(const DeviceInformation& deviceInfo)
 {
     cl_platform_id                     platformId = deviceInfo.oclPlatformId;
     cl_device_id                       deviceId   = deviceInfo.oclDeviceId;
@@ -92,11 +87,6 @@ void DeviceContext::init(const DeviceInformation& deviceInfo)
     }
 }
 
-DeviceContext::DeviceContext(const DeviceInformation& deviceInfo)
-{
-    init(deviceInfo);
-}
-
 DeviceContext::~DeviceContext()
 {
     cl_int clError;
diff --git a/src/gromacs/gpu_utils/device_context_ocl.h b/src/gromacs/gpu_utils/device_context_ocl.h
index 70e3927cf8..a9b84b2f8e 100644
--- a/src/gromacs/gpu_utils/device_context_ocl.h
+++ b/src/gromacs/gpu_utils/device_context_ocl.h
@@ -57,16 +57,9 @@ struct DeviceInformation;
 class DeviceContext
 {
 public:
-    //! Default constructor. Sets \c context_ to \c nullptr.
-    DeviceContext();
-    /*! \brief Second stage of construction. Creates the \c cl_context.
-     *
-     * \param[in] deviceInfo Platform-specific device information.
-     *
-     * \throws InternalError if context creation failed.
-     */
-    void init(const DeviceInformation& deviceInfo);
-    /*! \brief Construct the object and call \c init(...) .
+    //! Default constructor.
+    DeviceContext() {}
+    /*! \brief Constructor that creates the \c cl_context
      *
      * \param[in] deviceInfo Platform-specific device information.
      *
diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h
index 230b3ff94e..90f5b04bfe 100644
--- a/src/gromacs/gpu_utils/oclutils.h
+++ b/src/gromacs/gpu_utils/oclutils.h
@@ -64,8 +64,11 @@ enum class GpuApiCallBehavior;
  */
 struct gmx_device_runtime_data_t
 {
+    //! Constructor
+    gmx_device_runtime_data_t(const DeviceContext& deviceContext) : deviceContext_(deviceContext) {}
+
     //! OpenCL context
-    DeviceContext deviceContext;
+    const DeviceContext& deviceContext_;
     //! OpenCL program
     cl_program program;
 };
diff --git a/src/gromacs/listed_forces/gpubonded.h b/src/gromacs/listed_forces/gpubonded.h
index 1a231d2c2c..b1c69d4572 100644
--- a/src/gromacs/listed_forces/gpubonded.h
+++ b/src/gromacs/listed_forces/gpubonded.h
@@ -55,6 +55,7 @@
 #include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/classhelpers.h"
 
+class DeviceContext;
 struct gmx_enerdata_t;
 struct gmx_ffparams_t;
 struct gmx_mtop_t;
@@ -106,7 +107,10 @@ class GpuBonded
 {
 public:
     //! Construct the manager with constant data and the stream to use.
-    GpuBonded(const gmx_ffparams_t& ffparams, void* streamPtr, gmx_wallcycle* wcycle);
+    GpuBonded(const gmx_ffparams_t& ffparams,
+              const DeviceContext&  deviceContext,
+              void*                 streamPtr,
+              gmx_wallcycle*        wcycle);
     //! Destructor
     ~GpuBonded();
 
diff --git a/src/gromacs/listed_forces/gpubonded_impl.cpp b/src/gromacs/listed_forces/gpubonded_impl.cpp
index 94a2b5d42b..f24103229b 100644
--- a/src/gromacs/listed_forces/gpubonded_impl.cpp
+++ b/src/gromacs/listed_forces/gpubonded_impl.cpp
@@ -160,7 +160,10 @@ class GpuBonded::Impl
 {
 };
 
-GpuBonded::GpuBonded(const gmx_ffparams_t& /* ffparams */, void* /*streamPtr */, gmx_wallcycle* /* wcycle */) :
+GpuBonded::GpuBonded(const gmx_ffparams_t& /* ffparams */,
+                     const DeviceContext& /* deviceContext */,
+                     void* /*streamPtr */,
+                     gmx_wallcycle* /* wcycle */) :
     impl_(nullptr)
 {
 }
diff --git a/src/gromacs/listed_forces/gpubonded_impl.cu b/src/gromacs/listed_forces/gpubonded_impl.cu
index ff7092f40c..763550c5c9 100644
--- a/src/gromacs/listed_forces/gpubonded_impl.cu
+++ b/src/gromacs/listed_forces/gpubonded_impl.cu
@@ -50,6 +50,7 @@
 
 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
 #include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/typecasts.cuh"
 #include "gromacs/mdtypes/enerdata.h"
@@ -63,7 +64,11 @@ namespace gmx
 
 // ---- GpuBonded::Impl
 
-GpuBonded::Impl::Impl(const gmx_ffparams_t& ffparams, void* streamPtr, gmx_wallcycle* wcycle)
+GpuBonded::Impl::Impl(const gmx_ffparams_t& ffparams,
+                      const DeviceContext&  deviceContext,
+                      void*                 streamPtr,
+                      gmx_wallcycle*        wcycle) :
+    deviceContext_(deviceContext)
 {
     stream_ = *static_cast<CommandStream*>(streamPtr);
     wcycle_ = wcycle;
@@ -306,8 +311,11 @@ void GpuBonded::Impl::clearEnergies()
 
 // ---- GpuBonded
 
-GpuBonded::GpuBonded(const gmx_ffparams_t& ffparams, void* streamPtr, gmx_wallcycle* wcycle) :
-    impl_(new Impl(ffparams, streamPtr, wcycle))
+GpuBonded::GpuBonded(const gmx_ffparams_t& ffparams,
+                     const DeviceContext&  deviceContext,
+                     void*                 streamPtr,
+                     gmx_wallcycle*        wcycle) :
+    impl_(new Impl(ffparams, deviceContext, streamPtr, wcycle))
 {
 }
 
diff --git a/src/gromacs/listed_forces/gpubonded_impl.h b/src/gromacs/listed_forces/gpubonded_impl.h
index 0532b40315..a0da918893 100644
--- a/src/gromacs/listed_forces/gpubonded_impl.h
+++ b/src/gromacs/listed_forces/gpubonded_impl.h
@@ -126,7 +126,7 @@ class GpuBonded::Impl
 {
 public:
     //! Constructor
-    Impl(const gmx_ffparams_t& ffparams, void* streamPtr, gmx_wallcycle* wcycle);
+    Impl(const gmx_ffparams_t& ffparams, const DeviceContext& deviceContext, void* streamPtr, gmx_wallcycle* wcycle);
     /*! \brief Destructor, non-default needed for freeing
      * device-side buffers */
     ~Impl();
@@ -180,8 +180,8 @@ private:
     //! \brief Device-side total virial
     float* d_vTot_ = nullptr;
 
-    //! Dummy GPU context object
-    const DeviceContext deviceContext_;
+    //! GPU context object
+    const DeviceContext& deviceContext_;
     //! \brief Bonded GPU stream, not owned by this module
     CommandStream stream_;
 
diff --git a/src/gromacs/mdlib/forcerec.h b/src/gromacs/mdlib/forcerec.h
index c7f38cb5f5..fea69c594f 100644
--- a/src/gromacs/mdlib/forcerec.h
+++ b/src/gromacs/mdlib/forcerec.h
@@ -42,7 +42,6 @@
 #include "gromacs/timing/wallcycle.h"
 #include "gromacs/utility/arrayref.h"
 
-struct DeviceInformation;
 struct gmx_hw_info_t;
 struct t_commrec;
 struct t_fcdata;
diff --git a/src/gromacs/mdlib/leapfrog_gpu.cuh b/src/gromacs/mdlib/leapfrog_gpu.cuh
index 98703c05b9..26a6fc7399 100644
--- a/src/gromacs/mdlib/leapfrog_gpu.cuh
+++ b/src/gromacs/mdlib/leapfrog_gpu.cuh
@@ -112,7 +112,7 @@ public:
     class Impl;
 
 private:
-    //! Dummy GPU context object
+    //! GPU context object
     const DeviceContext& deviceContext_;
     //! GPU stream
     CommandStream commandStream_;
diff --git a/src/gromacs/mdlib/lincs_gpu.cuh b/src/gromacs/mdlib/lincs_gpu.cuh
index 77423dc323..4817573b80 100644
--- a/src/gromacs/mdlib/lincs_gpu.cuh
+++ b/src/gromacs/mdlib/lincs_gpu.cuh
@@ -169,7 +169,7 @@ public:
     static bool isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop);
 
 private:
-    //! Dummy GPU context object
+    //! GPU context object
     const DeviceContext& deviceContext_;
     //! GPU stream
     CommandStream commandStream_;
diff --git a/src/gromacs/mdlib/settle_gpu.cuh b/src/gromacs/mdlib/settle_gpu.cuh
index f07af017e3..da8bafd8df 100644
--- a/src/gromacs/mdlib/settle_gpu.cuh
+++ b/src/gromacs/mdlib/settle_gpu.cuh
@@ -252,7 +252,7 @@ public:
     void set(const InteractionDefinitions& idef, const t_mdatoms& md);
 
 private:
-    //! Dummy GPU context object
+    //! GPU context object
     const DeviceContext& deviceContext_;
     //! GPU stream
     CommandStream commandStream_;
diff --git a/src/gromacs/mdlib/update_constrain_gpu.h b/src/gromacs/mdlib/update_constrain_gpu.h
index 09f0bbecc1..61f8537efa 100644
--- a/src/gromacs/mdlib/update_constrain_gpu.h
+++ b/src/gromacs/mdlib/update_constrain_gpu.h
@@ -49,6 +49,7 @@
 #include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/classhelpers.h"
 
+class DeviceContext;
 class GpuEventSynchronizer;
 
 struct gmx_mtop_t;
@@ -77,11 +78,13 @@ public:
      *                              projection from it.
      * \param[in] mtop              Topology of the system: SETTLE gets the masses for O and H atoms
      *                              and target O-H and H-H distances from this object.
+     * \param[in] deviceContext     GPU device context.
      * \param[in] commandStream     GPU stream to use. Can be nullptr.
      * \param[in] xUpdatedOnDevice  The event synchronizer to use to mark that update is done on the GPU.
      */
     UpdateConstrainGpu(const t_inputrec&     ir,
                        const gmx_mtop_t&     mtop,
+                       const DeviceContext&  deviceContext,
                        const void*           commandStream,
                        GpuEventSynchronizer* xUpdatedOnDevice);
 
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp
index 3e10f8a403..45a0743384 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp
@@ -57,6 +57,7 @@ class UpdateConstrainGpu::Impl
 
 UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec& /* ir   */,
                                        const gmx_mtop_t& /* mtop */,
+                                       const DeviceContext& /* deviceContext */,
                                        const void* /* commandStream */,
                                        GpuEventSynchronizer* /* xUpdatedOnDevice */) :
     impl_(nullptr)
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
index a8e5a94cc6..41f7572332 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
@@ -57,6 +57,7 @@
 #include <algorithm>
 
 #include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"
@@ -166,8 +167,10 @@ void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix)
 
 UpdateConstrainGpu::Impl::Impl(const t_inputrec&     ir,
                                const gmx_mtop_t&     mtop,
+                               const DeviceContext&  deviceContext,
                                const void*           commandStream,
                                GpuEventSynchronizer* xUpdatedOnDevice) :
+    deviceContext_(deviceContext),
     coordinatesReady_(xUpdatedOnDevice)
 {
     GMX_ASSERT(xUpdatedOnDevice != nullptr, "The event synchronizer can not be nullptr.");
@@ -231,9 +234,10 @@ GpuEventSynchronizer* UpdateConstrainGpu::Impl::getCoordinatesReadySync()
 
 UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec&     ir,
                                        const gmx_mtop_t&     mtop,
+                                       const DeviceContext&  deviceContext,
                                        const void*           commandStream,
                                        GpuEventSynchronizer* xUpdatedOnDevice) :
-    impl_(new Impl(ir, mtop, commandStream, xUpdatedOnDevice))
+    impl_(new Impl(ir, mtop, deviceContext, commandStream, xUpdatedOnDevice))
 {
 }
 
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h
index 75b6814de0..dd46010e93 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.h
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h
@@ -75,10 +75,15 @@ public:
      *                              projection from it.
      * \param[in] mtop              Topology of the system: SETTLE gets the masses for O and H atoms
      *                              and target O-H and H-H distances from this object.
+     * \param[in] deviceContext     GPU device context.
      * \param[in] commandStream     GPU stream to use. Can be nullptr.
      * \param[in] xUpdatedOnDevice  The event synchronizer to use to mark that update is done on the GPU.
      */
-    Impl(const t_inputrec& ir, const gmx_mtop_t& mtop, const void* commandStream, GpuEventSynchronizer* xUpdatedOnDevice);
+    Impl(const t_inputrec&     ir,
+         const gmx_mtop_t&     mtop,
+         const DeviceContext&  deviceContext,
+         const void*           commandStream,
+         GpuEventSynchronizer* xUpdatedOnDevice);
 
     ~Impl();
 
@@ -163,8 +168,8 @@ public:
     static bool isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop);
 
 private:
-    //! Dummy GPU context object
-    const DeviceContext deviceContext_;
+    //! GPU context object
+    const DeviceContext& deviceContext_;
     //! GPU stream
     CommandStream commandStream_ = nullptr;
     //! GPU kernel launch config
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp
index 2cb1388dca..941a7030c9 100644
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -400,8 +400,13 @@ void gmx::LegacySimulator::do_md()
         {
             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
         }
-        integrator = std::make_unique<UpdateConstrainGpu>(
-                *ir, *top_global, stateGpu->getUpdateStream(), stateGpu->xUpdatedOnDevice());
+
+        GMX_RELEASE_ASSERT(fr->deviceContext != nullptr,
+                           "GPU device context should be initialized to use GPU update.");
+
+        integrator = std::make_unique<UpdateConstrainGpu>(*ir, *top_global, *fr->deviceContext,
+                                                          stateGpu->getUpdateStream(),
+                                                          stateGpu->xUpdatedOnDevice());
 
         integrator->setPbc(PbcType::Xyz, state->box);
     }
@@ -866,7 +871,10 @@ void gmx::LegacySimulator::do_md()
                             Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
                     void* streamNonLocal = Nbnxm::gpu_get_command_stream(
                             fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
-                    constructGpuHaloExchange(mdlog, *cr, streamLocal, streamNonLocal);
+                    GMX_RELEASE_ASSERT(
+                            fr->deviceContext != nullptr,
+                            "GPU device context should be initialized to use GPU halo exchange.");
+                    constructGpuHaloExchange(mdlog, *cr, *fr->deviceContext, streamLocal, streamNonLocal);
                 }
             }
         }
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index b233b0737c..081501bfff 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -73,6 +73,7 @@
 #include "gromacs/fileio/tpxio.h"
 #include "gromacs/gmxlib/network.h"
 #include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/hardware/cpuinfo.h"
 #include "gromacs/hardware/detecthardware.h"
@@ -1140,9 +1141,20 @@ int Mdrunner::mdrunner()
             EEL_PME(inputrec->coulombtype) && thisRankHasDuty(cr, DUTY_PME));
 
     // Get the device handles for the modules, nullptr when no task is assigned.
+    // TODO: There should be only one DeviceInformation.
     DeviceInformation* nonbondedDeviceInfo = gpuTaskAssignments.initNonbondedDevice(cr);
     DeviceInformation* pmeDeviceInfo       = gpuTaskAssignments.initPmeDevice();
 
+    std::unique_ptr<DeviceContext> deviceContext = nullptr;
+    if (pmeDeviceInfo)
+    {
+        deviceContext = std::make_unique<DeviceContext>(*pmeDeviceInfo);
+    }
+    else if (nonbondedDeviceInfo)
+    {
+        deviceContext = std::make_unique<DeviceContext>(*nonbondedDeviceInfo);
+    }
+
     // TODO Initialize GPU streams here.
 
     // TODO Currently this is always built, yet DD partition code
@@ -1338,13 +1350,19 @@ int Mdrunner::mdrunner()
                       opt2fn("-tablep", filenames.size(), filenames.data()),
                       opt2fns("-tableb", filenames.size(), filenames.data()), pforce);
 
+        fr->deviceContext = deviceContext.get();
+
         if (devFlags.enableGpuPmePPComm && !thisRankHasDuty(cr, DUTY_PME))
         {
-            fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(cr->mpi_comm_mysim, cr->dd->pme_nodeid);
+            GMX_RELEASE_ASSERT(
+                    deviceContext != nullptr,
+                    "Device context can not be nullptr when PME-PP direct communications object.");
+            fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(
+                    cr->mpi_comm_mysim, cr->dd->pme_nodeid, *deviceContext);
         }
 
         fr->nbv = Nbnxm::init_nb_verlet(mdlog, inputrec, fr, cr, *hwinfo, nonbondedDeviceInfo,
-                                        &mtop, box, wcycle);
+                                        fr->deviceContext, &mtop, box, wcycle);
         if (useGpuForBonded)
         {
             auto stream = havePPDomainDecomposition(cr)
@@ -1352,7 +1370,10 @@ int Mdrunner::mdrunner()
                                             fr->nbv->gpu_nbv, gmx::InteractionLocality::NonLocal)
                                   : Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv,
                                                                   gmx::InteractionLocality::Local);
-            gpuBonded     = std::make_unique<GpuBonded>(mtop.ffparams, stream, wcycle);
+            GMX_RELEASE_ASSERT(
+                    fr->deviceContext != nullptr,
+                    "Device context can not be nullptr when computing bonded interactions on GPU.");
+            gpuBonded = std::make_unique<GpuBonded>(mtop.ffparams, *fr->deviceContext, stream, wcycle);
             fr->gpuBonded = gpuBonded.get();
         }
 
@@ -1428,7 +1449,13 @@ int Mdrunner::mdrunner()
     PmeGpuProgramStorage pmeGpuProgram;
     if (thisRankHasPmeGpuTask)
     {
-        pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo);
+        GMX_RELEASE_ASSERT(
+                pmeDeviceInfo != nullptr,
+                "Device information can not be nullptr when building PME GPU program object.");
+        GMX_RELEASE_ASSERT(
+                deviceContext != nullptr,
+                "Device context can not be nullptr when building PME GPU program object.");
+        pmeGpuProgram = buildPmeGpuProgram(*pmeDeviceInfo, *deviceContext);
     }
 
     /* Initiate PME if necessary,
@@ -1566,14 +1593,16 @@ int Mdrunner::mdrunner()
                     fr->nbv->gpu_nbv != nullptr
                             ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal)
                             : nullptr;
-            const DeviceContext& deviceContext = *pme_gpu_get_device_context(fr->pmedata);
-            const int            paddingSize   = pme_gpu_get_padding_size(fr->pmedata);
+            const int          paddingSize = pme_gpu_get_padding_size(fr->pmedata);
             GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)
                                                       ? GpuApiCallBehavior::Async
                                                       : GpuApiCallBehavior::Sync;
-
+            GMX_RELEASE_ASSERT(
+                    deviceContext != nullptr,
+                    "Device context can not be nullptr when building GPU propagator data object.");
             stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
-                    pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize, wcycle);
+                    pmeStream, localStream, nonLocalStream, *deviceContext, transferKind,
+                    paddingSize, wcycle);
             fr->stateGpu = stateGpu.get();
         }
 
@@ -1608,7 +1637,8 @@ int Mdrunner::mdrunner()
         GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
         /* do PME only */
         walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
-        gmx_pmeonly(pmedata, cr, &nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode);
+        gmx_pmeonly(pmedata, cr, &nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode,
+                    deviceContext.get());
     }
 
     wallcycle_stop(wcycle, ewcRUN);
@@ -1670,6 +1700,7 @@ int Mdrunner::mdrunner()
 
     free_gpu(nonbondedDeviceInfo);
     free_gpu(pmeDeviceInfo);
+    deviceContext.reset(nullptr);
     sfree(fcd);
 
     if (doMembed)
diff --git a/src/gromacs/mdtypes/forcerec.h b/src/gromacs/mdtypes/forcerec.h
index 8c4f5d2f01..d53b5e571c 100644
--- a/src/gromacs/mdtypes/forcerec.h
+++ b/src/gromacs/mdtypes/forcerec.h
@@ -52,6 +52,7 @@
 struct gmx_pme_t;
 struct nonbonded_verlet_t;
 struct bonded_threading_t;
+class DeviceContext;
 class DispersionCorrection;
 struct t_forcetable;
 struct t_QMMMrec;
@@ -289,6 +290,9 @@ struct t_forcerec
     //       general StatePropagatorData object that is passed around
     gmx::StatePropagatorDataGpu* stateGpu = nullptr;
 
+    //! GPU device context
+    DeviceContext* deviceContext = nullptr;
+
     /* For PME-PP GPU communication */
     std::unique_ptr<gmx::PmePpCommGpu> pmePpCommGpu;
 };
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index 7467f95b69..666aefc629 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -413,7 +413,8 @@ static void cuda_init_const(NbnxmGpu*                       nb,
     nbnxn_cuda_clear_e_fshift(nb);
 }
 
-NbnxmGpu* gpu_init(const DeviceInformation*   deviceInfo,
+NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo,
+                   const DeviceContext& /* deviceContext */,
                    const interaction_const_t* ic,
                    const PairlistParams&      listParams,
                    const nbnxn_atomdata_t*    nbat,
diff --git a/src/gromacs/nbnxm/gpu_data_mgmt.h b/src/gromacs/nbnxm/gpu_data_mgmt.h
index 9eac3f7c78..822852786c 100644
--- a/src/gromacs/nbnxm/gpu_data_mgmt.h
+++ b/src/gromacs/nbnxm/gpu_data_mgmt.h
@@ -50,6 +50,8 @@
 #include "gromacs/gpu_utils/gpu_macros.h"
 #include "gromacs/mdtypes/locality.h"
 
+class DeviceContext;
+
 struct NbnxmGpu;
 struct gmx_gpu_info_t;
 struct DeviceInformation;
@@ -65,6 +67,7 @@ namespace Nbnxm
 /** Initializes the data structures related to GPU nonbonded calculations. */
 GPU_FUNC_QUALIFIER
 NbnxmGpu* gpu_init(const DeviceInformation gmx_unused* deviceInfo,
+                   const DeviceContext gmx_unused& deviceContext,
                    const interaction_const_t gmx_unused* ic,
                    const PairlistParams gmx_unused& listParams,
                    const nbnxn_atomdata_t gmx_unused* nbat,
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 2fa353a848..a15f646ed9 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -120,6 +120,7 @@
 #include "gromacs/utility/enumerationhelpers.h"
 #include "gromacs/utility/real.h"
 
+class DeviceContext;
 struct DeviceInformation;
 struct gmx_domdec_zones_t;
 struct gmx_enerdata_t;
@@ -409,6 +410,7 @@ std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger&     mdlo
                                                    const t_commrec*         cr,
                                                    const gmx_hw_info_t&     hardwareInfo,
                                                    const DeviceInformation* deviceInfo,
+                                                   const DeviceContext*     deviceContext,
                                                    const gmx_mtop_t*        mtop,
                                                    matrix                   box,
                                                    gmx_wallcycle*           wcycle);
diff --git a/src/gromacs/nbnxm/nbnxm_setup.cpp b/src/gromacs/nbnxm/nbnxm_setup.cpp
index 58fee75e6a..f7c7f6dd16 100644
--- a/src/gromacs/nbnxm/nbnxm_setup.cpp
+++ b/src/gromacs/nbnxm/nbnxm_setup.cpp
@@ -364,6 +364,7 @@ std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger&     mdlo
                                                    const t_commrec*         cr,
                                                    const gmx_hw_info_t&     hardwareInfo,
                                                    const DeviceInformation* deviceInfo,
+                                                   const DeviceContext*     deviceContext,
                                                    const gmx_mtop_t*        mtop,
                                                    matrix                   box,
                                                    gmx_wallcycle*           wcycle)
@@ -445,9 +446,13 @@ std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger&     mdlo
     int       minimumIlistCountForGpuBalancing = 0;
     if (useGpu)
     {
+        GMX_RELEASE_ASSERT(
+                deviceContext != nullptr,
+                "Device context can not be nullptr when to use GPU for non-bonded forces.");
         /* init the NxN GPU data; the last argument tells whether we'll have
          * both local and non-local NB calculation on GPU */
-        gpu_nbv = gpu_init(deviceInfo, fr->ic, pairlistParams, nbat.get(), cr->nodeid, haveMultipleDomains);
+        gpu_nbv = gpu_init(deviceInfo, *deviceContext, fr->ic, pairlistParams, nbat.get(),
+                           cr->nodeid, haveMultipleDomains);
 
         minimumIlistCountForGpuBalancing = getMinimumIlistCountForGpuBalancing(gpu_nbv);
     }
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
index 5b5911941d..eb1234d512 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
@@ -138,7 +138,7 @@ static void init_ewald_coulomb_force_table(const EwaldCorrectionTables&     tabl
        CL_MEM_COPY_HOST_PTR, &array_format, tabsize, 1, 0, ftmp, &cl_error);
      */
 
-    coul_tab = clCreateBuffer(runData->deviceContext.context(),
+    coul_tab = clCreateBuffer(runData->deviceContext_.context(),
                               CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
                               tables.tableF.size() * sizeof(cl_float),
                               const_cast<real*>(tables.tableF.data()), &cl_error);
@@ -160,23 +160,23 @@ static void init_atomdata_first(cl_atomdata_t* ad, int ntypes, gmx_device_runtim
     ad->ntypes = ntypes;
 
     ad->shift_vec =
-            clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
+            clCreateBuffer(runData->deviceContext_.context(), CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
                            SHIFTS * sizeof(nbnxn_atomdata_t::shift_vec[0]), nullptr, &cl_error);
     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
     ad->bShiftVecUploaded = CL_FALSE;
 
-    ad->fshift = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+    ad->fshift = clCreateBuffer(runData->deviceContext_.context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
                                 SHIFTS * sizeof(nb_staging_t::fshift[0]), nullptr, &cl_error);
     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 
-    ad->e_lj = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+    ad->e_lj = clCreateBuffer(runData->deviceContext_.context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
                               sizeof(float), nullptr, &cl_error);
     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 
-    ad->e_el = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+    ad->e_el = clCreateBuffer(runData->deviceContext_.context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
                               sizeof(float), nullptr, &cl_error);
     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
@@ -336,7 +336,7 @@ static void init_nbparam(cl_nbparam_t*                    nbp,
            CL_MEM_READ_WRITE, &array_format, 1, 1, 0, nullptr, &cl_error);
          */
 
-        nbp->coulomb_tab_climg2d = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_ONLY,
+        nbp->coulomb_tab_climg2d = clCreateBuffer(runData->deviceContext_.context(), CL_MEM_READ_ONLY,
                                                   sizeof(cl_float), nullptr, &cl_error);
         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
@@ -354,12 +354,12 @@ static void init_nbparam(cl_nbparam_t*                    nbp,
            array_format.image_channel_data_type = CL_FLOAT;
            array_format.image_channel_order     = CL_R;
 
-           nbp->nbfp_climg2d = clCreateImage2D(runData->deviceContext.context(), CL_MEM_READ_ONLY |
+           nbp->nbfp_climg2d = clCreateImage2D(runData->deviceContext_.context(), CL_MEM_READ_ONLY |
            CL_MEM_COPY_HOST_PTR, &array_format, nnbfp, 1, 0, nbat->nbfp, &cl_error);
          */
 
         nbp->nbfp_climg2d = clCreateBuffer(
-                runData->deviceContext.context(),
+                runData->deviceContext_.context(),
                 CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
                 nnbfp * sizeof(cl_float), const_cast<float*>(nbatParams.nbfp.data()), &cl_error);
         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
@@ -372,7 +372,7 @@ static void init_nbparam(cl_nbparam_t*                    nbp,
             /*  nbp->nbfp_comb_climg2d = clCreateImage2D(runData->deviceContext.context(), CL_MEM_READ_WRITE |
                CL_MEM_COPY_HOST_PTR, &array_format, nnbfp_comb, 1, 0, nbat->nbfp_comb, &cl_error);*/
             nbp->nbfp_comb_climg2d =
-                    clCreateBuffer(runData->deviceContext.context(),
+                    clCreateBuffer(runData->deviceContext_.context(),
                                    CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
                                    nnbfp_comb * sizeof(cl_float),
                                    const_cast<float*>(nbatParams.nbfp_comb.data()), &cl_error);
@@ -388,7 +388,7 @@ static void init_nbparam(cl_nbparam_t*                    nbp,
             // TODO: decide which alternative is most efficient - textures or buffers.
             /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->deviceContext.context(),
                CL_MEM_READ_WRITE, &array_format, 1, 1, 0, nullptr, &cl_error);*/
-            nbp->nbfp_comb_climg2d = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_ONLY,
+            nbp->nbfp_comb_climg2d = clCreateBuffer(runData->deviceContext_.context(), CL_MEM_READ_ONLY,
                                                     sizeof(cl_float), nullptr, &cl_error);
             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                                ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
@@ -556,6 +556,7 @@ static void nbnxn_ocl_init_const(NbnxmGpu*                       nb,
 
 //! This function is documented in the header file
 NbnxmGpu* gpu_init(const DeviceInformation*   deviceInfo,
+                   const DeviceContext&       deviceContext,
                    const interaction_const_t* ic,
                    const PairlistParams&      listParams,
                    const nbnxn_atomdata_t*    nbat,
@@ -583,7 +584,7 @@ NbnxmGpu* gpu_init(const DeviceInformation*   deviceInfo,
 
     /* set device info, just point it to the right GPU among the detected ones */
     nb->deviceInfo  = deviceInfo;
-    nb->dev_rundata = new gmx_device_runtime_data_t();
+    nb->dev_rundata = new gmx_device_runtime_data_t(deviceContext);
 
     /* init nbst */
     pmalloc(reinterpret_cast<void**>(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj));
@@ -605,11 +606,9 @@ NbnxmGpu* gpu_init(const DeviceInformation*   deviceInfo,
         queue_properties = 0;
     }
 
-    nb->dev_rundata->deviceContext.init(*deviceInfo);
-
     /* local/non-local GPU streams */
     nb->stream[InteractionLocality::Local] =
-            clCreateCommandQueue(nb->dev_rundata->deviceContext.context(),
+            clCreateCommandQueue(nb->dev_rundata->deviceContext_.context(),
                                  nb->deviceInfo->oclDeviceId, queue_properties, &cl_error);
     if (CL_SUCCESS != cl_error)
     {
@@ -622,7 +621,7 @@ NbnxmGpu* gpu_init(const DeviceInformation*   deviceInfo,
         init_plist(nb->plist[InteractionLocality::NonLocal]);
 
         nb->stream[InteractionLocality::NonLocal] =
-                clCreateCommandQueue(nb->dev_rundata->deviceContext.context(),
+                clCreateCommandQueue(nb->dev_rundata->deviceContext_.context(),
                                      nb->deviceInfo->oclDeviceId, queue_properties, &cl_error);
         if (CL_SUCCESS != cl_error)
         {
@@ -736,7 +735,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
     }
 
     // TODO most of this function is same in CUDA and OpenCL, move into the header
-    const DeviceContext& deviceContext = nb->dev_rundata->deviceContext;
+    const DeviceContext& deviceContext = nb->dev_rundata->deviceContext_;
 
     reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc,
                            deviceContext);
@@ -815,13 +814,13 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
             freeDeviceBuffer(&d_atdat->atom_types);
         }
 
-        d_atdat->f = clCreateBuffer(nb->dev_rundata->deviceContext.context(),
+        d_atdat->f = clCreateBuffer(nb->dev_rundata->deviceContext_.context(),
                                     CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
                                     nalloc * DIM * sizeof(nbat->out[0].f[0]), nullptr, &cl_error);
         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 
-        d_atdat->xq = clCreateBuffer(nb->dev_rundata->deviceContext.context(),
+        d_atdat->xq = clCreateBuffer(nb->dev_rundata->deviceContext_.context(),
                                      CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
                                      nalloc * sizeof(cl_float4), nullptr, &cl_error);
         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
@@ -829,7 +828,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
 
         if (useLjCombRule(nb->nbparam->vdwtype))
         {
-            d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->deviceContext.context(),
+            d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->deviceContext_.context(),
                                               CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
                                               nalloc * sizeof(cl_float2), nullptr, &cl_error);
             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
@@ -837,7 +836,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
         }
         else
         {
-            d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->deviceContext.context(),
+            d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->deviceContext_.context(),
                                                  CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
                                                  nalloc * sizeof(int), nullptr, &cl_error);
             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp
index 0ba3345780..9c1c759880 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp
@@ -200,10 +200,10 @@ void nbnxn_gpu_compile_kernels(NbnxmGpu* nb)
         {
             /* TODO when we have a proper MPI-aware logging module,
                the log output here should be written there */
-            program =
-                    gmx::ocl::compileProgram(stderr, "gromacs/nbnxm/opencl", "nbnxm_ocl_kernels.cl",
-                                             extraDefines, nb->dev_rundata->deviceContext.context(),
-                                             nb->deviceInfo->oclDeviceId, nb->deviceInfo->deviceVendor);
+            program = gmx::ocl::compileProgram(
+                    stderr, "gromacs/nbnxm/opencl", "nbnxm_ocl_kernels.cl", extraDefines,
+                    nb->dev_rundata->deviceContext_.context(), nb->deviceInfo->oclDeviceId,
+                    nb->deviceInfo->deviceVendor);
         }
         catch (gmx::GromacsException& e)
         {
-- 
2.22.0