From: Artem Zhmurov
Date: Wed, 29 Jan 2020 15:59:33 +0000 (+0100)
Subject: Make DeviceContext into a proper class
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=345378d48c1bcc8fb1707d7a3987b37189ee7d83;p=alexxy%2Fgromacs.git

Make DeviceContext into a proper class

Having DeviceContext as a platform-agnostic class makes it easy to create a context, which is useful for setting up the GPU stream/context manager and for tests of GPU functionality.

Refs. #3315.

Change-Id: I496d225b58c96d7642830c8e9552139750891849
---
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu index 2ac4228d6f..92a1d9f3d5 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cu +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu @@ -147,9 +147,9 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo // reallocate on device only if needed if (newSize > maxPackedBufferSize_) { - reallocateDeviceBuffer(&d_indexMap_, newSize, &indexMapSize_, &indexMapSizeAlloc_, nullptr); - reallocateDeviceBuffer(&d_sendBuf_, newSize, &sendBufSize_, &sendBufSizeAlloc_, nullptr); - reallocateDeviceBuffer(&d_recvBuf_, newSize, &recvBufSize_, &recvBufSizeAlloc_, nullptr); + reallocateDeviceBuffer(&d_indexMap_, newSize, &indexMapSize_, &indexMapSizeAlloc_, deviceContext_); + reallocateDeviceBuffer(&d_sendBuf_, newSize, &sendBufSize_, &sendBufSizeAlloc_, deviceContext_); + reallocateDeviceBuffer(&d_recvBuf_, newSize, &recvBufSize_, &recvBufSizeAlloc_, deviceContext_); maxPackedBufferSize_ = newSize; } @@ -448,7 +448,7 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, changePinningPolicy(&h_indexMap_, gmx::PinningPolicy::PinnedIfSupported); - allocateDeviceBuffer(&d_fShift_, 1, nullptr); + allocateDeviceBuffer(&d_fShift_, 1, deviceContext_); } GpuHaloExchange::Impl::~Impl() diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh index b139a9b491..a8d2f9204c 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh @@ -47,6 +47,7 @@ #define GMX_DOMDEC_GPUHALOEXCHANGE_IMPL_H #include "gromacs/domdec/gpuhaloexchange.h" +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/gpu_utils/hostallocator.h" #include "gromacs/utility/gmxmpi.h" @@ -175,6 +176,8 @@ private: GpuEventSynchronizer* haloDataTransferLaunched_ = nullptr; //! MPI communicator used for simulation MPI_Comm mpi_comm_mysim_; + //! Dummy GPU context object + const DeviceContext deviceContext_; //! CUDA stream for local non-bonded calculations cudaStream_t localStream_ = nullptr; //! CUDA stream for non-local non-bonded calculations diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index ef89a5bb79..40a34682c0 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -71,6 +71,7 @@ struct gmx_pme_t; struct gmx_wallcycle; struct NumPmeDomains; +class DeviceContext; enum class GpuTaskCompletion; class PmeGpuProgram; class GpuEventSynchronizer; @@ -439,7 +440,7 @@ GPU_FUNC_QUALIFIER void* pme_gpu_get_device_stream(const gmx_pme_t* GPU_FUNC_ARG * \param[in] pme The PME data structure. * \returns Pointer to GPU context object. */ -GPU_FUNC_QUALIFIER void* pme_gpu_get_device_context(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme)) +GPU_FUNC_QUALIFIER const DeviceContext* pme_gpu_get_device_context(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr); /*!
\brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp index 7f092304b5..b4cec47135 100644 --- a/src/gromacs/ewald/pme_gpu.cpp +++ b/src/gromacs/ewald/pme_gpu.cpp @@ -442,12 +442,11 @@ void* pme_gpu_get_device_stream(const gmx_pme_t* pme) return pme_gpu_get_stream(pme->gpu); } -void* pme_gpu_get_device_context(const gmx_pme_t* pme) +const DeviceContext* pme_gpu_get_device_context(const gmx_pme_t* pme) { - if (!pme || !pme_gpu_active(pme)) - { - return nullptr; - } + GMX_RELEASE_ASSERT(pme, "GPU context requested from PME before PME was constructed."); + GMX_RELEASE_ASSERT(pme_gpu_active(pme), + "GPU context requested from PME, but PME is running on the CPU."); return pme_gpu_get_context(pme->gpu); } diff --git a/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp b/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp index 2b14dc7567..c6e1b6448c 100644 --- a/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp +++ b/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp @@ -80,7 +80,7 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu) == kernelParamsPtr->grid.complexGridSize[i], "Complex padding not implemented"); } - cl_context context = pmeGpu->archSpecific->context; + cl_context context = pmeGpu->archSpecific->deviceContext_.context(); commandStreams_.push_back(pmeGpu->archSpecific->pmeStream); realGrid_ = kernelParamsPtr->grid.d_realGrid; complexGrid_ = kernelParamsPtr->grid.d_fourierGrid; diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index f9932eb36b..17dd6805c0 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -142,7 +142,7 @@ void pme_gpu_alloc_energy_virial(PmeGpu* pmeGpu) { const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float); allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy, c_virialAndEnergyCount, - pmeGpu->archSpecific->context); + pmeGpu->archSpecific->deviceContext_); pmalloc(reinterpret_cast(&pmeGpu->staging.h_virialAndEnergy), energyAndVirialSize); } @@ -172,7 +172,8 @@ void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu) const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize); reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize, &pmeGpu->archSpecific->splineValuesSize, - &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->context); + &pmeGpu->archSpecific->splineValuesSizeAlloc, + pmeGpu->archSpecific->deviceContext_); if (shouldRealloc) { /* Reallocate the host buffer */ @@ -202,8 +203,8 @@ void pme_gpu_realloc_forces(PmeGpu* pmeGpu) const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM; GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU"); reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize, - &pmeGpu->archSpecific->forcesSize, - &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->context); + &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc, + pmeGpu->archSpecific->deviceContext_); pmeGpu->staging.h_forces.reserveWithPadding(pmeGpu->nAtomsAlloc); pmeGpu->staging.h_forces.resizeWithPadding(pmeGpu->kernelParams->atoms.nAtoms); } @@ -238,7 +239,8 @@ void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_ GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU"); reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize, 
&pmeGpu->archSpecific->coefficientsSize, - &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->context); + &pmeGpu->archSpecific->coefficientsSizeAlloc, + pmeGpu->archSpecific->deviceContext_); copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, const_cast(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr); @@ -270,11 +272,11 @@ void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu) const bool shouldRealloc = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize); int currentSizeTemp = pmeGpu->archSpecific->splineDataSize; int currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc; - reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize, - ¤tSizeTemp, ¤tSizeTempAlloc, pmeGpu->archSpecific->context); + reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize, ¤tSizeTemp, + ¤tSizeTempAlloc, pmeGpu->archSpecific->deviceContext_); reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize, - &pmeGpu->archSpecific->splineDataSize, - &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->context); + &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc, + pmeGpu->archSpecific->deviceContext_); // the host side reallocation if (shouldRealloc) { @@ -300,7 +302,8 @@ void pme_gpu_realloc_grid_indices(PmeGpu* pmeGpu) GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU"); reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSize, - &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->context); + &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, + pmeGpu->archSpecific->deviceContext_); pfree(pmeGpu->staging.h_gridlineIndices); pmalloc(reinterpret_cast(&pmeGpu->staging.h_gridlineIndices), newIndicesSize * sizeof(int)); } @@ -326,10 +329,11 @@ void pme_gpu_realloc_grids(PmeGpu* pmeGpu) /* 2 separate grids */ reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize, &pmeGpu->archSpecific->complexGridSize, - &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->context); - reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newRealGridSize, - &pmeGpu->archSpecific->realGridSize, - &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context); + &pmeGpu->archSpecific->complexGridSizeAlloc, + pmeGpu->archSpecific->deviceContext_); + reallocateDeviceBuffer( + &kernelParamsPtr->grid.d_realGrid, newRealGridSize, &pmeGpu->archSpecific->realGridSize, + &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->deviceContext_); } else { @@ -337,7 +341,7 @@ void pme_gpu_realloc_grids(PmeGpu* pmeGpu) const int newGridsSize = std::max(newRealGridSize, newComplexGridSize); reallocateDeviceBuffer( &kernelParamsPtr->grid.d_realGrid, newGridsSize, &pmeGpu->archSpecific->realGridSize, - &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context); + &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->deviceContext_); kernelParamsPtr->grid.d_fourierGrid = kernelParamsPtr->grid.d_realGrid; pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize; // the size might get used later for copying the grid @@ -385,9 +389,9 @@ void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu) #elif GMX_GPU == GMX_GPU_OPENCL // No dedicated texture routines.... 
allocateDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, newFractShiftsSize, - pmeGpu->archSpecific->context); + pmeGpu->archSpecific->deviceContext_); allocateDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, newFractShiftsSize, - pmeGpu->archSpecific->context); + pmeGpu->archSpecific->deviceContext_); copyToDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, pmeGpu->common->fsh.data(), 0, newFractShiftsSize, pmeGpu->archSpecific->pmeStream, GpuApiCallBehavior::Async, nullptr); @@ -486,7 +490,7 @@ void pme_gpu_init_internal(PmeGpu* pmeGpu) #endif /* Allocate the target-specific structures */ - pmeGpu->archSpecific.reset(new PmeGpuSpecific()); + pmeGpu->archSpecific.reset(new PmeGpuSpecific(pmeGpu->programHandle_->impl_->deviceContext_)); pmeGpu->kernelParams.reset(new PmeGpuKernelParams()); pmeGpu->archSpecific->performOutOfPlaceFFT = true; @@ -495,9 +499,6 @@ void pme_gpu_init_internal(PmeGpu* pmeGpu) * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7). */ - // TODO: this is just a convenient reuse because programHandle_ currently is in charge of creating context - pmeGpu->archSpecific->context = pmeGpu->programHandle_->impl_->context; - // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?) and reuse in NB if (GMX_GPU == GMX_GPU_CUDA) { @@ -537,8 +538,8 @@ void pme_gpu_init_internal(PmeGpu* pmeGpu) pmeGpu->archSpecific->useTiming ? CL_QUEUE_PROFILING_ENABLE : 0; cl_device_id device_id = pmeGpu->deviceInfo->oclDeviceId; cl_int clError; - pmeGpu->archSpecific->pmeStream = - clCreateCommandQueue(pmeGpu->archSpecific->context, device_id, queueProperties, &clError); + pmeGpu->archSpecific->pmeStream = clCreateCommandQueue( + pmeGpu->archSpecific->deviceContext_.context(), device_id, queueProperties, &clError); if (clError != CL_SUCCESS) { GMX_THROW(gmx::InternalError("Failed to create PME command queue")); @@ -1526,16 +1527,12 @@ void* pme_gpu_get_stream(const PmeGpu* pmeGpu) } } -void* pme_gpu_get_context(const PmeGpu* pmeGpu) +const DeviceContext* pme_gpu_get_context(const PmeGpu* pmeGpu) { - if (pmeGpu) - { - return static_cast(&pmeGpu->archSpecific->context); - } - else - { - return nullptr; - } + GMX_RELEASE_ASSERT( + pmeGpu, + "GPU context object was requested, but PME GPU object was not (yet) initialized."); + return &pmeGpu->archSpecific->deviceContext_; } GpuEventSynchronizer* pme_gpu_get_forces_ready_synchronizer(const PmeGpu* pmeGpu) diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h index 0c5b53ab0e..a9dc9677ce 100644 --- a/src/gromacs/ewald/pme_gpu_internal.h +++ b/src/gromacs/ewald/pme_gpu_internal.h @@ -412,7 +412,7 @@ GPU_FUNC_QUALIFIER void* pme_gpu_get_stream(const PmeGpu* GPU_FUNC_ARGUMENT(pmeG * \param[in] pmeGpu The PME GPU structure. * \returns Pointer to context object. */ -GPU_FUNC_QUALIFIER void* pme_gpu_get_context(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) +GPU_FUNC_QUALIFIER const DeviceContext* pme_gpu_get_context(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr); /*! 
\brief Return pointer to the sync object triggered after the PME force calculation completion diff --git a/src/gromacs/ewald/pme_gpu_program_impl.h b/src/gromacs/ewald/pme_gpu_program_impl.h index 2c79f20a71..0b007c3966 100644 --- a/src/gromacs/ewald/pme_gpu_program_impl.h +++ b/src/gromacs/ewald/pme_gpu_program_impl.h @@ -44,6 +44,7 @@ #include "config.h" +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gputraits.h" #include "gromacs/utility/classhelpers.h" @@ -77,7 +78,7 @@ struct PmeGpuProgramImpl * TODO: Later we want to be able to own the context at a higher level and not here, * but this class would still need the non-owning context handle to build the kernels. */ - DeviceContext context; + DeviceContext deviceContext_; //! Conveniently all the PME kernels use the same single argument type #if GMX_GPU == GMX_GPU_CUDA diff --git a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp index ec220c7c2a..1353a99ed2 100644 --- a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp +++ b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp @@ -64,7 +64,7 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation* deviceInfo) contextProperties[2] = 0; /* Terminates the list of properties */ cl_int clError; - context = clCreateContext(contextProperties, 1, &deviceId, nullptr, nullptr, &clError); + deviceContext_.setContext(clCreateContext(contextProperties, 1, &deviceId, nullptr, nullptr, &clError)); if (clError != CL_SUCCESS) { const std::string errorString = gmx::formatString( @@ -74,7 +74,7 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation* deviceInfo) } // kernel parameters - warpSize = gmx::ocl::getDeviceWarpSize(context, deviceId); + warpSize = gmx::ocl::getDeviceWarpSize(deviceContext_.context(), deviceId); // TODO: for Intel ideally we'd want to set these based on the compiler warp size // but given that we've done no tuning for Intel iGPU, this is as good as anything. spreadWorkGroupSize = std::min(c_spreadMaxWarpsPerBlock * warpSize, deviceInfo->maxWorkGroupSize); @@ -96,7 +96,6 @@ PmeGpuProgramImpl::~PmeGpuProgramImpl() stat |= clReleaseKernel(solveXYZEnergyKernel); stat |= clReleaseKernel(solveYZXKernel); stat |= clReleaseKernel(solveYZXEnergyKernel); - stat |= clReleaseContext(context); GMX_ASSERT(stat == CL_SUCCESS, gmx::formatString("Failed to release PME OpenCL resources %d: %s", stat, ocl_get_error_string(stat).c_str()) @@ -165,8 +164,8 @@ void PmeGpuProgramImpl::compileKernels(const DeviceInformation* deviceInfo) /* TODO when we have a proper MPI-aware logging module, the log output here should be written there */ program = gmx::ocl::compileProgram(stderr, "gromacs/ewald", "pme_program.cl", - commonDefines, context, deviceInfo->oclDeviceId, - deviceInfo->deviceVendor); + commonDefines, deviceContext_.context(), + deviceInfo->oclDeviceId, deviceInfo->deviceVendor); } catch (gmx::GromacsException& e) { diff --git a/src/gromacs/ewald/pme_gpu_types_host_impl.h b/src/gromacs/ewald/pme_gpu_types_host_impl.h index be4f986957..44ca3fd3c3 100644 --- a/src/gromacs/ewald/pme_gpu_types_host_impl.h +++ b/src/gromacs/ewald/pme_gpu_types_host_impl.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. 
* - * Copyright (c) 2018,2019, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -60,6 +60,8 @@ #include "gromacs/timing/gpu_timing.h" // for gtPME_EVENT_COUNT +#include "pme_gpu_3dfft.h" + class GpuParallel3dFft; /*! \internal \brief @@ -67,6 +69,11 @@ class GpuParallel3dFft; */ struct PmeGpuSpecific { + /*! \brief Constructor + * + * \param[in] deviceContext GPU device context. + */ + PmeGpuSpecific(const DeviceContext& deviceContext) : deviceContext_(deviceContext) {} /*! \brief The GPU stream where everything related to the PME happens. */ CommandStream pmeStream; @@ -76,7 +83,7 @@ struct PmeGpuSpecific * but should be a constructor parameter to PmeGpu, as well as PmeGpuProgram, * managed by high-level code. */ - DeviceContext context; + const DeviceContext& deviceContext_; /* Synchronization events */ /*! \brief Triggered after the PME Force Calculations have been completed */ @@ -86,13 +93,13 @@ struct PmeGpuSpecific /* Settings which are set at the start of the run */ /*! \brief A boolean which tells whether the complex and real grids for cu/clFFT are different or same. Currenty true. */ - bool performOutOfPlaceFFT; + bool performOutOfPlaceFFT = false; /*! \brief A boolean which tells if the GPU timing events are enabled. * False by default, can be enabled by setting the environment variable GMX_ENABLE_GPU_TIMING. * Note: will not be reliable when multiple GPU tasks are running concurrently on the same * device context, as CUDA events on multiple streams are untrustworthy. */ - bool useTiming; + bool useTiming = false; //! Vector of FFT setups std::vector> fftSetup; @@ -112,37 +119,37 @@ struct PmeGpuSpecific * TODO: these should live in a clean buffered container type, and be refactored in the NB/cudautils as well. */ /*! \brief The kernelParams.atoms.coordinates float element count (actual)*/ - int coordinatesSize; + int coordinatesSize = 0; /*! \brief The kernelParams.atoms.coordinates float element count (reserved) */ - int coordinatesSizeAlloc; + int coordinatesSizeAlloc = 0; /*! \brief The kernelParams.atoms.forces float element count (actual) */ - int forcesSize; + int forcesSize = 0; /*! \brief The kernelParams.atoms.forces float element count (reserved) */ - int forcesSizeAlloc; + int forcesSizeAlloc = 0; /*! \brief The kernelParams.atoms.gridlineIndices int element count (actual) */ - int gridlineIndicesSize; + int gridlineIndicesSize = 0; /*! \brief The kernelParams.atoms.gridlineIndices int element count (reserved) */ - int gridlineIndicesSizeAlloc; + int gridlineIndicesSizeAlloc = 0; /*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (actual) */ - int splineDataSize; + int splineDataSize = 0; /*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (reserved) */ - int splineDataSizeAlloc; + int splineDataSizeAlloc = 0; /*! \brief The kernelParams.atoms.coefficients float element count (actual) */ - int coefficientsSize; + int coefficientsSize = 0; /*! \brief The kernelParams.atoms.coefficients float element count (reserved) */ - int coefficientsSizeAlloc; + int coefficientsSizeAlloc = 0; /*! \brief The kernelParams.grid.splineValuesArray float element count (actual) */ - int splineValuesSize; + int splineValuesSize = 0; /*! 
\brief The kernelParams.grid.splineValuesArray float element count (reserved) */ - int splineValuesSizeAlloc; + int splineValuesSizeAlloc = 0; /*! \brief The kernelParams.grid.realGrid float element count (actual) */ - int realGridSize; + int realGridSize = 0; /*! \brief The kernelParams.grid.realGrid float element count (reserved) */ - int realGridSizeAlloc; + int realGridSizeAlloc = 0; /*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (actual) */ - int complexGridSize; + int complexGridSize = 0; /*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (reserved) */ - int complexGridSizeAlloc; + int complexGridSizeAlloc = 0; }; #endif diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp index eed93cd2e3..2ee17b3267 100644 --- a/src/gromacs/ewald/pme_only.cpp +++ b/src/gromacs/ewald/pme_only.cpp @@ -628,8 +628,8 @@ int gmx_pmeonly(struct gmx_pme_t* pme, const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed); if (useGpuForPme) { - const void* commandStream = pme_gpu_get_device_stream(pme); - const void* deviceContext = pme_gpu_get_device_context(pme); + const void* commandStream = pme_gpu_get_device_stream(pme); + const DeviceContext& deviceContext = *pme_gpu_get_device_context(pme); changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy()); changePinningPolicy(&pme_pp->x, pme_get_pinning_policy()); diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu index 827a1bab34..29cb73d0ca 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019, by the GROMACS development team, led by + * Copyright (c) 2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -73,7 +73,8 @@ void PmePpCommGpu::Impl::reinit(int size) MPI_Recv(&remotePmeFBuffer_, sizeof(void**), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE); // Reallocate buffer used for staging PME force on GPU - reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, nullptr); + reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, + DeviceContext()); #else GMX_UNUSED_VALUE(size); #endif diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp index c5b93d94c2..81edf195fe 100644 --- a/src/gromacs/ewald/tests/pmetestcommon.cpp +++ b/src/gromacs/ewald/tests/pmetestcommon.cpp @@ -166,7 +166,7 @@ std::unique_ptr makeStatePropagatorDataGpu(const gmx_pme // TODO: Special constructor for PME-only rank / PME-tests is used here. There should be a mechanism to // restrict one from using other constructor here. return std::make_unique( - pme_gpu_get_device_stream(&pme), pme_gpu_get_device_context(&pme), + pme_gpu_get_device_stream(&pme), *pme_gpu_get_device_context(&pme), GpuApiCallBehavior::Sync, pme_gpu_get_padding_size(&pme), nullptr); } diff --git a/src/gromacs/gpu_utils/CMakeLists.txt b/src/gromacs/gpu_utils/CMakeLists.txt index 942141d288..8672e450ca 100644 --- a/src/gromacs/gpu_utils/CMakeLists.txt +++ b/src/gromacs/gpu_utils/CMakeLists.txt @@ -1,7 +1,7 @@ # # This file is part of the GROMACS molecular simulation package. 
# -# Copyright (c) 2015,2016,2017,2018,2019, by the GROMACS development team, led by +# Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. @@ -44,6 +44,7 @@ gmx_add_libgromacs_sources( ) if(GMX_USE_OPENCL) gmx_add_libgromacs_sources( + device_context_ocl.cpp gpu_utils_ocl.cpp ocl_compiler.cpp ocl_caching.cpp diff --git a/src/gromacs/gpu_utils/device_context.h b/src/gromacs/gpu_utils/device_context.h new file mode 100644 index 0000000000..d192b5543f --- /dev/null +++ b/src/gromacs/gpu_utils/device_context.h @@ -0,0 +1,83 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +#ifndef GMX_GPU_UTILS_DEVICE_CONTEXT_H +#define GMX_GPU_UTILS_DEVICE_CONTEXT_H + +/*! \libinternal \file + * + * \brief Declarations for DeviceContext class. + * + * Only needed for OpenCL builds. Other platforms will be given a stub class. + * + * \author Mark Abraham + * \author Artem Zhmurov + * + * \ingroup module_gpu_utils + * \inlibraryapi + */ + +#include "config.h" + +#if GMX_GPU == GMX_GPU_OPENCL +# include "gromacs/gpu_utils/device_context_ocl.h" +#else +# include "gromacs/utility/classhelpers.h" + +struct DeviceInformation; + +// Stub for device context +class DeviceContext +{ +public: + //! Default constructor. In OpenCL leaves context \c nullptr. + DeviceContext() {} + /*! \brief Second stage of construction. Creates the \c cl_context in OpenCL, does nothing in CUDA. + * + * \param[in] deviceInfo Platform-specific device information. + */ + void init(const DeviceInformation& /*deviceInfo*/) {} + /*! \brief Construct the object and call \c init(...) . + * + * \param[in] deviceInfo Platform-specific device information. 
+ */ + DeviceContext(const DeviceInformation& deviceInfo) { init(deviceInfo); } + //! Destructor + ~DeviceContext() = default; + + GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceContext); +}; +#endif // GMX_GPU != GMX_GPU_OPENCL + +#endif // GMX_GPU_UTILS_DEVICE_CONTEXT_H diff --git a/src/gromacs/gpu_utils/device_context_ocl.cpp b/src/gromacs/gpu_utils/device_context_ocl.cpp new file mode 100644 index 0000000000..e5dfd5939d --- /dev/null +++ b/src/gromacs/gpu_utils/device_context_ocl.cpp @@ -0,0 +1,122 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Implements the DeviceContext for OpenCL + * + * \author Mark Abraham + * \author Artem Zhmurov + * + * \ingroup module_gpu_utils + */ +#include "gmxpre.h" + +#include "device_context_ocl.h" + +#include "gromacs/gpu_utils/gputraits.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/stringutil.h" + +/*! \brief Copies of values from cl_driver_diagnostics_intel.h, + * which isn't guaranteed to be available. 
*/ +/**@{*/ +#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106 +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL 0x1 +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL 0x2 +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL 0x4 +/**@}*/ + +DeviceContext::DeviceContext() +{ + context_ = nullptr; +} + +void DeviceContext::init(const DeviceInformation& deviceInfo) +{ + cl_platform_id platformId = deviceInfo.oclPlatformId; + cl_device_id deviceId = deviceInfo.oclDeviceId; + std::vector contextProperties; + + contextProperties.emplace_back(CL_CONTEXT_PLATFORM); + contextProperties.emplace_back(reinterpret_cast(platformId)); + + if (getenv("GMX_OCL_SHOW_DIAGNOSTICS")) + { + contextProperties.emplace_back(CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL); + contextProperties.emplace_back(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL + | CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL); + } + contextProperties.emplace_back(0); + + cl_int clError; + context_ = clCreateContext(contextProperties.data(), 1, &deviceId, nullptr, nullptr, &clError); + if (clError != CL_SUCCESS) + { + GMX_THROW(gmx::InternalError(gmx::formatString( + "Failed to create OpenCL context on device %s (OpenCL error ID %d).", + deviceInfo.device_name, clError))); + } +} + +DeviceContext::DeviceContext(const DeviceInformation& deviceInfo) +{ + init(deviceInfo); +} + +DeviceContext::~DeviceContext() +{ + cl_int clError; + + if (context_) + { + clError = clReleaseContext(context_); + GMX_RELEASE_ASSERT( + clError == CL_SUCCESS, + gmx::formatString("Failed to release OpenCL context (OpenCL error ID %d).", clError).c_str()); + context_ = nullptr; + } +} + +cl_context DeviceContext::context() const +{ + return context_; +} + +void DeviceContext::setContext(cl_context context) +{ + context_ = context; +} diff --git a/src/gromacs/gpu_utils/device_context_ocl.h b/src/gromacs/gpu_utils/device_context_ocl.h new file mode 100644 index 0000000000..58bb75595e --- /dev/null +++ b/src/gromacs/gpu_utils/device_context_ocl.h @@ -0,0 +1,92 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. 
Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +#ifndef GMX_GPU_UTILS_DEVICE_CONTEXT_OCL_H +#define GMX_GPU_UTILS_DEVICE_CONTEXT_OCL_H + +/*! \libinternal \file + * + * \brief Declarations for DeviceContext class. + * + * Only needed for OpenCL builds. Other platforms will be given a stub class. + * + * \author Mark Abraham + * \author Artem Zhmurov + * + * \ingroup module_gpu_utils + * \inlibraryapi + */ + +#include "gromacs/gpu_utils/gmxopencl.h" +#include "gromacs/utility/classhelpers.h" + +struct DeviceInformation; + +// OpenCL device context class +class DeviceContext +{ +public: + //! Default constructor. Sets \c context_ to \c nullptr. + DeviceContext(); + /*! \brief Second stage of construction. Creates the \c cl_context. + * + * \param[in] deviceInfo Platform-specific device information. + * + * \throws InternalError if context creation failed. + */ + void init(const DeviceInformation& deviceInfo); + /*! \brief Construct the object and call \c init(...) . + * + * \param[in] deviceInfo Platform-specific device information. + * + * \throws InternalError if context creation failed. + */ + DeviceContext(const DeviceInformation& deviceInfo); + //! Destructor + ~DeviceContext(); + + //! Getter + cl_context context() const; + + //! Transition time setter - will be removed + void setContext(cl_context context); + + GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceContext); + +private: + //! OpenCL context object + cl_context context_ = nullptr; +}; + +#endif // GMX_GPU_UTILS_DEVICE_CONTEXT_OCL_H diff --git a/src/gromacs/gpu_utils/devicebuffer.cuh b/src/gromacs/gpu_utils/devicebuffer.cuh index 3a1bd3d195..59255bfa93 100644 --- a/src/gromacs/gpu_utils/devicebuffer.cuh +++ b/src/gromacs/gpu_utils/devicebuffer.cuh @@ -45,6 +45,7 @@ * \inlibraryapi */ +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/gpu_utils.h" //only for GpuApiCallBehavior #include "gromacs/gpu_utils/gputraits.cuh" @@ -60,7 +61,7 @@ * \param[in] deviceContext The buffer's dummy device context - not managed explicitly in CUDA RT. */ template -void allocateDeviceBuffer(DeviceBuffer* buffer, size_t numValues, DeviceContext /* deviceContext */) +void allocateDeviceBuffer(DeviceBuffer* buffer, size_t numValues, const DeviceContext& /* deviceContext */) { GMX_ASSERT(buffer, "needs a buffer pointer"); cudaError_t stat = cudaMalloc((void**)buffer, numValues * sizeof(ValueType)); diff --git a/src/gromacs/gpu_utils/devicebuffer.h b/src/gromacs/gpu_utils/devicebuffer.h index 4c28d49613..c0cdfec329 100644 --- a/src/gromacs/gpu_utils/devicebuffer.h +++ b/src/gromacs/gpu_utils/devicebuffer.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -80,7 +80,7 @@ void reallocateDeviceBuffer(DeviceBuffer* buffer, size_t numValues, int* currentNumValues, int* currentMaxNumValues, - DeviceContext deviceContext) + const DeviceContext& deviceContext) { GMX_ASSERT(buffer, "needs a buffer pointer"); GMX_ASSERT(currentNumValues, "needs a size pointer"); diff --git a/src/gromacs/gpu_utils/devicebuffer_ocl.h b/src/gromacs/gpu_utils/devicebuffer_ocl.h index 40f1e12941..ee1adc1cce 100644 --- a/src/gromacs/gpu_utils/devicebuffer_ocl.h +++ b/src/gromacs/gpu_utils/devicebuffer_ocl.h @@ -45,6 +45,7 @@ * \inlibraryapi */ +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/gpu_utils.h" //only for GpuApiCallBehavior #include "gromacs/gpu_utils/gputraits_ocl.h" @@ -62,13 +63,13 @@ * \param[in] deviceContext The buffer's device context-to-be. */ template -void allocateDeviceBuffer(DeviceBuffer* buffer, size_t numValues, DeviceContext deviceContext) +void allocateDeviceBuffer(DeviceBuffer* buffer, size_t numValues, const DeviceContext& deviceContext) { GMX_ASSERT(buffer, "needs a buffer pointer"); void* hostPtr = nullptr; cl_int clError; - *buffer = clCreateBuffer(deviceContext, CL_MEM_READ_WRITE, numValues * sizeof(ValueType), - hostPtr, &clError); + *buffer = clCreateBuffer(deviceContext.context(), CL_MEM_READ_WRITE, + numValues * sizeof(ValueType), hostPtr, &clError); GMX_RELEASE_ASSERT(clError == CL_SUCCESS, gmx::formatString("clCreateBuffer failure (OpenCL error %d: %s)", clError, ocl_get_error_string(clError).c_str()) diff --git a/src/gromacs/gpu_utils/gputraits.cuh b/src/gromacs/gpu_utils/gputraits.cuh index 8a4936dabc..b477cdcb4c 100644 --- a/src/gromacs/gpu_utils/gputraits.cuh +++ b/src/gromacs/gpu_utils/gputraits.cuh @@ -37,7 +37,9 @@ /*! \libinternal \file * \brief Declares the CUDA type traits. + * * \author Aleksei Iupinov + * \author Artem Zhmurov * * \inlibraryapi * \ingroup module_gpu_utils @@ -63,8 +65,6 @@ struct DeviceInformation using CommandStream = cudaStream_t; //! \brief Single GPU call timing event - meaningless in CUDA using CommandEvent = void; -//! \brief Context used explicitly in OpenCL, does nothing in CUDA -using DeviceContext = void*; /*! \internal \brief * GPU kernels scheduling description. This is same in OpenCL/CUDA. diff --git a/src/gromacs/gpu_utils/gputraits.h b/src/gromacs/gpu_utils/gputraits.h index 0229ea443c..a36a5cc3bc 100644 --- a/src/gromacs/gpu_utils/gputraits.h +++ b/src/gromacs/gpu_utils/gputraits.h @@ -36,8 +36,10 @@ #define GMX_GPU_UTILS_GPUTRAITS_H /*! \libinternal \file - * \brief Declares the GPU type traits for non-GPU builds + * \brief Declares the GPU type traits for non-GPU builds. + * * \author Mark Abraham + * \author Artem Zhmurov * * \inlibraryapi * \ingroup module_gpu_utils @@ -65,8 +67,6 @@ struct DeviceInformation using CommandStream = void*; //! \brief Single GPU call timing event using CommandEvent = void*; -//! \brief GPU context -using DeviceContext = void*; #endif // GMX_GPU diff --git a/src/gromacs/gpu_utils/gputraits_ocl.h b/src/gromacs/gpu_utils/gputraits_ocl.h index 00d9cba90d..caf837552a 100644 --- a/src/gromacs/gpu_utils/gputraits_ocl.h +++ b/src/gromacs/gpu_utils/gputraits_ocl.h @@ -37,7 +37,9 @@ /*! \libinternal \file * \brief Declares the OpenCL type traits. + * * \author Aleksei Iupinov + * \author Artem Zhmurov * * \inlibraryapi * \ingroup module_gpu_utils @@ -81,8 +83,6 @@ struct DeviceInformation using CommandStream = cl_command_queue; //! 
\brief Single GPU call timing event using CommandEvent = cl_event; -//! \brief Context used explicitly in OpenCL -using DeviceContext = cl_context; /*! \internal \brief * GPU kernels scheduling description. This is same in OpenCL/CUDA. diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h index 91b6059d27..230b3ff94e 100644 --- a/src/gromacs/gpu_utils/oclutils.h +++ b/src/gromacs/gpu_utils/oclutils.h @@ -44,6 +44,7 @@ #include +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gmxopencl.h" #include "gromacs/gpu_utils/gputraits_ocl.h" #include "gromacs/utility/exceptions.h" @@ -63,8 +64,10 @@ enum class GpuApiCallBehavior; */ struct gmx_device_runtime_data_t { - cl_context context; /**< OpenCL context */ - cl_program program; /**< OpenCL program */ + //! OpenCL context + DeviceContext deviceContext; + //! OpenCL program + cl_program program; }; /*! \brief Launches synchronous or asynchronous device to host memory copy. diff --git a/src/gromacs/gpu_utils/tests/typecasts_runner.cu b/src/gromacs/gpu_utils/tests/typecasts_runner.cu index 221959fe79..e16dd8ebf5 100644 --- a/src/gromacs/gpu_utils/tests/typecasts_runner.cu +++ b/src/gromacs/gpu_utils/tests/typecasts_runner.cu @@ -110,15 +110,18 @@ static __global__ void convertRVecToFloat3OnDevice_kernel(DeviceBuffer g void convertRVecToFloat3OnDevice(std::vector& h_rVecOutput, const std::vector& h_rVecInput) { + DeviceInformation deviceInfo; + const DeviceContext deviceContext(deviceInfo); + const int numElements = h_rVecInput.size(); DeviceBuffer d_rVecInput; - allocateDeviceBuffer(&d_rVecInput, numElements, nullptr); + allocateDeviceBuffer(&d_rVecInput, numElements, deviceContext); copyToDeviceBuffer(&d_rVecInput, h_rVecInput.data(), 0, numElements, nullptr, GpuApiCallBehavior::Sync, nullptr); DeviceBuffer d_float3Output; - allocateDeviceBuffer(&d_float3Output, numElements * DIM, nullptr); + allocateDeviceBuffer(&d_float3Output, numElements * DIM, deviceContext); std::vector h_float3Output(numElements); diff --git a/src/gromacs/listed_forces/gpubonded_impl.cu b/src/gromacs/listed_forces/gpubonded_impl.cu index bc10e76066..ff7092f40c 100644 --- a/src/gromacs/listed_forces/gpubonded_impl.cu +++ b/src/gromacs/listed_forces/gpubonded_impl.cu @@ -68,14 +68,14 @@ GpuBonded::Impl::Impl(const gmx_ffparams_t& ffparams, void* streamPtr, gmx_wallc stream_ = *static_cast(streamPtr); wcycle_ = wcycle; - allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), nullptr); + allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), deviceContext_); // This could be an async transfer (if the source is pinned), so // long as it uses the same stream as the kernels and we are happy // to consume additional pinned pages. 
copyToDeviceBuffer(&d_forceParams_, ffparams.iparams.data(), 0, ffparams.numTypes(), stream_, GpuApiCallBehavior::Sync, nullptr); vTot_.resize(F_NRE); - allocateDeviceBuffer(&d_vTot_, F_NRE, nullptr); + allocateDeviceBuffer(&d_vTot_, F_NRE, deviceContext_); clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_); kernelParams_.d_forceParams = d_forceParams_; @@ -204,7 +204,8 @@ void GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef { t_ilist& d_iList = d_iLists_[fType]; - reallocateDeviceBuffer(&d_iList.iatoms, iList.size(), &d_iList.nr, &d_iList.nalloc, nullptr); + reallocateDeviceBuffer(&d_iList.iatoms, iList.size(), &d_iList.nr, &d_iList.nalloc, + deviceContext_); copyToDeviceBuffer(&d_iList.iatoms, iList.iatoms.data(), 0, iList.size(), stream_, GpuApiCallBehavior::Async, nullptr); diff --git a/src/gromacs/listed_forces/gpubonded_impl.h b/src/gromacs/listed_forces/gpubonded_impl.h index a785b16dbb..0532b40315 100644 --- a/src/gromacs/listed_forces/gpubonded_impl.h +++ b/src/gromacs/listed_forces/gpubonded_impl.h @@ -48,6 +48,7 @@ #ifndef GMX_LISTED_FORCES_GPUBONDED_IMPL_H #define GMX_LISTED_FORCES_GPUBONDED_IMPL_H +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gputraits.cuh" #include "gromacs/gpu_utils/hostallocator.h" #include "gromacs/listed_forces/gpubonded.h" @@ -179,6 +180,8 @@ private: //! \brief Device-side total virial float* d_vTot_ = nullptr; + //! Dummy GPU context object + const DeviceContext deviceContext_; //! \brief Bonded GPU stream, not owned by this module CommandStream stream_; diff --git a/src/gromacs/mdlib/leapfrog_gpu.cu b/src/gromacs/mdlib/leapfrog_gpu.cu index 61bc231e36..b77162c1af 100644 --- a/src/gromacs/mdlib/leapfrog_gpu.cu +++ b/src/gromacs/mdlib/leapfrog_gpu.cu @@ -316,7 +316,9 @@ void LeapFrogGpu::integrate(const float3* d_x, return; } -LeapFrogGpu::LeapFrogGpu(CommandStream commandStream) : commandStream_(commandStream) +LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext, CommandStream commandStream) : + deviceContext_(deviceContext), + commandStream_(commandStream) { numAtoms_ = 0; @@ -342,7 +344,7 @@ void LeapFrogGpu::set(const t_mdatoms& md, const int numTempScaleValues, const u numTempScaleValues_ = numTempScaleValues; reallocateDeviceBuffer(&d_inverseMasses_, numAtoms_, &numInverseMasses_, - &numInverseMassesAlloc_, nullptr); + &numInverseMassesAlloc_, deviceContext_); copyToDeviceBuffer(&d_inverseMasses_, (float*)md.invmass, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr); @@ -350,7 +352,7 @@ void LeapFrogGpu::set(const t_mdatoms& md, const int numTempScaleValues, const u if (numTempScaleValues > 1) { reallocateDeviceBuffer(&d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_, - &numTempScaleGroupsAlloc_, nullptr); + &numTempScaleGroupsAlloc_, deviceContext_); copyToDeviceBuffer(&d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr); } @@ -359,7 +361,8 @@ void LeapFrogGpu::set(const t_mdatoms& md, const int numTempScaleValues, const u if (numTempScaleValues_ > 0) { h_lambdas_.resize(numTempScaleValues); - reallocateDeviceBuffer(&d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, nullptr); + reallocateDeviceBuffer(&d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, + deviceContext_); } } diff --git a/src/gromacs/mdlib/leapfrog_gpu.cuh b/src/gromacs/mdlib/leapfrog_gpu.cuh index ba6d1be3f2..98703c05b9 100644 --- a/src/gromacs/mdlib/leapfrog_gpu.cuh +++ b/src/gromacs/mdlib/leapfrog_gpu.cuh @@ -44,6 +44,7 @@ 
#ifndef GMX_MDLIB_LEAPFROG_GPU_CUH #define GMX_MDLIB_LEAPFROG_GPU_CUH +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gputraits.cuh" #include "gromacs/gpu_utils/hostallocator.h" #include "gromacs/mdtypes/group.h" @@ -62,9 +63,10 @@ class LeapFrogGpu public: /*! \brief Constructor. * + * \param[in] deviceContext Device context (dummy in CUDA). * \param[in] commandStream Device command stream to use. */ - LeapFrogGpu(CommandStream commandStream); + LeapFrogGpu(const DeviceContext& deviceContext, CommandStream commandStream); ~LeapFrogGpu(); /*! \brief Integrate @@ -110,6 +112,8 @@ public: class Impl; private: + //! Dummy GPU context object + const DeviceContext& deviceContext_; //! GPU stream CommandStream commandStream_; //! GPU kernel launch config diff --git a/src/gromacs/mdlib/lincs_gpu.cu b/src/gromacs/mdlib/lincs_gpu.cu index edf3e9c58a..03c1bd1d15 100644 --- a/src/gromacs/mdlib/lincs_gpu.cu +++ b/src/gromacs/mdlib/lincs_gpu.cu @@ -507,7 +507,11 @@ void LincsGpu::apply(const float3* d_x, return; } -LincsGpu::LincsGpu(int numIterations, int expansionOrder, CommandStream commandStream) : +LincsGpu::LincsGpu(int numIterations, + int expansionOrder, + const DeviceContext& deviceContext, + CommandStream commandStream) : + deviceContext_(deviceContext), commandStream_(commandStream) { kernelParams_.numIterations = numIterations; @@ -519,7 +523,7 @@ LincsGpu::LincsGpu(int numIterations, int expansionOrder, CommandStream commandS c_threadsPerBlock > 0 && ((c_threadsPerBlock & (c_threadsPerBlock - 1)) == 0), "Number of threads per block should be a power of two in order for reduction to work."); - allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, nullptr); + allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, deviceContext_); h_virialScaled_.resize(6); // The data arrays should be expanded/reallocated on first call of set() function. @@ -911,18 +915,19 @@ void LincsGpu::set(const InteractionDefinitions& idef, const t_mdatoms& md) numConstraintsThreadsAlloc_ = kernelParams_.numConstraintsThreads; - allocateDeviceBuffer(&kernelParams_.d_constraints, kernelParams_.numConstraintsThreads, nullptr); + allocateDeviceBuffer(&kernelParams_.d_constraints, kernelParams_.numConstraintsThreads, + deviceContext_); allocateDeviceBuffer(&kernelParams_.d_constraintsTargetLengths, - kernelParams_.numConstraintsThreads, nullptr); + kernelParams_.numConstraintsThreads, deviceContext_); allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts, - kernelParams_.numConstraintsThreads, nullptr); + kernelParams_.numConstraintsThreads, deviceContext_); allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices, - maxCoupledConstraints * kernelParams_.numConstraintsThreads, nullptr); + maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceContext_); allocateDeviceBuffer(&kernelParams_.d_massFactors, - maxCoupledConstraints * kernelParams_.numConstraintsThreads, nullptr); + maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceContext_); allocateDeviceBuffer(&kernelParams_.d_matrixA, - maxCoupledConstraints * kernelParams_.numConstraintsThreads, nullptr); + maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceContext_); } // (Re)allocate the memory, if the number of atoms has increased. 
@@ -933,7 +938,7 @@ void LincsGpu::set(const InteractionDefinitions& idef, const t_mdatoms& md) freeDeviceBuffer(&kernelParams_.d_inverseMasses); } numAtomsAlloc_ = numAtoms; - allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms, nullptr); + allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms, deviceContext_); } // Copy data to GPU. diff --git a/src/gromacs/mdlib/lincs_gpu.cuh b/src/gromacs/mdlib/lincs_gpu.cuh index 0fbebcf67f..77423dc323 100644 --- a/src/gromacs/mdlib/lincs_gpu.cuh +++ b/src/gromacs/mdlib/lincs_gpu.cuh @@ -44,6 +44,7 @@ #ifndef GMX_MDLIB_LINCS_GPU_CUH #define GMX_MDLIB_LINCS_GPU_CUH +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gputraits.cuh" #include "gromacs/mdlib/constr.h" #include "gromacs/mdtypes/mdatom.h" @@ -103,9 +104,10 @@ public: * * \param[in] numIterations Number of iteration for the correction of the projection. * \param[in] expansionOrder Order of the matrix inversion algorithm. + * \param[in] deviceContext Device context (dummy in CUDA). * \param[in] commandStream Device command stream. */ - LincsGpu(int numIterations, int expansionOrder, CommandStream commandStream); + LincsGpu(int numIterations, int expansionOrder, const DeviceContext& deviceContext, CommandStream commandStream); /*! \brief Destructor.*/ ~LincsGpu(); @@ -167,6 +169,8 @@ public: static bool isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop); private: + //! Dummy GPU context object + const DeviceContext& deviceContext_; //! GPU stream CommandStream commandStream_; diff --git a/src/gromacs/mdlib/settle_gpu.cu b/src/gromacs/mdlib/settle_gpu.cu index d1e8f508d5..20933baf96 100644 --- a/src/gromacs/mdlib/settle_gpu.cu +++ b/src/gromacs/mdlib/settle_gpu.cu @@ -485,7 +485,8 @@ void SettleGpu::apply(const float3* d_x, return; } -SettleGpu::SettleGpu(const gmx_mtop_t& mtop, CommandStream commandStream) : +SettleGpu::SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, CommandStream commandStream) : + deviceContext_(deviceContext), commandStream_(commandStream) { static_assert(sizeof(real) == sizeof(float), @@ -586,7 +587,7 @@ SettleGpu::SettleGpu(const gmx_mtop_t& mtop, CommandStream commandStream) : initSettleParameters(&settleParameters_, mO, mH, dOH, dHH); - allocateDeviceBuffer(&d_virialScaled_, 6, nullptr); + allocateDeviceBuffer(&d_virialScaled_, 6, deviceContext_); h_virialScaled_.resize(6); } @@ -611,7 +612,7 @@ void SettleGpu::set(const InteractionDefinitions& idef, const t_mdatoms gmx_unus ArrayRef iatoms = il_settle.iatoms; numSettles_ = il_settle.size() / nral1; - reallocateDeviceBuffer(&d_atomIds_, numSettles_, &numAtomIds_, &numAtomIdsAlloc_, nullptr); + reallocateDeviceBuffer(&d_atomIds_, numSettles_, &numAtomIds_, &numAtomIdsAlloc_, deviceContext_); h_atomIds_.resize(numSettles_); for (int i = 0; i < numSettles_; i++) { diff --git a/src/gromacs/mdlib/settle_gpu.cuh b/src/gromacs/mdlib/settle_gpu.cuh index 3816579d63..f07af017e3 100644 --- a/src/gromacs/mdlib/settle_gpu.cuh +++ b/src/gromacs/mdlib/settle_gpu.cuh @@ -45,6 +45,7 @@ #include "gmxpre.h" +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gputraits.cuh" #include "gromacs/math/functions.h" #include "gromacs/math/invertmatrix.h" @@ -200,9 +201,10 @@ public: * \param[in] mtop Topology of the system to gen the masses for O and H atoms and * target O-H and H-H distances. These values are also checked for * consistency. + * \param[in] deviceContext Device context (dummy in CUDA). * \param[in] commandStream Device stream to use. 
 */
-    SettleGpu(const gmx_mtop_t& mtop, CommandStream commandStream);
+    SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, CommandStream commandStream);
     ~SettleGpu();
@@ -250,6 +252,8 @@ public:
     void set(const InteractionDefinitions& idef, const t_mdatoms& md);
 private:
+    //! Dummy GPU context object
+    const DeviceContext& deviceContext_;
     //! GPU stream
     CommandStream commandStream_;
diff --git a/src/gromacs/mdlib/tests/constrtestrunners.cu b/src/gromacs/mdlib/tests/constrtestrunners.cu
index 322c3beec9..5c0a007ee4 100644
--- a/src/gromacs/mdlib/tests/constrtestrunners.cu
+++ b/src/gromacs/mdlib/tests/constrtestrunners.cu
@@ -70,8 +70,11 @@ namespace test
  */
 void applyLincsGpu(ConstraintsTestData* testData, t_pbc pbc)
 {
-    auto lincsGpu =
-            std::make_unique<LincsGpu>(testData->ir_.nLincsIter, testData->ir_.nProjOrder, nullptr);
+    DeviceInformation deviceInfo;
+    const DeviceContext deviceContext(deviceInfo);
+
+    auto lincsGpu = std::make_unique<LincsGpu>(testData->ir_.nLincsIter, testData->ir_.nProjOrder,
+                                               deviceContext, nullptr);
     bool updateVelocities = true;
     int numAtoms = testData->numAtoms_;
@@ -81,9 +84,9 @@ void applyLincsGpu(ConstraintsTestData* testData, t_pbc pbc)
     PbcAiuc pbcAiuc;
     setPbcAiuc(pbc.ndim_ePBC, pbc.box, &pbcAiuc);
-    allocateDeviceBuffer(&d_x, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_xp, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_v, numAtoms, nullptr);
+    allocateDeviceBuffer(&d_x, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_xp, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
     copyToDeviceBuffer(&d_x, (float3*)(testData->x_.data()), 0, numAtoms, nullptr,
                        GpuApiCallBehavior::Sync, nullptr);
diff --git a/src/gromacs/mdlib/tests/leapfrogtestrunners.cu b/src/gromacs/mdlib/tests/leapfrogtestrunners.cu
index 1c3afdd1b0..b794149ddb 100644
--- a/src/gromacs/mdlib/tests/leapfrogtestrunners.cu
+++ b/src/gromacs/mdlib/tests/leapfrogtestrunners.cu
@@ -66,6 +66,9 @@ namespace test
 void integrateLeapFrogGpu(LeapFrogTestData* testData, int numSteps)
 {
+    DeviceInformation deviceInfo;
+    const DeviceContext deviceContext(deviceInfo);
+
     int numAtoms = testData->numAtoms_;
     float3* h_x = reinterpret_cast<float3*>(testData->x_.data());
@@ -75,17 +78,17 @@ void integrateLeapFrogGpu(LeapFrogTestData* testData, int numSteps)
     float3 *d_x, *d_xp, *d_v, *d_f;
-    allocateDeviceBuffer(&d_x, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_xp, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_v, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_f, numAtoms, nullptr);
+    allocateDeviceBuffer(&d_x, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_xp, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_f, numAtoms, deviceContext);
     copyToDeviceBuffer(&d_x, h_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
     copyToDeviceBuffer(&d_xp, h_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
     copyToDeviceBuffer(&d_v, h_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
     copyToDeviceBuffer(&d_f, h_f, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
-    auto integrator = std::make_unique<LeapFrogGpu>(nullptr);
+    auto integrator = std::make_unique<LeapFrogGpu>(deviceContext, nullptr);
     integrator->set(testData->mdAtoms_, testData->numTCoupleGroups_, testData->mdAtoms_.cTC);
diff --git a/src/gromacs/mdlib/tests/settletestrunners.cu b/src/gromacs/mdlib/tests/settletestrunners.cu
index a14b47e819..6ebc6688da 100644
--- a/src/gromacs/mdlib/tests/settletestrunners.cu
+++ b/src/gromacs/mdlib/tests/settletestrunners.cu
@@ -86,7 +86,10 @@ void applySettleGpu(SettleTestData* testData,
     // TODO: Here we should check that at least 1 suitable GPU is available
     GMX_RELEASE_ASSERT(canPerformGpuDetection(), "Can't detect CUDA-capable GPUs.");
-    auto settleGpu = std::make_unique<SettleGpu>(testData->mtop_, nullptr);
+    DeviceInformation deviceInfo;
+    const DeviceContext deviceContext(deviceInfo);
+
+    auto settleGpu = std::make_unique<SettleGpu>(testData->mtop_, deviceContext, nullptr);
     settleGpu->set(*testData->idef_, testData->mdatoms_);
     PbcAiuc pbcAiuc;
@@ -100,9 +103,9 @@ void applySettleGpu(SettleTestData* testData,
     float3* h_xp = (float3*)(as_rvec_array(testData->xPrime_.data()));
     float3* h_v = (float3*)(as_rvec_array(testData->v_.data()));
-    allocateDeviceBuffer(&d_x, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_xp, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_v, numAtoms, nullptr);
+    allocateDeviceBuffer(&d_x, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_xp, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
     copyToDeviceBuffer(&d_x, (float3*)h_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
     copyToDeviceBuffer(&d_xp, (float3*)h_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
index c77e1924ed..a8e5a94cc6 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
@@ -175,9 +175,9 @@ UpdateConstrainGpu::Impl::Impl(const t_inputrec& ir,
             : commandStream_ = nullptr;
-    integrator_ = std::make_unique<LeapFrogGpu>(commandStream_);
-    lincsGpu_ = std::make_unique<LincsGpu>(ir.nLincsIter, ir.nProjOrder, commandStream_);
-    settleGpu_ = std::make_unique<SettleGpu>(mtop, commandStream_);
+    integrator_ = std::make_unique<LeapFrogGpu>(deviceContext_, commandStream_);
+    lincsGpu_ = std::make_unique<LincsGpu>(ir.nLincsIter, ir.nProjOrder, deviceContext_, commandStream_);
+    settleGpu_ = std::make_unique<SettleGpu>(mtop, deviceContext_, commandStream_);
     coordinateScalingKernelLaunchConfig_.blockSize[0] = c_threadsPerBlock;
     coordinateScalingKernelLaunchConfig_.blockSize[1] = 1;
@@ -205,10 +205,10 @@ void UpdateConstrainGpu::Impl::set(DeviceBuffer d_x,
     numAtoms_ = md.nr;
-    reallocateDeviceBuffer(&d_xp_, numAtoms_, &numXp_, &numXpAlloc_, nullptr);
+    reallocateDeviceBuffer(&d_xp_, numAtoms_, &numXp_, &numXpAlloc_, deviceContext_);
     reallocateDeviceBuffer(&d_inverseMasses_, numAtoms_, &numInverseMasses_,
-                           &numInverseMassesAlloc_, nullptr);
+                           &numInverseMassesAlloc_, deviceContext_);
     // Integrator should also update something, but it does not even have a method yet
     integrator_->set(md, numTempScaleValues, md.cTC);
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h
index b835c7cf02..75b6814de0 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.h
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h
@@ -163,6 +163,8 @@ public:
     static bool isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop);
 private:
+    //! Dummy GPU context object
+    const DeviceContext deviceContext_;
     //! GPU stream
     CommandStream commandStream_ = nullptr;
     //! GPU kernel launch config
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index 9b31070072..b233b0737c 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -1566,8 +1566,8 @@ int Mdrunner::mdrunner()
             fr->nbv->gpu_nbv != nullptr ?
Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal) : nullptr; - const void* deviceContext = pme_gpu_get_device_context(fr->pmedata); - const int paddingSize = pme_gpu_get_padding_size(fr->pmedata); + const DeviceContext& deviceContext = *pme_gpu_get_device_context(fr->pmedata); + const int paddingSize = pme_gpu_get_padding_size(fr->pmedata); GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator) ? GpuApiCallBehavior::Async : GpuApiCallBehavior::Sync; diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h index d75cd78ea1..034e7eb604 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h @@ -59,6 +59,7 @@ #include "locality.h" +class DeviceContext; class GpuEventSynchronizer; struct gmx_wallcycle; @@ -100,8 +101,6 @@ public: * * \todo Make a \p CommandStream visible in the CPU parts of the code so we * will not have to pass a void*. - * \todo Make a \p DeviceContext object visible in CPU parts of the code so we - * will not have to pass a void*. * * \param[in] pmeStream Device PME stream, nullptr allowed. * \param[in] localStream Device NBNXM local stream, nullptr allowed. @@ -111,13 +110,13 @@ public: * \param[in] paddingSize Padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ - StatePropagatorDataGpu(const void* pmeStream, - const void* localStream, - const void* nonLocalStream, - const void* deviceContext, - GpuApiCallBehavior transferKind, - int paddingSize, - gmx_wallcycle* wcycle); + StatePropagatorDataGpu(const void* pmeStream, + const void* localStream, + const void* nonLocalStream, + const DeviceContext& deviceContext, + GpuApiCallBehavior transferKind, + int paddingSize, + gmx_wallcycle* wcycle); /*! \brief Constructor to use in PME-only rank and in tests. * @@ -135,11 +134,11 @@ public: * \param[in] paddingSize Padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ - StatePropagatorDataGpu(const void* pmeStream, - const void* deviceContext, - GpuApiCallBehavior transferKind, - int paddingSize, - gmx_wallcycle* wcycle); + StatePropagatorDataGpu(const void* pmeStream, + const DeviceContext& deviceContext, + GpuApiCallBehavior transferKind, + int paddingSize, + gmx_wallcycle* wcycle); //! 
Move constructor StatePropagatorDataGpu(StatePropagatorDataGpu&& other) noexcept; diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp index 15f054eafa..1029dd220f 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp @@ -57,7 +57,7 @@ class StatePropagatorDataGpu::Impl StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */, const void* /* localStream */, const void* /* nonLocalStream */, - const void* /* deviceContext */, + const DeviceContext& /* deviceContext */, GpuApiCallBehavior /* transferKind */, int /* paddingSize */, gmx_wallcycle* /* wcycle */) : @@ -66,7 +66,7 @@ StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */ } StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */, - const void* /* deviceContext */, + const DeviceContext& /* deviceContext */, GpuApiCallBehavior /* transferKind */, int /* paddingSize */, gmx_wallcycle* /* wcycle */) : diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h index a138bcbb62..679bf2544a 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h @@ -101,8 +101,6 @@ public: * * \todo Make a \p CommandStream visible in the CPU parts of the code so we * will not have to pass a void*. - * \todo Make a \p DeviceContext object visible in CPU parts of the code so we - * will not have to pass a void*. * * \param[in] pmeStream Device PME stream, nullptr allowed. * \param[in] localStream Device NBNXM local stream, nullptr allowed. @@ -112,13 +110,13 @@ public: * \param[in] paddingSize Padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ - Impl(const void* pmeStream, - const void* localStream, - const void* nonLocalStream, - const void* deviceContext, - GpuApiCallBehavior transferKind, - int paddingSize, - gmx_wallcycle* wcycle); + Impl(const void* pmeStream, + const void* localStream, + const void* nonLocalStream, + const DeviceContext& deviceContext, + GpuApiCallBehavior transferKind, + int paddingSize, + gmx_wallcycle* wcycle); /*! \brief Constructor to use in PME-only rank and in tests. * @@ -136,11 +134,11 @@ public: * \param[in] paddingSize Padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ - Impl(const void* pmeStream, - const void* deviceContext, - GpuApiCallBehavior transferKind, - int paddingSize, - gmx_wallcycle* wcycle); + Impl(const void* pmeStream, + const DeviceContext& deviceContext, + GpuApiCallBehavior transferKind, + int paddingSize, + gmx_wallcycle* wcycle); ~Impl(); @@ -380,10 +378,8 @@ private: //! An array of events that indicate D2H copy of forces is complete (one event for each atom locality) EnumerationArray fReadyOnHost_; - /*! \brief GPU context (for OpenCL builds) - * \todo Make a Context class usable in CPU code - */ - DeviceContext deviceContext_ = nullptr; + //! GPU context (for OpenCL builds) + const DeviceContext& deviceContext_; //! Default GPU calls behavior GpuApiCallBehavior transferKind_ = GpuApiCallBehavior::Async; //! 
Padding size for the coordinates buffer diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp index 4b385a5a7b..d88f469711 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp @@ -50,6 +50,7 @@ # include "gromacs/gpu_utils/cudautils.cuh" # endif # include "gromacs/gpu_utils/devicebuffer.h" +# include "gromacs/gpu_utils/gputraits.h" # if GMX_GPU == GMX_GPU_OPENCL # include "gromacs/gpu_utils/oclutils.h" # endif @@ -64,13 +65,14 @@ namespace gmx { -StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, - const void* localStream, - const void* nonLocalStream, - const void* deviceContext, - GpuApiCallBehavior transferKind, - int paddingSize, - gmx_wallcycle* wcycle) : +StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, + const void* localStream, + const void* nonLocalStream, + const DeviceContext& deviceContext, + GpuApiCallBehavior transferKind, + int paddingSize, + gmx_wallcycle* wcycle) : + deviceContext_(deviceContext), transferKind_(transferKind), paddingSize_(paddingSize), wcycle_(wcycle) @@ -81,13 +83,11 @@ StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, // TODO: Refactor when the StreamManager is introduced. if (GMX_GPU == GMX_GPU_OPENCL) { - GMX_ASSERT(deviceContext != nullptr, "GPU context should be set in OpenCL builds."); GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set in OpenCL builds."); // The update stream is set to the PME stream in OpenCL, since PME stream is the only stream created in the PME context. - pmeStream_ = *static_cast(pmeStream); - updateStream_ = *static_cast(pmeStream); - deviceContext_ = *static_cast(deviceContext); + pmeStream_ = *static_cast(pmeStream); + updateStream_ = *static_cast(pmeStream); GMX_UNUSED_VALUE(localStream); GMX_UNUSED_VALUE(nonLocalStream); } @@ -113,7 +113,6 @@ StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, stat = cudaStreamCreate(&updateStream_); CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu"); # endif - GMX_UNUSED_VALUE(deviceContext); } // Map the atom locality to the stream that will be used for coordinates, @@ -132,11 +131,12 @@ StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, fCopyStreams_[AtomLocality::All] = updateStream_; } -StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, - const void* deviceContext, - GpuApiCallBehavior transferKind, - int paddingSize, - gmx_wallcycle* wcycle) : +StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, + const DeviceContext& deviceContext, + GpuApiCallBehavior transferKind, + int paddingSize, + gmx_wallcycle* wcycle) : + deviceContext_(deviceContext), transferKind_(transferKind), paddingSize_(paddingSize), wcycle_(wcycle) @@ -144,12 +144,6 @@ StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, static_assert(GMX_GPU != GMX_GPU_NONE, "This object should only be constructed on the GPU code-paths."); - if (GMX_GPU == GMX_GPU_OPENCL) - { - GMX_ASSERT(deviceContext != nullptr, "GPU context should be set in OpenCL builds."); - deviceContext_ = *static_cast(deviceContext); - } - GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set."); pmeStream_ = *static_cast(pmeStream); @@ -551,22 +545,22 @@ int StatePropagatorDataGpu::Impl::numAtomsAll() } -StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream, - const void* localStream, - const void* nonLocalStream, - const void* deviceContext, - GpuApiCallBehavior transferKind, - int 
paddingSize, - gmx_wallcycle* wcycle) : +StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream, + const void* localStream, + const void* nonLocalStream, + const DeviceContext& deviceContext, + GpuApiCallBehavior transferKind, + int paddingSize, + gmx_wallcycle* wcycle) : impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize, wcycle)) { } -StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream, - const void* deviceContext, - GpuApiCallBehavior transferKind, - int paddingSize, - gmx_wallcycle* wcycle) : +StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream, + const DeviceContext& deviceContext, + GpuApiCallBehavior transferKind, + int paddingSize, + gmx_wallcycle* wcycle) : impl_(new Impl(pmeStream, deviceContext, transferKind, paddingSize, wcycle)) { } diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index fa290e6473..7467f95b69 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -51,6 +51,7 @@ // TODO Remove this comment when the above order issue is resolved #include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/gpu_utils/pmalloc_cuda.h" @@ -537,21 +538,21 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte iTimers.didPairlistH2D = true; } - DeviceContext context = nullptr; - - reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, context); + reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, + DeviceContext()); copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream, GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); - reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, context); + reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, + DeviceContext()); copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream, GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit, - &d_plist->nimask, &d_plist->imask_nalloc, context); + &d_plist->nimask, &d_plist->imask_nalloc, DeviceContext()); reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, - &d_plist->excl_nalloc, context); + &d_plist->excl_nalloc, DeviceContext()); copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream, GpuApiCallBehavior::Async, bDoTime ? 
iTimers.pl_h2d.fetchNextEvent() : nullptr); @@ -857,9 +858,9 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv const int maxNumColumns = gridSet.numColumnsMax(); reallocateDeviceBuffer(&gpu_nbv->cxy_na, maxNumColumns * gridSet.grids().size(), - &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, nullptr); + &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, DeviceContext()); reallocateDeviceBuffer(&gpu_nbv->cxy_ind, maxNumColumns * gridSet.grids().size(), - &gpu_nbv->ncxy_ind, &gpu_nbv->ncxy_ind_alloc, nullptr); + &gpu_nbv->ncxy_ind, &gpu_nbv->ncxy_ind_alloc, DeviceContext()); for (unsigned int g = 0; g < gridSet.grids().size(); g++) { @@ -873,7 +874,7 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv const int* cxy_ind = grid.cxy_ind().data(); reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize, - &gpu_nbv->atomIndicesSize_alloc, nullptr); + &gpu_nbv->atomIndicesSize_alloc, DeviceContext()); if (atomIndicesSize > 0) { @@ -948,7 +949,8 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int* cell, if (natoms_total > 0) { - reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, nullptr); + reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, + DeviceContext()); copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, stream, GpuApiCallBehavior::Async, nullptr); } diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index 05a270b7f0..013bd093a0 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -70,6 +70,7 @@ #include "thread_mpi/atomic.h" +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gputraits_ocl.h" #include "gromacs/gpu_utils/oclutils.h" #include "gromacs/hardware/hw_info.h" diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index 59ee706c4f..5056924224 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -134,13 +134,14 @@ static void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tabl array_format.image_channel_data_type = CL_FLOAT; array_format.image_channel_order = CL_R; - coul_tab = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, - &array_format, tabsize, 1, 0, ftmp, &cl_error); + coul_tab = clCreateImage2D(runData->deviceContext.context(), CL_MEM_READ_WRITE | + CL_MEM_COPY_HOST_PTR, &array_format, tabsize, 1, 0, ftmp, &cl_error); */ - coul_tab = clCreateBuffer( - runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, - tables.tableF.size() * sizeof(cl_float), const_cast(tables.tableF.data()), &cl_error); + coul_tab = clCreateBuffer(runData->deviceContext.context(), + CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, + tables.tableF.size() * sizeof(cl_float), + const_cast(tables.tableF.data()), &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); @@ -158,23 +159,24 @@ static void init_atomdata_first(cl_atomdata_t* ad, int ntypes, gmx_device_runtim ad->ntypes = ntypes; - ad->shift_vec = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, - SHIFTS * sizeof(nbnxn_atomdata_t::shift_vec[0]), nullptr, &cl_error); + ad->shift_vec = + clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_ONLY | 
CL_MEM_HOST_WRITE_ONLY, + SHIFTS * sizeof(nbnxn_atomdata_t::shift_vec[0]), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); ad->bShiftVecUploaded = CL_FALSE; - ad->fshift = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, + ad->fshift = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, SHIFTS * sizeof(nb_staging_t::fshift[0]), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); - ad->e_lj = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, + ad->e_lj = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, sizeof(float), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); - ad->e_el = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, + ad->e_el = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, sizeof(float), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); @@ -330,11 +332,11 @@ static void init_nbparam(cl_nbparam_t* nbp, array_format.image_channel_data_type = CL_FLOAT; array_format.image_channel_order = CL_R; - nbp->coulomb_tab_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE, - &array_format, 1, 1, 0, nullptr, &cl_error); + nbp->coulomb_tab_climg2d = clCreateImage2D(runData->deviceContext.context(), + CL_MEM_READ_WRITE, &array_format, 1, 1, 0, nullptr, &cl_error); */ - nbp->coulomb_tab_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, + nbp->coulomb_tab_climg2d = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_ONLY, sizeof(cl_float), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); @@ -352,12 +354,13 @@ static void init_nbparam(cl_nbparam_t* nbp, array_format.image_channel_data_type = CL_FLOAT; array_format.image_channel_order = CL_R; - nbp->nbfp_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_ONLY | + nbp->nbfp_climg2d = clCreateImage2D(runData->deviceContext.context(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &array_format, nnbfp, 1, 0, nbat->nbfp, &cl_error); */ nbp->nbfp_climg2d = clCreateBuffer( - runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, + runData->deviceContext.context(), + CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, nnbfp * sizeof(cl_float), const_cast(nbatParams.nbfp.data()), &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); @@ -366,12 +369,13 @@ static void init_nbparam(cl_nbparam_t* nbp, { /* Switched from using textures to using buffers */ // TODO: decide which alternative is most efficient - textures or buffers. 
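For reference, the OpenCL call sites touched above all follow the same accessor pattern: the raw cl_context is no longer read from a bare struct field but obtained through DeviceContext::context(). A minimal sketch of that pattern is shown below; makeReadOnlyBuffer is a hypothetical helper used only for illustration, and the includes are assumed to be the ones already present in nbnxm_ocl_data_mgmt.cpp.

    // Sketch of the new accessor pattern: the cl_context comes from the
    // DeviceContext accessor instead of a raw runData->context field.
    static cl_mem makeReadOnlyBuffer(const DeviceContext& deviceContext, size_t numBytes)
    {
        cl_int cl_error;
        cl_mem buffer = clCreateBuffer(deviceContext.context(), CL_MEM_READ_ONLY, numBytes,
                                       nullptr, &cl_error);
        GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                           ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
        return buffer;
    }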
- /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | + /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->deviceContext.context(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &array_format, nnbfp_comb, 1, 0, nbat->nbfp_comb, &cl_error);*/ - nbp->nbfp_comb_climg2d = clCreateBuffer( - runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, - nnbfp_comb * sizeof(cl_float), const_cast(nbatParams.nbfp_comb.data()), - &cl_error); + nbp->nbfp_comb_climg2d = + clCreateBuffer(runData->deviceContext.context(), + CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, + nnbfp_comb * sizeof(cl_float), + const_cast(nbatParams.nbfp_comb.data()), &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); } @@ -382,9 +386,9 @@ static void init_nbparam(cl_nbparam_t* nbp, // don't accept nullptr values for image2D parameters. /* Switched from using textures to using buffers */ // TODO: decide which alternative is most efficient - textures or buffers. - /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE, - &array_format, 1, 1, 0, nullptr, &cl_error);*/ - nbp->nbfp_comb_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, + /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->deviceContext.context(), + CL_MEM_READ_WRITE, &array_format, 1, 1, 0, nullptr, &cl_error);*/ + nbp->nbfp_comb_climg2d = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_ONLY, sizeof(cl_float), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); @@ -515,7 +519,7 @@ static void nbnxn_gpu_create_context(gmx_device_runtime_data_t* runtimeData, rank, deviceInfo->device_name, cl_error, ocl_get_error_string(cl_error).c_str()); } - runtimeData->context = context; + runtimeData->deviceContext.setContext(context); } /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. 
*/ @@ -636,8 +640,8 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, snew(nb->timings, 1); /* set device info, just point it to the right GPU among the detected ones */ - nb->deviceInfo = deviceInfo; - snew(nb->dev_rundata, 1); + nb->deviceInfo = deviceInfo; + nb->dev_rundata = new gmx_device_runtime_data_t(); /* init nbst */ pmalloc(reinterpret_cast(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj)); @@ -662,8 +666,9 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, nbnxn_gpu_create_context(nb->dev_rundata, nb->deviceInfo, rank); /* local/non-local GPU streams */ - nb->stream[InteractionLocality::Local] = clCreateCommandQueue( - nb->dev_rundata->context, nb->deviceInfo->oclDeviceId, queue_properties, &cl_error); + nb->stream[InteractionLocality::Local] = + clCreateCommandQueue(nb->dev_rundata->deviceContext.context(), + nb->deviceInfo->oclDeviceId, queue_properties, &cl_error); if (CL_SUCCESS != cl_error) { gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d", rank, @@ -674,8 +679,9 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, { init_plist(nb->plist[InteractionLocality::NonLocal]); - nb->stream[InteractionLocality::NonLocal] = clCreateCommandQueue( - nb->dev_rundata->context, nb->deviceInfo->oclDeviceId, queue_properties, &cl_error); + nb->stream[InteractionLocality::NonLocal] = + clCreateCommandQueue(nb->dev_rundata->deviceContext.context(), + nb->deviceInfo->oclDeviceId, queue_properties, &cl_error); if (CL_SUCCESS != cl_error) { gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d", @@ -788,21 +794,23 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte } // TODO most of this function is same in CUDA and OpenCL, move into the header - DeviceContext context = nb->dev_rundata->context; + const DeviceContext& deviceContext = nb->dev_rundata->deviceContext; - reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, context); + reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, + deviceContext); copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream, GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); - reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, context); + reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, + deviceContext); copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream, GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit, - &d_plist->nimask, &d_plist->imask_nalloc, context); + &d_plist->nimask, &d_plist->imask_nalloc, deviceContext); reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, - &d_plist->excl_nalloc, context); + &d_plist->excl_nalloc, deviceContext); copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream, GpuApiCallBehavior::Async, bDoTime ? 
iTimers.pl_h2d.fetchNextEvent() : nullptr); @@ -865,19 +873,21 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) freeDeviceBuffer(&d_atdat->atom_types); } - d_atdat->f = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, + d_atdat->f = clCreateBuffer(nb->dev_rundata->deviceContext.context(), + CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, nalloc * DIM * sizeof(nbat->out[0].f[0]), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); - d_atdat->xq = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, + d_atdat->xq = clCreateBuffer(nb->dev_rundata->deviceContext.context(), + CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, nalloc * sizeof(cl_float4), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); if (useLjCombRule(nb->nbparam->vdwtype)) { - d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->context, + d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->deviceContext.context(), CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, nalloc * sizeof(cl_float2), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, @@ -885,7 +895,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) } else { - d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->context, + d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->deviceContext.context(), CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, nalloc * sizeof(int), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, @@ -969,19 +979,9 @@ static void free_gpu_device_runtime_data(gmx_device_runtime_data_t* runData) return; } - cl_int gmx_unused cl_error; - - if (runData->context) - { - cl_error = clReleaseContext(runData->context); - GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, - ("clReleaseContext failed: " + ocl_get_error_string(cl_error)).c_str()); - runData->context = nullptr; - } - if (runData->program) { - cl_error = clReleaseProgram(runData->program); + cl_int cl_error = clReleaseProgram(runData->program); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clReleaseProgram failed: " + ocl_get_error_string(cl_error)).c_str()); runData->program = nullptr; @@ -1076,7 +1076,7 @@ void gpu_free(NbnxmGpu* nb) } free_gpu_device_runtime_data(nb->dev_rundata); - sfree(nb->dev_rundata); + delete nb->dev_rundata; /* Free timers and timings */ delete nb->timers; diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp index 8a4e217d84..0ba3345780 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp @@ -200,9 +200,10 @@ void nbnxn_gpu_compile_kernels(NbnxmGpu* nb) { /* TODO when we have a proper MPI-aware logging module, the log output here should be written there */ - program = gmx::ocl::compileProgram( - stderr, "gromacs/nbnxm/opencl", "nbnxm_ocl_kernels.cl", extraDefines, - nb->dev_rundata->context, nb->deviceInfo->oclDeviceId, nb->deviceInfo->deviceVendor); + program = + gmx::ocl::compileProgram(stderr, "gromacs/nbnxm/opencl", "nbnxm_ocl_kernels.cl", + extraDefines, nb->dev_rundata->deviceContext.context(), + nb->deviceInfo->oclDeviceId, nb->deviceInfo->deviceVendor); } catch (gmx::GromacsException& e) {
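Taken together, the new calling convention on the CUDA-only code paths and in the tests is roughly the following. This is a minimal sketch using only calls that appear in this change; the numeric arguments are placeholder values, the nullptr stands for the default stream as in the test runners, and the usual GROMACS headers are assumed.

    // Construct a (dummy) context from device information, then pass it by
    // const reference wherever a GPU object or device buffer is created.
    DeviceInformation deviceInfo;
    const DeviceContext deviceContext(deviceInfo);

    float3* d_x = nullptr;
    const int numAtoms = 128; // placeholder value
    allocateDeviceBuffer(&d_x, numAtoms, deviceContext);

    // Placeholder iteration count and expansion order, default stream.
    auto lincsGpu = std::make_unique<LincsGpu>(1, 4, deviceContext, nullptr);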