From 092a8f684682abcc4c397622cc37fd87b2f10b25 Mon Sep 17 00:00:00 2001
From: Artem Zhmurov <zhmurov@gmail.com>
Date: Tue, 3 Sep 2019 14:23:40 +0200
Subject: [PATCH] StatePropagatorDataGpu object to manage GPU forces, positions
 and velocities buffers

In current version the positions and forces on the GPU are managed by different
modules, depending of the offload scenario for a particular run. This makes
management of the buffers complicated and fragile. This commit adds the object
responsible for management of the GPU buffers of coordinates, forces and
velocities. The object is connected to all clients that use coordinates, forces
and velocities buffers, while keeping the existing logic intact where its
possible.

Since the H2D and D2H copies are now done in nullptr stream, some of implicit
synchronization is lost. Consequently this commit does not always work
properly with newly introduced buffer ops / halo exchange features. To avoid
the confusion, GPU buffer ops are disabled by the assertion. There will be
a separate commit with all copies done synchronously, which will work
with the buffer ops. The stream- and event-based synchronization will be
introduced in the follow-up commits.

Refs. #2816.

Change-Id: I2e2ba1b6436f087d1f2fef4ff876445814a724e7
---
 src/gromacs/domdec/domdec.cpp                 |   5 +
 src/gromacs/domdec/domdec.h                   |   3 +
 src/gromacs/domdec/gpuhaloexchange.h          |   6 +-
 src/gromacs/domdec/gpuhaloexchange_impl.cpp   |   4 +-
 src/gromacs/domdec/gpuhaloexchange_impl.cu    |   4 +-
 src/gromacs/ewald/pme.h                       |  30 +-
 src/gromacs/ewald/pme_gpu.cpp                 |  35 +-
 src/gromacs/ewald/pme_gpu_internal.cpp        |  69 +++-
 src/gromacs/ewald/pme_gpu_internal.h          |  24 +-
 src/gromacs/ewald/pme_only.cpp                |  19 +-
 src/gromacs/ewald/tests/pmegathertest.cpp     |   5 +-
 .../ewald/tests/pmesplinespreadtest.cpp       |   5 +-
 src/gromacs/ewald/tests/pmetestcommon.cpp     |  26 +-
 src/gromacs/ewald/tests/pmetestcommon.h       |  16 +-
 src/gromacs/mdlib/sim_util.cpp                |  78 ++--
 src/gromacs/mdlib/update_constrain_cuda.h     |  90 +----
 .../mdlib/update_constrain_cuda_impl.cpp      |  47 +--
 .../mdlib/update_constrain_cuda_impl.cu       | 111 ++---
 .../mdlib/update_constrain_cuda_impl.h        | 127 ++----
 src/gromacs/mdrun/md.cpp                      |  39 +-
 src/gromacs/mdrun/runner.cpp                  |  27 +-
 src/gromacs/mdtypes/CMakeLists.txt            |  26 +-
 src/gromacs/mdtypes/forcerec.h                |   6 +
 .../mdtypes/state_propagator_data_gpu.h       | 225 +++++++++++
 .../state_propagator_data_gpu_impl.cpp        | 156 +++++++
 .../mdtypes/state_propagator_data_gpu_impl.h  | 273 +++++++++++++
 .../state_propagator_data_gpu_impl_gpu.cpp    | 381 ++++++++++++++++++
 src/gromacs/nbnxm/atomdata.cpp                |  39 +-
 src/gromacs/nbnxm/atomdata.h                  |  48 +--
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu          | 201 +--------
 .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu        |   9 -
 src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h     |  12 -
 src/gromacs/nbnxm/nbnxm.cpp                   |  64 +--
 src/gromacs/nbnxm/nbnxm.h                     |  54 +--
 src/gromacs/nbnxm/nbnxm_gpu.h                 |  90 +----
 35 files changed, 1418 insertions(+), 936 deletions(-)
 create mode 100644 src/gromacs/mdtypes/state_propagator_data_gpu.h
 create mode 100644 src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
 create mode 100644 src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
 create mode 100644 src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp

diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp
index 2807a97592..f3365e11a3 100644
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -235,6 +235,11 @@ gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
     return &dd->comm->zones;
 }
 
+int dd_numAtomsZones(const gmx_domdec_t &dd)
+{
+    return dd.comm->atomRanges.end(DDAtomRanges::Type::Zones);
+}
+
 int dd_numHomeAtoms(const gmx_domdec_t &dd)
 {
     return dd.comm->atomRanges.numHomeAtoms();
diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h
index 9405a030ec..97b38a4300 100644
--- a/src/gromacs/domdec/domdec.h
+++ b/src/gromacs/domdec/domdec.h
@@ -110,6 +110,9 @@ void dd_store_state(struct gmx_domdec_t *dd, t_state *state);
 /*! \brief Returns a pointer to the gmx_domdec_zones_t struct */
 struct gmx_domdec_zones_t *domdec_zones(struct gmx_domdec_t *dd);
 
+/*! \brief Returns the range for atoms in zones*/
+int dd_numAtomsZones(const gmx_domdec_t &dd);
+
 /*! \brief Returns the number of home atoms */
 int dd_numHomeAtoms(const gmx_domdec_t &dd);
 
diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h
index 4187e45cc2..f991e05123 100644
--- a/src/gromacs/domdec/gpuhaloexchange.h
+++ b/src/gromacs/domdec/gpuhaloexchange.h
@@ -42,6 +42,7 @@
 #ifndef GMX_DOMDEC_GPUHALOEXCHANGE_H
 #define GMX_DOMDEC_GPUHALOEXCHANGE_H
 
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/math/vectypes.h"
 #include "gromacs/utility/basedefinitions.h"
 #include "gromacs/utility/classhelpers.h"
@@ -95,8 +96,9 @@ class GpuHaloExchange
          * \param [in] d_coordinateBuffer   pointer to coordinates buffer in GPU memory
          * \param [in] d_forcesBuffer   pointer to coordinates buffer in GPU memory
          */
-        void reinitHalo(rvec        *d_coordinateBuffer,
-                        rvec        *d_forcesBuffer);
+        void reinitHalo(DeviceBuffer<float>  d_coordinateBuffer,
+                        DeviceBuffer<float>  d_forcesBuffer);
+
 
         /*! \brief GPU halo exchange of coordinates buffer.
          *
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
index 741c79519c..c4c191b314 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
@@ -73,8 +73,8 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t * /* dd */,
 GpuHaloExchange::~GpuHaloExchange() = default;
 
 /*!\brief init halo exhange stub. */
-void GpuHaloExchange::reinitHalo(rvec * /* d_coordinatesBuffer */,
-                                 rvec * /* d_forcesBuffer */)
+void GpuHaloExchange::reinitHalo(DeviceBuffer<float> /* d_coordinatesBuffer */,
+                                 DeviceBuffer<float> /* d_forcesBuffer */)
 {
     GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
 }
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu
index dcb3bcd839..c195f96cb5 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cu
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu
@@ -438,8 +438,8 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t *dd,
 
 GpuHaloExchange::~GpuHaloExchange() = default;
 
-void GpuHaloExchange::reinitHalo(rvec        *d_coordinatesBuffer,
-                                 rvec        *d_forcesBuffer)
+void GpuHaloExchange::reinitHalo(DeviceBuffer<float>  d_coordinatesBuffer,
+                                 DeviceBuffer<float>  d_forcesBuffer)
 {
     impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer), reinterpret_cast<float3*>(d_forcesBuffer));
 }
diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h
index 6997d4e0eb..c6ec4e6fca 100644
--- a/src/gromacs/ewald/pme.h
+++ b/src/gromacs/ewald/pme.h
@@ -365,24 +365,13 @@ GPU_FUNC_QUALIFIER void pme_gpu_prepare_computation(gmx_pme_t      *GPU_FUNC_ARG
                                                     bool            GPU_FUNC_ARGUMENT(useGpuForceReduction)) GPU_FUNC_TERM;
 
 /*! \brief
- * Launches H2D input transfers for PME on GPU.
- *
- * \param[in] pme               The PME data structure.
- * \param[in] coordinatesHost   The array of local atoms' coordinates.
- * \param[in] wcycle            The wallclock counter.
- */
-GPU_FUNC_QUALIFIER void pme_gpu_copy_coordinates_to_gpu(gmx_pme_t            *GPU_FUNC_ARGUMENT(pme),
-                                                        const rvec           *GPU_FUNC_ARGUMENT(coordinatesHost),
-                                                        gmx_wallcycle        *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
-
-/*! \brief
- * Launches first stage of PME on GPU - spreading kernel, and D2H grid transfer if needed.
+ * Launches first stage of PME on GPU - spreading kernel.
  *
  * \param[in] pme                The PME data structure.
  * \param[in] wcycle             The wallclock counter.
  */
-GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t           *GPU_FUNC_ARGUMENT(pme),
-                                              gmx_wallcycle       *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t      *GPU_FUNC_ARGUMENT(pme),
+                                              gmx_wallcycle  *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
 
 /*! \brief
  * Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.
@@ -476,6 +465,13 @@ GPU_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t *GPU_FUNC_ARG
  */
 GPU_FUNC_QUALIFIER DeviceBuffer<float> pme_gpu_get_device_x(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<float> {});
 
+/*! \brief Set pointer to device copy of coordinate data.
+ * \param[in] pme            The PME data structure.
+ * \param[in] d_x            The pointer to the positions buffer to be set
+ */
+GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t     *GPU_FUNC_ARGUMENT(pme),
+                                             DeviceBuffer<float>  GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
+
 /*! \brief Get pointer to device copy of force data.
  * \param[in] pme            The PME data structure.
  * \returns                  Pointer to force data
@@ -488,6 +484,12 @@ GPU_FUNC_QUALIFIER void *pme_gpu_get_device_f(const gmx_pme_t *GPU_FUNC_ARGUMENT
  */
 GPU_FUNC_QUALIFIER void *pme_gpu_get_device_stream(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
+/*! \brief Returns the pointer to the GPU context.
+ *  \param[in] pme            The PME data structure.
+ *  \returns                  Pointer to GPU context object.
+ */
+GPU_FUNC_QUALIFIER void *pme_gpu_get_device_context(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
+
 /*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion
  * \param[in] pme            The PME data structure.
  * \returns                  Pointer to sychronizer
diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp
index ddbc2f9485..4685913dbd 100644
--- a/src/gromacs/ewald/pme_gpu.cpp
+++ b/src/gromacs/ewald/pme_gpu.cpp
@@ -174,23 +174,6 @@ void pme_gpu_prepare_computation(gmx_pme_t            *pme,
     }
 }
 
-void pme_gpu_copy_coordinates_to_gpu(gmx_pme_t            *pme,
-                                     const rvec           *coordinatesHost,
-                                     gmx_wallcycle        *wcycle)
-{
-    GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
-
-    PmeGpu *pmeGpu = pme->gpu;
-
-    // The only spot of PME GPU where LAUNCH_GPU counter increases call-count
-    wallcycle_start(wcycle, ewcLAUNCH_GPU);
-    // The only spot of PME GPU where ewcsLAUNCH_GPU_PME subcounter increases call-count
-    wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_PME);
-    pme_gpu_copy_input_coordinates(pmeGpu, coordinatesHost);
-    wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
-    wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-}
-
 void pme_gpu_launch_spread(gmx_pme_t            *pme,
                            gmx_wallcycle        *wcycle)
 {
@@ -444,6 +427,15 @@ void *pme_gpu_get_device_f(const gmx_pme_t *pme)
     return pme_gpu_get_kernelparam_forces(pme->gpu);
 }
 
+void pme_gpu_set_device_x(const gmx_pme_t     *pme,
+                          DeviceBuffer<float>  d_x)
+{
+    GMX_ASSERT(pme != nullptr, "Null pointer is passed as a PME to the set coordinates function.");
+    GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
+
+    pme_gpu_set_kernelparam_coordinates(pme->gpu, d_x);
+}
+
 void *pme_gpu_get_device_stream(const gmx_pme_t *pme)
 {
     if (!pme || !pme_gpu_active(pme))
@@ -453,6 +445,15 @@ void *pme_gpu_get_device_stream(const gmx_pme_t *pme)
     return pme_gpu_get_stream(pme->gpu);
 }
 
+void *pme_gpu_get_device_context(const gmx_pme_t *pme)
+{
+    if (!pme || !pme_gpu_active(pme))
+    {
+        return nullptr;
+    }
+    return pme_gpu_get_context(pme->gpu);
+}
+
 GpuEventSynchronizer * pme_gpu_get_f_ready_synchronizer(const gmx_pme_t *pme)
 {
     if (!pme || !pme_gpu_active(pme))
diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp
index c11580aa88..47ec41bb3c 100644
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -233,23 +233,6 @@ void pme_gpu_realloc_coordinates(const PmeGpu *pmeGpu)
     }
 }
 
-void pme_gpu_copy_input_coordinates(const PmeGpu *pmeGpu, const rvec *h_coordinates)
-{
-    GMX_ASSERT(h_coordinates, "Bad host-side coordinate buffer in PME GPU");
-#if GMX_DOUBLE
-    GMX_RELEASE_ASSERT(false, "Only single precision is supported");
-    GMX_UNUSED_VALUE(h_coordinates);
-#else
-    const float *h_coordinatesFloat = reinterpret_cast<const float *>(h_coordinates);
-    copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates, h_coordinatesFloat,
-                       0, pmeGpu->kernelParams->atoms.nAtoms * DIM,
-                       pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
-    // FIXME: sync required since the copied data will be used by PP stream when using single GPU for both
-    //        Remove after adding the required event-based sync between the above H2D and the transform kernel
-    pme_gpu_synchronize(pmeGpu);
-#endif
-}
-
 void pme_gpu_free_coordinates(const PmeGpu *pmeGpu)
 {
     freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates);
@@ -967,7 +950,6 @@ void pme_gpu_destroy(PmeGpu *pmeGpu)
     pme_gpu_free_energy_virial(pmeGpu);
     pme_gpu_free_bspline_values(pmeGpu);
     pme_gpu_free_forces(pmeGpu);
-    pme_gpu_free_coordinates(pmeGpu);
     pme_gpu_free_coefficients(pmeGpu);
     pme_gpu_free_spline_data(pmeGpu);
     pme_gpu_free_grid_indices(pmeGpu);
@@ -1002,7 +984,6 @@ void pme_gpu_reinit_atoms(PmeGpu *pmeGpu, const int nAtoms, const real *charges)
 
     if (haveToRealloc)
     {
-        pme_gpu_realloc_coordinates(pmeGpu);
         pme_gpu_realloc_forces(pmeGpu);
         pme_gpu_realloc_spline_data(pmeGpu);
         pme_gpu_realloc_grid_indices(pmeGpu);
@@ -1317,6 +1298,44 @@ void * pme_gpu_get_kernelparam_forces(const PmeGpu *pmeGpu)
     }
 }
 
+/*! \brief Check the validity of the device buffer.
+ *
+ * Checks if the buffer is not nullptr and, when possible, if it is big enough.
+ *
+ * \todo Split and move this function to gpu_utils.
+ *
+ * \param[in] buffer        Device buffer to be checked.
+ * \param[in] requiredSize  Number of elements that the buffer will have to accommodate.
+ *
+ * \returns If the device buffer can be set.
+ */
+template<typename T>
+static bool checkDeviceBuffer(gmx_unused DeviceBuffer<T> buffer, gmx_unused int requiredSize)
+{
+#if GMX_GPU == GMX_GPU_CUDA
+    GMX_ASSERT(buffer != nullptr, "The device pointer is nullptr");
+    return buffer != nullptr;
+#elif GMX_GPU == GMX_GPU_OPENCL
+    size_t size;
+    int    retval = clGetMemObjectInfo(buffer, CL_MEM_SIZE, sizeof(size), &size, NULL);
+    GMX_ASSERT(retval == CL_SUCCESS, gmx::formatString("clGetMemObjectInfo failed with error code #%d", retval).c_str());
+    GMX_ASSERT(static_cast<int>(size) >= requiredSize, "Number of atoms in device buffer is smaller then required size.");
+    return retval == CL_SUCCESS && static_cast<int>(size) >= requiredSize;
+#elif GMX_GPU == GMX_GPU_NONE
+    GMX_ASSERT(false, "Setter for device-side coordinates was called in non-GPU build.");
+    return false;
+#endif
+}
+
+void pme_gpu_set_kernelparam_coordinates(const PmeGpu *pmeGpu, DeviceBuffer<float> d_x)
+{
+    GMX_ASSERT(pmeGpu && pmeGpu->kernelParams, "PME GPU device buffer can not be set in non-GPU builds or before the GPU PME was initialized.");
+
+    GMX_ASSERT(checkDeviceBuffer(d_x, pmeGpu->kernelParams->atoms.nAtoms), "The device-side buffer can not be set.");
+
+    pmeGpu->kernelParams->atoms.d_coordinates = d_x;
+}
+
 void * pme_gpu_get_stream(const PmeGpu *pmeGpu)
 {
     if (pmeGpu)
@@ -1329,6 +1348,18 @@ void * pme_gpu_get_stream(const PmeGpu *pmeGpu)
     }
 }
 
+void * pme_gpu_get_context(const PmeGpu *pmeGpu)
+{
+    if (pmeGpu)
+    {
+        return static_cast<void *>(&pmeGpu->archSpecific->context);
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
 GpuEventSynchronizer *pme_gpu_get_forces_ready_synchronizer(const PmeGpu *pmeGpu)
 {
     if (pmeGpu && pmeGpu->kernelParams)
diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h
index d6aafdbc85..bdd92bb83b 100644
--- a/src/gromacs/ewald/pme_gpu_internal.h
+++ b/src/gromacs/ewald/pme_gpu_internal.h
@@ -185,17 +185,6 @@ bool pme_gpu_stream_query(const PmeGpu *pmeGpu);
  */
 void pme_gpu_realloc_coordinates(const PmeGpu *pmeGpu);
 
-/*! \libinternal \brief
- * Copies the input coordinates from the CPU buffer onto the GPU.
- *
- * \param[in] pmeGpu            The PME GPU structure.
- * \param[in] h_coordinates     Input coordinates (XYZ rvec array).
- *
- * Needs to be called for every PME computation. The coordinates are then used in the spline calculation.
- */
-GPU_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
-                                                       const rvec   *GPU_FUNC_ARGUMENT(h_coordinates)) GPU_FUNC_TERM;
-
 /*! \libinternal \brief
  * Frees the coordinates on the GPU.
  *
@@ -448,6 +437,13 @@ GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu                *GPU_FUNC_ARGUMENT(
  */
 GPU_FUNC_QUALIFIER DeviceBuffer<float> pme_gpu_get_kernelparam_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<float> {});
 
+/*! \brief Sets the device pointer to coordinate data
+ * \param[in] pmeGpu         The PME GPU structure.
+ * \param[in] d_x            Pointer to coordinate data
+ */
+GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu        *GPU_FUNC_ARGUMENT(pmeGpu),
+                                                            DeviceBuffer<float>  GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
+
 /*! \brief Return pointer to device copy of force data.
  * \param[in] pmeGpu         The PME GPU structure.
  * \returns                  Pointer to force data
@@ -460,6 +456,12 @@ GPU_FUNC_QUALIFIER void * pme_gpu_get_kernelparam_forces(const PmeGpu *GPU_FUNC_
  */
 GPU_FUNC_QUALIFIER void * pme_gpu_get_stream(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
+/*! \brief Return pointer to GPU context (for OpenCL builds).
+ * \param[in] pmeGpu         The PME GPU structure.
+ * \returns                  Pointer to context object.
+ */
+GPU_FUNC_QUALIFIER void * pme_gpu_get_context(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
+
 /*! \brief Return pointer to the sync object triggered after the PME force calculation completion
  * \param[in] pmeGpu         The PME GPU structure.
  * \returns                  Pointer to sync object
diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp
index 1a872bb383..eb81ebb79f 100644
--- a/src/gromacs/ewald/pme_only.cpp
+++ b/src/gromacs/ewald/pme_only.cpp
@@ -84,6 +84,7 @@
 #include "gromacs/mdtypes/commrec.h"
 #include "gromacs/mdtypes/forceoutput.h"
 #include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
 #include "gromacs/timing/cyclecounter.h"
 #include "gromacs/timing/wallcycle.h"
 #include "gromacs/utility/fatalerror.h"
@@ -543,15 +544,21 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
     std::vector<gmx_pme_t *> pmedata;
     pmedata.push_back(pme);
 
-    auto       pme_pp       = gmx_pme_pp_init(cr);
+    auto        pme_pp       = gmx_pme_pp_init(cr);
     //TODO the variable below should be queried from the task assignment info
-    const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed);
+    const bool  useGpuForPme   = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed);
+    const void *commandStream  = useGpuForPme ? pme_gpu_get_device_context(pme) : nullptr;
+    const void *gpuContext     = useGpuForPme ? pme_gpu_get_device_stream(pme) : nullptr;
+    const int   paddingSize    = pme_gpu_get_padding_size(pme);
     if (useGpuForPme)
     {
         changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy());
         changePinningPolicy(&pme_pp->x, pme_get_pinning_policy());
     }
 
+    // Unconditionally initialize the StatePropagatorDataGpu object to get more verbose message if it is used from CPU builds
+    auto stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(commandStream, gpuContext, GpuApiCallBehavior::Sync, paddingSize);
+
     clear_nrnb(mynrnb);
 
     count = 0;
@@ -585,6 +592,11 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
             if (atomSetChanged)
             {
                 gmx_pme_reinit_atoms(pme, natoms, pme_pp->chargeA.data());
+                if (useGpuForPme)
+                {
+                    stateGpu->reinit(natoms, natoms);
+                    pme_gpu_set_device_x(pme, stateGpu->getCoordinates());
+                }
             }
 
             if (ret == pmerecvqxRESETCOUNTERS)
@@ -625,7 +637,8 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
             //TODO this should be set properly by gmx_pme_recv_coeffs_coords,
             // or maybe use inputrecDynamicBox(ir), at the very least - change this when this codepath is tested!
             pme_gpu_prepare_computation(pme, boxChanged, box, wcycle, pmeFlags, useGpuPmeForceReduction);
-            pme_gpu_copy_coordinates_to_gpu(pme, as_rvec_array(pme_pp->x.data()), wcycle);
+            stateGpu->copyCoordinatesToGpu(gmx::ArrayRef<gmx::RVec>(pme_pp->x), gmx::StatePropagatorDataGpu::AtomLocality::All);
+
             pme_gpu_launch_spread(pme, wcycle);
             pme_gpu_launch_complex_transforms(pme, wcycle);
             pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set);
diff --git a/src/gromacs/ewald/tests/pmegathertest.cpp b/src/gromacs/ewald/tests/pmegathertest.cpp
index db72510636..0d4e94db6c 100644
--- a/src/gromacs/ewald/tests/pmegathertest.cpp
+++ b/src/gromacs/ewald/tests/pmegathertest.cpp
@@ -388,12 +388,13 @@ class PmeGatherTest : public ::testing::TestWithParam<GatherInputParameters>
             TestReferenceData refData;
             for (const auto &context : getPmeTestEnv()->getHardwareContexts())
             {
+                std::shared_ptr<StatePropagatorDataGpu> stateGpu;
                 CodePath   codePath       = context->getCodePath();
                 const bool supportedInput = pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath);
                 if (!supportedInput)
                 {
                     /* Testing the failure for the unsupported input */
-                    EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, inputAtomData.coordinates, inputAtomData.charges, box), NotImplementedError);
+                    EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, inputAtomData.coordinates, inputAtomData.charges, box, stateGpu), NotImplementedError);
                     continue;
                 }
 
@@ -408,7 +409,7 @@ class PmeGatherTest : public ::testing::TestWithParam<GatherInputParameters>
                                           ));
 
                 PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, codePath, context->getDeviceInfo(),
-                                                      context->getPmeGpuProgram(), inputAtomData.coordinates, inputAtomData.charges, box);
+                                                      context->getPmeGpuProgram(), inputAtomData.coordinates, inputAtomData.charges, box, stateGpu);
 
                 /* Setting some more inputs */
                 pmeSetRealGrid(pmeSafe.get(), codePath, nonZeroGridValues);
diff --git a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
index eef1b9fe2d..a1c6eb4c1d 100644
--- a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
+++ b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
@@ -122,12 +122,13 @@ class PmeSplineAndSpreadTest : public ::testing::TestWithParam<SplineAndSpreadIn
 
             for (const auto &context : getPmeTestEnv()->getHardwareContexts())
             {
+                std::shared_ptr<StatePropagatorDataGpu> stateGpu;
                 CodePath   codePath       = context->getCodePath();
                 const bool supportedInput = pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath);
                 if (!supportedInput)
                 {
                     /* Testing the failure for the unsupported input */
-                    EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, coordinates, charges, box), NotImplementedError);
+                    EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, coordinates, charges, box, stateGpu), NotImplementedError);
                     continue;
                 }
 
@@ -146,7 +147,7 @@ class PmeSplineAndSpreadTest : public ::testing::TestWithParam<SplineAndSpreadIn
                     /* Running the test */
 
                     PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, codePath, context->getDeviceInfo(),
-                                                          context->getPmeGpuProgram(), coordinates, charges, box);
+                                                          context->getPmeGpuProgram(), coordinates, charges, box, stateGpu);
 
                     const bool     computeSplines = (option.first == PmeSplineAndSpreadOptions::SplineOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
                     const bool     spreadCharges  = (option.first == PmeSplineAndSpreadOptions::SpreadOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp
index d5ce888798..61b28e587a 100644
--- a/src/gromacs/ewald/tests/pmetestcommon.cpp
+++ b/src/gromacs/ewald/tests/pmetestcommon.cpp
@@ -169,13 +169,14 @@ PmeSafePointer pmeInitEmpty(const t_inputrec         *inputRec,
 }
 
 //! PME initialization with atom data
-PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
-                            CodePath                  mode,
-                            const gmx_device_info_t  *gpuInfo,
-                            PmeGpuProgramHandle       pmeGpuProgram,
-                            const CoordinatesVector  &coordinates,
-                            const ChargesVector      &charges,
-                            const Matrix3x3          &box
+PmeSafePointer pmeInitAtoms(const t_inputrec                        *inputRec,
+                            CodePath                                 mode,
+                            const gmx_device_info_t                 *gpuInfo,
+                            PmeGpuProgramHandle                      pmeGpuProgram,
+                            const CoordinatesVector                 &coordinates,
+                            const ChargesVector                     &charges,
+                            const Matrix3x3                         &box,
+                            std::shared_ptr<StatePropagatorDataGpu>  stateGpu
                             )
 {
     const index     atomCount = coordinates.size();
@@ -199,7 +200,16 @@ PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
             // We need to set atc->n for passing the size in the tests
             atc->setNumAtoms(atomCount);
             gmx_pme_reinit_atoms(pmeSafe.get(), atomCount, charges.data());
-            pme_gpu_copy_input_coordinates(pmeSafe->gpu, as_rvec_array(coordinates.data()));
+
+            // TODO: Pin the host buffer and use async memory copies
+            stateGpu = std::make_unique<StatePropagatorDataGpu>(pme_gpu_get_device_stream(pmeSafe.get()),
+                                                                pme_gpu_get_device_context(pmeSafe.get()),
+                                                                GpuApiCallBehavior::Sync,
+                                                                pme_gpu_get_padding_size(pmeSafe.get()));
+            stateGpu->reinit(atomCount, atomCount);
+            stateGpu->copyCoordinatesToGpu(arrayRefFromArray(coordinates.data(), coordinates.size()), gmx::StatePropagatorDataGpu::AtomLocality::All);
+            pme_gpu_set_kernelparam_coordinates(pmeSafe->gpu, stateGpu->getCoordinates());
+
             break;
 
         default:
diff --git a/src/gromacs/ewald/tests/pmetestcommon.h b/src/gromacs/ewald/tests/pmetestcommon.h
index e9290dbae9..b51551c29a 100644
--- a/src/gromacs/ewald/tests/pmetestcommon.h
+++ b/src/gromacs/ewald/tests/pmetestcommon.h
@@ -51,6 +51,7 @@
 #include "gromacs/ewald/pme.h"
 #include "gromacs/ewald/pme_gpu_internal.h"
 #include "gromacs/math/gmxcomplex.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
 #include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/unique_cptr.h"
 
@@ -125,13 +126,14 @@ PmeSafePointer pmeInitEmpty(const t_inputrec *inputRec,
                             const Matrix3x3 &box = {{1.0F, 0.0F, 0.0F, 0.0F, 1.0F, 0.0F, 0.0F, 0.0F, 1.0F}},
                             real ewaldCoeff_q = 0.0F, real ewaldCoeff_lj = 0.0F);
 //! PME initialization with atom data and system box
-PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
-                            CodePath                  mode,
-                            const gmx_device_info_t  *gpuInfo,
-                            PmeGpuProgramHandle       pmeGpuProgram,
-                            const CoordinatesVector  &coordinates,
-                            const ChargesVector      &charges,
-                            const Matrix3x3          &box
+PmeSafePointer pmeInitAtoms(const t_inputrec                        *inputRec,
+                            CodePath                                 mode,
+                            const gmx_device_info_t                 *gpuInfo,
+                            PmeGpuProgramHandle                      pmeGpuProgram,
+                            const CoordinatesVector                 &coordinates,
+                            const ChargesVector                     &charges,
+                            const Matrix3x3                         &box,
+                            std::shared_ptr<StatePropagatorDataGpu>  stateGpu
                             );
 //! PME spline computation and charge spreading
 void pmePerformSplineAndSpread(gmx_pme_t *pme, CodePath mode,
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 16c891532d..a77e5751cf 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -86,6 +86,7 @@
 #include "gromacs/mdtypes/md_enums.h"
 #include "gromacs/mdtypes/simulation_workload.h"
 #include "gromacs/mdtypes/state.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
 #include "gromacs/nbnxm/atomdata.h"
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 #include "gromacs/nbnxm/nbnxm.h"
@@ -601,7 +602,6 @@ computeSpecialForces(FILE                          *fplog,
  *
  * \param[in]  pmedata              The PME structure
  * \param[in]  box                  The box matrix
- * \param[in]  x                    Coordinate array
  * \param[in]  stepWork             Step schedule flags
  * \param[in]  pmeFlags             PME flags
  * \param[in]  useGpuForceReduction True if GPU-based force reduction is active this step
@@ -609,14 +609,12 @@ computeSpecialForces(FILE                          *fplog,
  */
 static inline void launchPmeGpuSpread(gmx_pme_t          *pmedata,
                                       const matrix        box,
-                                      const rvec          x[],
                                       const StepWorkload &stepWork,
                                       int                 pmeFlags,
                                       bool                useGpuForceReduction,
                                       gmx_wallcycle_t     wcycle)
 {
     pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, useGpuForceReduction);
-    pme_gpu_copy_coordinates_to_gpu(pmedata, x, wcycle);
     pme_gpu_launch_spread(pmedata, wcycle);
 }
 
@@ -889,12 +887,13 @@ void do_force(FILE                                     *fplog,
               int                                       legacyFlags,
               const DDBalanceRegionHandler             &ddBalanceRegionHandler)
 {
-    int                  i, j;
-    double               mu[2*DIM];
-    gmx_bool             bFillGrid, bCalcCGCM;
-    gmx_bool             bUseGPU, bUseOrEmulGPU;
-    nonbonded_verlet_t  *nbv = fr->nbv.get();
-    interaction_const_t *ic  = fr->ic;
+    int                          i, j;
+    double                       mu[2*DIM];
+    gmx_bool                     bFillGrid, bCalcCGCM;
+    gmx_bool                     bUseGPU, bUseOrEmulGPU;
+    nonbonded_verlet_t          *nbv      = fr->nbv.get();
+    interaction_const_t         *ic       = fr->ic;
+    gmx::StatePropagatorDataGpu *stateGpu = fr->stateGpu;
 
     // TODO remove the code below when the legacy flags are not in use anymore
     /* modify force flag if not doing nonbonded */
@@ -998,9 +997,27 @@ void do_force(FILE                                     *fplog,
     }
 #endif /* GMX_MPI */
 
+    // Coordinates on the device are needed if PME or BufferOps are offloaded.
+    // The local coordinates can be copied right away.
+    // NOTE: Consider moving this copy to right after they are updated and constrained,
+    //       if the later is not offloaded.
+    if (useGpuPme || useGpuXBufOps == BufferOpsUseGpu::True)
+    {
+        if (stepWork.doNeighborSearch)
+        {
+            stateGpu->reinit(mdatoms->homenr, cr->dd != nullptr ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr);
+            if (useGpuPme)
+            {
+                // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
+                pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
+            }
+        }
+        stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
+    }
+
     if (useGpuPme)
     {
-        launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), stepWork, pmeFlags, useGpuPmeFReduction, wcycle);
+        launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, useGpuPmeFReduction, wcycle);
     }
 
     /* do gridding for pair search */
@@ -1124,14 +1141,8 @@ void do_force(FILE                                     *fplog,
     {
         if (useGpuXBufOps == BufferOpsUseGpu::True)
         {
-            // The condition here was (pme != nullptr && pme_gpu_get_device_x(fr->pmedata) != nullptr)
-            if (!useGpuPme)
-            {
-                nbv->copyCoordinatesToGpu(Nbnxm::AtomLocality::Local, false,
-                                          x.unpaddedArrayRef());
-            }
             nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::Local, false,
-                                       useGpuPme ? pme_gpu_get_device_x(fr->pmedata) : nbv->getDeviceCoordinates());
+                                       stateGpu->getCoordinates());
         }
         else
         {
@@ -1210,9 +1221,7 @@ void do_force(FILE                                     *fplog,
             wallcycle_stop(wcycle, ewcNS);
             if (ddUsesGpuDirectCommunication)
             {
-                rvec* d_x    = static_cast<rvec *> (nbv->get_gpu_xrvec());
-                rvec* d_f    = static_cast<rvec *> (nbv->get_gpu_frvec());
-                gpuHaloExchange->reinitHalo(d_x, d_f);
+                gpuHaloExchange->reinitHalo(stateGpu->getCoordinates(), stateGpu->getForces());
             }
         }
         else
@@ -1226,7 +1235,7 @@ void do_force(FILE                                     *fplog,
                 if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
                 {
                     //non-local part of coordinate buffer must be copied back to host for CPU work
-                    nbv->launch_copy_x_from_gpu(as_rvec_array(x.unpaddedArrayRef().data()), Nbnxm::AtomLocality::NonLocal);
+                    stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
                 }
             }
             else
@@ -1239,11 +1248,10 @@ void do_force(FILE                                     *fplog,
                 // The condition here was (pme != nullptr && pme_gpu_get_device_x(fr->pmedata) != nullptr)
                 if (!useGpuPme && !ddUsesGpuDirectCommunication)
                 {
-                    nbv->copyCoordinatesToGpu(Nbnxm::AtomLocality::NonLocal, false,
-                                              x.unpaddedArrayRef());
+                    stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
                 }
                 nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::NonLocal, false,
-                                           useGpuPme ? pme_gpu_get_device_x(fr->pmedata) : nbv->getDeviceCoordinates());
+                                           stateGpu->getCoordinates());
             }
             else
             {
@@ -1494,17 +1502,16 @@ void do_force(FILE                                     *fplog,
                 // which are a dependency for the GPU force reduction.
                 bool  haveNonLocalForceContribInCpuBuffer = domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork;
 
-                rvec *f = as_rvec_array(forceWithShiftForces.force().data());
                 if (haveNonLocalForceContribInCpuBuffer)
                 {
-                    nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::NonLocal);
+                    stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
                 }
                 nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::NonLocal,
-                                                  nbv->getDeviceForces(),
+                                                  stateGpu->getForces(),
                                                   pme_gpu_get_device_f(fr->pmedata),
                                                   pme_gpu_get_f_ready_synchronizer(fr->pmedata),
                                                   useGpuPmeFReduction, haveNonLocalForceContribInCpuBuffer);
-                nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::NonLocal);
+                stateGpu->copyForcesFromGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
             }
             else
             {
@@ -1538,17 +1545,14 @@ void do_force(FILE                                     *fplog,
 
         if (stepWork.computeForces)
         {
-            gmx::ArrayRef<gmx::RVec>  force  = forceOut.forceWithShiftForces().force();
-            rvec                     *f      = as_rvec_array(force.data());
 
             if (useGpuForcesHaloExchange)
             {
                 if (haveCpuLocalForces)
                 {
-                    nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::Local);
+                    stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
                 }
-                bool accumulateHaloForces = haveCpuLocalForces;
-                gpuHaloExchange->communicateHaloForces(accumulateHaloForces);
+                gpuHaloExchange->communicateHaloForces(haveCpuLocalForces);
             }
             else
             {
@@ -1643,10 +1647,9 @@ void do_force(FILE                                     *fplog,
             // - copy is not perfomed if GPU force halo exchange is active, because it would overwrite the result
             //   of the halo exchange. In that case the copy is instead performed above, before the exchange.
             //   These should be unified.
-            rvec *f = as_rvec_array(forceWithShift.data());
             if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange)
             {
-                nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::Local);
+                stateGpu->copyForcesToGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local);
             }
             if (useGpuForcesHaloExchange)
             {
@@ -1658,12 +1661,13 @@ void do_force(FILE                                     *fplog,
                 nbv->stream_local_wait_for_nonlocal();
             }
             nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::Local,
-                                              nbv->getDeviceForces(),
+                                              stateGpu->getForces(),
                                               pme_gpu_get_device_f(fr->pmedata),
                                               pme_gpu_get_f_ready_synchronizer(fr->pmedata),
                                               useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
-            nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::Local);
+            // This function call synchronizes the local stream
             nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
+            stateGpu->copyForcesFromGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local);
         }
         else
         {
diff --git a/src/gromacs/mdlib/update_constrain_cuda.h b/src/gromacs/mdlib/update_constrain_cuda.h
index 46b173ddab..714ad7e824 100644
--- a/src/gromacs/mdlib/update_constrain_cuda.h
+++ b/src/gromacs/mdlib/update_constrain_cuda.h
@@ -48,6 +48,7 @@
 #ifndef GMX_MDLIB_UPDATE_CONSTRAIN_CUDA_H
 #define GMX_MDLIB_UPDATE_CONSTRAIN_CUDA_H
 
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/mdtypes/group.h"
 #include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/classhelpers.h"
@@ -84,7 +85,7 @@ class UpdateConstrainCuda
          * This will extract temperature scaling factors from tcstat, transform them into the plain
          * array and call the normal integrate method.
          *
-         * \param[in]  dt                     Timestep
+         * \param[in]  dt                     Timestep.
          * \param[in]  updateVelocities       If the velocities should be constrained.
          * \param[in]  computeVirial          If virial should be updated.
          * \param[out] virial                 Place to save virial tensor.
@@ -104,16 +105,21 @@ class UpdateConstrainCuda
                        float                             dtPressureCouple,
                        const matrix                      velocityScalingMatrix);
 
-        /*! \brief
-         * Update data-structures (e.g. after NB search step).
+        /*! \brief Set the pointers and update data-structures (e.g. after NB search step).
          *
-         * \param[in] idef                 System topology
-         * \param[in] md                   Atoms data.
-         * \param[in] numTempScaleValues   Number of temperature scaling groups. Zero for no temperature scaling.
+         * \param[in,out]  d_x                 Device buffer with coordinates.
+         * \param[in,out]  d_v                 Device buffer with velocities.
+         * \param[in]      d_f                 Device buffer with forces.
+         * \param[in]      idef                System topology
+         * \param[in]      md                  Atoms data.
+         * \param[in]      numTempScaleValues  Number of temperature scaling groups. Zero for no temperature scaling.
          */
-        void set(const t_idef    &idef,
-                 const t_mdatoms &md,
-                 int              numTempScaleValues);
+        void set(DeviceBuffer<float>  d_x,
+                 DeviceBuffer<float>  d_v,
+                 DeviceBuffer<float>  d_f,
+                 const t_idef        &idef,
+                 const t_mdatoms     &md,
+                 int                  numTempScaleValues);
 
         /*! \brief
          * Update PBC data.
@@ -124,72 +130,6 @@ class UpdateConstrainCuda
          */
         void setPbc(const t_pbc *pbc);
 
-        /*! \brief
-         * Copy coordinates from CPU to GPU.
-         *
-         * The data are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_x  CPU pointer where coordinates should be copied from.
-         */
-        void copyCoordinatesToGpu(const rvec *h_x);
-
-        /*! \brief
-         * Copy velocities from CPU to GPU.
-         *
-         * The data are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_v  CPU pointer where velocities should be copied from.
-         */
-        void copyVelocitiesToGpu(const rvec *h_v);
-
-        /*! \brief
-         * Copy forces from CPU to GPU.
-         *
-         * The data are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_f  CPU pointer where forces should be copied from.
-         */
-        void copyForcesToGpu(const rvec *h_f);
-
-        /*! \brief
-         * Copy coordinates from GPU to CPU.
-         *
-         * The data are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[out] h_xp CPU pointer where coordinates should be copied to.
-         */
-        void copyCoordinatesFromGpu(rvec *h_xp);
-
-        /*! \brief
-         * Copy velocities from GPU to CPU.
-         *
-         * The velocities are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_v  Pointer to velocities data.
-         */
-        void copyVelocitiesFromGpu(rvec *h_v);
-
-        /*! \brief
-         * Copy forces from GPU to CPU.
-         *
-         * The forces are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_f  Pointer to forces data.
-         */
-        void copyForcesFromGpu(rvec *h_f);
-
-        /*! \brief
-         * Set the internal GPU-memory d_x, d_xp and d_v pointers.
-         *
-         * Data is not copied. The data are assumed to be in float3/fvec format
-         * (float3 is used internally, but the data layout should be identical).
-         *
-         * \param[in] d_x  Pointer to the coordinates for the input (on GPU)
-         * \param[in] d_xp Pointer to the coordinates for the output (on GPU)
-         * \param[in] d_v  Pointer to the velocities (on GPU)
-         * \param[in] d_f  Pointer to the forces (on GPU)
-         */
-        void setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f);
 
     private:
         class Impl;
diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.cpp b/src/gromacs/mdlib/update_constrain_cuda_impl.cpp
index 075ca163d5..b5823ed97f 100644
--- a/src/gromacs/mdlib/update_constrain_cuda_impl.cpp
+++ b/src/gromacs/mdlib/update_constrain_cuda_impl.cpp
@@ -78,9 +78,12 @@ void UpdateConstrainCuda::integrate(gmx_unused const real
     GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
 }
 
-void UpdateConstrainCuda::set(gmx_unused const t_idef     &idef,
-                              gmx_unused const t_mdatoms  &md,
-                              gmx_unused const int         numTempScaleValues)
+void UpdateConstrainCuda::set(gmx_unused DeviceBuffer<float>        d_x,
+                              gmx_unused DeviceBuffer<float>        d_v,
+                              gmx_unused const DeviceBuffer<float>  d_f,
+                              gmx_unused const t_idef              &idef,
+                              gmx_unused const t_mdatoms           &md,
+                              gmx_unused const int                  numTempScaleValues)
 {
     GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
 }
@@ -90,44 +93,6 @@ void UpdateConstrainCuda::setPbc(gmx_unused const t_pbc *pbc)
     GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
 }
 
-void UpdateConstrainCuda::copyCoordinatesToGpu(gmx_unused const rvec *h_x)
-{
-    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
-void UpdateConstrainCuda::copyVelocitiesToGpu(gmx_unused const rvec *h_v)
-{
-    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
-void UpdateConstrainCuda::copyForcesToGpu(gmx_unused const rvec *h_f)
-{
-    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
-void UpdateConstrainCuda::copyCoordinatesFromGpu(gmx_unused rvec *h_xp)
-{
-    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
-void UpdateConstrainCuda::copyVelocitiesFromGpu(gmx_unused rvec *h_v)
-{
-    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
-void UpdateConstrainCuda::copyForcesFromGpu(gmx_unused rvec *h_f)
-{
-    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
-void UpdateConstrainCuda::setXVFPointers(gmx_unused rvec *d_x,
-                                         gmx_unused rvec *d_xp,
-                                         gmx_unused rvec *d_v,
-                                         gmx_unused rvec *d_f)
-{
-    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
 }      // namespace gmx
 
 #endif /* GMX_GPU != GMX_GPU_CUDA */
diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.cu b/src/gromacs/mdlib/update_constrain_cuda_impl.cu
index f373d27911..9ba9626738 100644
--- a/src/gromacs/mdlib/update_constrain_cuda_impl.cu
+++ b/src/gromacs/mdlib/update_constrain_cuda_impl.cu
@@ -105,6 +105,9 @@ void UpdateConstrainCuda::Impl::integrate(const real                        dt,
         }
     }
 
+    // TODO: This should be eliminated
+    cudaMemcpy(d_x_, d_xp_, numAtoms_*sizeof(float3), cudaMemcpyDeviceToDevice);
+
     return;
 }
 
@@ -124,16 +127,24 @@ UpdateConstrainCuda::Impl::~Impl()
 {
 }
 
-void UpdateConstrainCuda::Impl::set(const t_idef    &idef,
-                                    const t_mdatoms &md,
-                                    const int        numTempScaleValues)
+void UpdateConstrainCuda::Impl::set(DeviceBuffer<float>        d_x,
+                                    DeviceBuffer<float>        d_v,
+                                    const DeviceBuffer<float>  d_f,
+                                    const t_idef              &idef,
+                                    const t_mdatoms           &md,
+                                    const int                  numTempScaleValues)
 {
+    GMX_ASSERT(d_x != nullptr, "Coordinates device buffer should not be null.");
+    GMX_ASSERT(d_v != nullptr, "Velocities device buffer should not be null.");
+    GMX_ASSERT(d_f != nullptr, "Forces device buffer should not be null.");
+
+    d_x_ = reinterpret_cast<float3*>(d_x);
+    d_v_ = reinterpret_cast<float3*>(d_v);
+    d_f_ = reinterpret_cast<float3*>(d_f);
+
     numAtoms_ = md.nr;
 
-    reallocateDeviceBuffer(&d_x_,  numAtoms_, &numX_,  &numXAlloc_,  nullptr);
     reallocateDeviceBuffer(&d_xp_, numAtoms_, &numXp_, &numXpAlloc_, nullptr);
-    reallocateDeviceBuffer(&d_v_,  numAtoms_, &numV_,  &numVAlloc_,  nullptr);
-    reallocateDeviceBuffer(&d_f_,  numAtoms_, &numF_,  &numFAlloc_,  nullptr);
 
     reallocateDeviceBuffer(&d_inverseMasses_, numAtoms_,
                            &numInverseMasses_, &numInverseMassesAlloc_, nullptr);
@@ -152,44 +163,6 @@ void UpdateConstrainCuda::Impl::setPbc(const t_pbc *pbc)
     settleCuda_->setPbc(pbc);
 }
 
-void UpdateConstrainCuda::Impl::copyCoordinatesToGpu(const rvec *h_x)
-{
-    copyToDeviceBuffer(&d_x_, (float3*)h_x, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr);
-}
-
-void UpdateConstrainCuda::Impl::copyVelocitiesToGpu(const rvec *h_v)
-{
-    copyToDeviceBuffer(&d_v_, (float3*)h_v, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr);
-}
-
-void UpdateConstrainCuda::Impl::copyForcesToGpu(const rvec *h_f)
-{
-    copyToDeviceBuffer(&d_f_, (float3*)h_f, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr);
-}
-
-void UpdateConstrainCuda::Impl::copyCoordinatesFromGpu(rvec *h_xp)
-{
-    copyFromDeviceBuffer((float3*)h_xp, &d_xp_, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr);
-}
-
-void UpdateConstrainCuda::Impl::copyVelocitiesFromGpu(rvec *h_v)
-{
-    copyFromDeviceBuffer((float3*)h_v, &d_v_, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr);
-}
-
-void UpdateConstrainCuda::Impl::copyForcesFromGpu(rvec *h_f)
-{
-    copyFromDeviceBuffer((float3*)h_f, &d_f_, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr);
-}
-
-void UpdateConstrainCuda::Impl::setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f)
-{
-    d_x_  = (float3*)d_x;
-    d_xp_ = (float3*)d_xp;
-    d_v_  = (float3*)d_v;
-    d_f_  = (float3*)d_f;
-}
-
 UpdateConstrainCuda::UpdateConstrainCuda(const t_inputrec  &ir,
                                          const gmx_mtop_t  &mtop,
                                          const void        *commandStream)
@@ -207,18 +180,21 @@ void UpdateConstrainCuda::integrate(const real                        dt,
                                     gmx::ArrayRef<const t_grp_tcstat> tcstat,
                                     const bool                        doPressureCouple,
                                     const float                       dtPressureCouple,
-                                    const matrix                      pRVScalingMatrix)
+                                    const matrix                      velocityScalingMatrix)
 {
     impl_->integrate(dt, updateVelocities, computeVirial, virialScaled,
                      doTempCouple, tcstat,
-                     doPressureCouple, dtPressureCouple, pRVScalingMatrix);
+                     doPressureCouple, dtPressureCouple, velocityScalingMatrix);
 }
 
-void UpdateConstrainCuda::set(const t_idef    &idef,
-                              const t_mdatoms &md,
-                              const int        numTempScaleValues)
+void UpdateConstrainCuda::set(DeviceBuffer<float>        d_x,
+                              DeviceBuffer<float>        d_v,
+                              const DeviceBuffer<float>  d_f,
+                              const t_idef              &idef,
+                              const t_mdatoms           &md,
+                              const int                  numTempScaleValues)
 {
-    impl_->set(idef, md, numTempScaleValues);
+    impl_->set(d_x, d_v, d_f, idef, md, numTempScaleValues);
 }
 
 void UpdateConstrainCuda::setPbc(const t_pbc *pbc)
@@ -226,39 +202,4 @@ void UpdateConstrainCuda::setPbc(const t_pbc *pbc)
     impl_->setPbc(pbc);
 }
 
-void UpdateConstrainCuda::copyCoordinatesToGpu(const rvec *h_x)
-{
-    impl_->copyCoordinatesToGpu(h_x);
-}
-
-void UpdateConstrainCuda::copyVelocitiesToGpu(const rvec *h_v)
-{
-    impl_->copyVelocitiesToGpu(h_v);
-}
-
-void UpdateConstrainCuda::copyForcesToGpu(const rvec *h_f)
-{
-    impl_->copyForcesToGpu(h_f);
-}
-
-void UpdateConstrainCuda::copyCoordinatesFromGpu(rvec *h_xp)
-{
-    impl_->copyCoordinatesFromGpu(h_xp);
-}
-
-void UpdateConstrainCuda::copyVelocitiesFromGpu(rvec *h_v)
-{
-    impl_->copyVelocitiesFromGpu(h_v);
-}
-
-void UpdateConstrainCuda::copyForcesFromGpu(rvec *h_f)
-{
-    impl_->copyForcesFromGpu(h_f);
-}
-
-void UpdateConstrainCuda::setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f)
-{
-    impl_->setXVFPointers(d_x, d_xp, d_v, d_f);
-}
-
 } //namespace gmx
diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.h b/src/gromacs/mdlib/update_constrain_cuda_impl.h
index b2f4c0eb5e..652dd84eb6 100644
--- a/src/gromacs/mdlib/update_constrain_cuda_impl.h
+++ b/src/gromacs/mdlib/update_constrain_cuda_impl.h
@@ -87,7 +87,7 @@ class UpdateConstrainCuda::Impl
          *   2. This is the temperature coupling step.
          * Parameters virial/lambdas can be nullptr if computeVirial/doTempCouple are false.
          *
-         * \param[in]  dt                     Timestep
+         * \param[in]  dt                     Timestep.
          * \param[in]  updateVelocities       If the velocities should be constrained.
          * \param[in]  computeVirial          If virial should be updated.
          * \param[out] virial                 Place to save virial tensor.
@@ -97,26 +97,31 @@ class UpdateConstrainCuda::Impl
          * \param[in]  dtPressureCouple       Period between pressure coupling steps
          * \param[in]  velocityScalingMatrix  Parrinello-Rahman velocity scaling matrix
          */
-        void integrate(const real                        dt,
-                       const bool                        updateVelocities,
-                       const bool                        computeVirial,
+        void integrate(real                              dt,
+                       bool                              updateVelocities,
+                       bool                              computeVirial,
                        tensor                            virial,
-                       const bool                        doTempCouple,
+                       bool                              doTempCouple,
                        gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                       const bool                        doPressureCouple,
-                       const float                       dtPressureCouple,
+                       bool                              doPressureCouple,
+                       float                             dtPressureCouple,
                        const matrix                      velocityScalingMatrix);
 
-        /*! \brief
-         * Update data-structures (e.g. after NB search step).
+        /*! \brief Set the pointers and update data-structures (e.g. after NB search step).
          *
-         * \param[in] idef                 System topology
-         * \param[in] md                   Atoms data.
-         * \param[in] numTempScaleValues   Number of temperature scaling groups. Set zero for no temperature coupling.
+         * \param[in,out]  d_x            Device buffer with coordinates.
+         * \param[in,out]  d_v            Device buffer with velocities.
+         * \param[in]      d_f            Device buffer with forces.
+         * \param[in] idef                System topology
+         * \param[in] md                  Atoms data.
+         * \param[in] numTempScaleValues  Number of temperature scaling groups. Set zero for no temperature coupling.
          */
-        void set(const t_idef    &idef,
-                 const t_mdatoms &md,
-                 const int        numTempScaleValues);
+        void set(DeviceBuffer<float>        d_x,
+                 DeviceBuffer<float>        d_v,
+                 const DeviceBuffer<float>  d_f,
+                 const t_idef              &idef,
+                 const t_mdatoms           &md,
+                 const int                  numTempScaleValues);
 
         /*! \brief
          * Update PBC data.
@@ -127,73 +132,6 @@ class UpdateConstrainCuda::Impl
          */
         void setPbc(const t_pbc *pbc);
 
-        /*! \brief
-         * Copy coordinates from CPU to GPU.
-         *
-         * The data are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_x  CPU pointer where coordinates should be copied from.
-         */
-        void copyCoordinatesToGpu(const rvec *h_x);
-
-        /*! \brief
-         * Copy velocities from CPU to GPU.
-         *
-         * The data are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_v  CPU pointer where velocities should be copied from.
-         */
-        void copyVelocitiesToGpu(const rvec *h_v);
-
-        /*! \brief
-         * Copy forces from CPU to GPU.
-         *
-         * The data are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_f  CPU pointer where forces should be copied from.
-         */
-        void copyForcesToGpu(const rvec *h_f);
-
-        /*! \brief
-         * Copy coordinates from GPU to CPU.
-         *
-         * The data are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[out] h_xp CPU pointer where coordinates should be copied to.
-         */
-        void copyCoordinatesFromGpu(rvec *h_xp);
-
-        /*! \brief
-         * Copy velocities from GPU to CPU.
-         *
-         * The velocities are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_v  Pointer to velocities data.
-         */
-        void copyVelocitiesFromGpu(rvec *h_v);
-
-        /*! \brief
-         * Copy forces from GPU to CPU.
-         *
-         * The forces are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_f  Pointer to forces data.
-         */
-        void copyForcesFromGpu(rvec *h_f);
-
-        /*! \brief
-         * Set the internal GPU-memory x, xprime and v pointers.
-         *
-         * Data is not copied. The data are assumed to be in float3/fvec format
-         * (float3 is used internally, but the data layout should be identical).
-         *
-         * \param[in] d_x   Pointer to the coordinates for the input (on GPU)
-         * \param[in] d_xp  Pointer to the coordinates for the output (on GPU)
-         * \param[in] d_v   Pointer to the velocities (on GPU)
-         * \param[in] d_f   Pointer to the forces (on GPU)
-         */
-        void setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f);
-
     private:
 
         //! CUDA stream
@@ -205,33 +143,20 @@ class UpdateConstrainCuda::Impl
         //! Number of atoms
         int                 numAtoms_;
 
-        //! Coordinates before the timestep (on GPU)
+        //! Local copy of the pointer to the device positions buffer
         float3             *d_x_;
-        //! Number of elements in coordinates buffer
-        int                 numX_                  = -1;
-        //! Allocation size for the coordinates buffer
-        int                 numXAlloc_             = -1;
+        //! Local copy of the pointer to the device velocities buffer
+        float3             *d_v_;
+        //! Local copy of the pointer to the device forces buffer
+        float3             *d_f_;
 
-        //! Coordinates after the timestep (on GPU).
+        //! Device buffer for intermediate positions (maintained internally)
         float3             *d_xp_;
         //! Number of elements in shifted coordinates buffer
         int                 numXp_                 = -1;
         //! Allocation size for the shifted coordinates buffer
         int                 numXpAlloc_            = -1;
 
-        //! Velocities of atoms (on GPU)
-        float3             *d_v_;
-        //! Number of elements in velocities buffer
-        int                 numV_                  = -1;
-        //! Allocation size for the velocities buffer
-        int                 numVAlloc_             = -1;
-
-        //! Forces, exerted by atoms (on GPU)
-        float3             *d_f_;
-        //! Number of elements in forces buffer
-        int                 numF_                  = -1;
-        //! Allocation size for the forces buffer
-        int                 numFAlloc_             = -1;
 
         //! 1/mass for all atoms (GPU)
         real               *d_inverseMasses_;
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp
index 370cc51175..99f2a4f0de 100644
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -117,7 +117,9 @@
 #include "gromacs/mdtypes/observableshistory.h"
 #include "gromacs/mdtypes/pullhistory.h"
 #include "gromacs/mdtypes/state.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
 #include "gromacs/modularsimulator/energyelement.h"
+#include "gromacs/nbnxm/gpu_data_mgmt.h"
 #include "gromacs/nbnxm/nbnxm.h"
 #include "gromacs/pbcutil/mshift.h"
 #include "gromacs/pbcutil/pbc.h"
@@ -315,8 +317,15 @@ void gmx::LegacySimulator::do_md()
         upd.setNumAtoms(state->natoms);
     }
 
+/*****************************************************************************************/
+// TODO: The following block of code should be refactored, once:
+//       1. We have the useGpuForBufferOps variable set and available here and in do_force(...)
+//       2. The proper GPU syncronization is introduced, so that the H2D and D2H data copies can be performed in the separate
+//          stream owned by the StatePropagatorDataGpu
     bool useGpuForPme       = (fr->pmedata != nullptr) && (pme_run_mode(fr->pmedata) != PmeRunMode::CPU);
     bool useGpuForNonbonded = fr->nbv->useGpu();
+    // Temporary solution to make sure that the buffer ops are offloaded when update is offloaded
+    bool useGpuForBufferOps   = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
 
     if (useGpuForUpdate)
     {
@@ -346,10 +355,19 @@ void gmx::LegacySimulator::do_md()
         integrator = std::make_unique<UpdateConstrainCuda>(*ir, *top_global, nullptr);
     }
 
-    if (fr->nbv->useGpu())
+    if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
     {
-        changePinningPolicy(&state->x, gmx::PinningPolicy::PinnedIfSupported);
+        changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
     }
+    if ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
+    {
+        changePinningPolicy(&f, PinningPolicy::PinnedIfSupported);
+    }
+    if (useGpuForUpdate)
+    {
+        changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
+    }
+/*****************************************************************************************/
 
     // NOTE: The global state is no longer used at this point.
     // But state_global is still used as temporary storage space for writing
@@ -1200,16 +1218,19 @@ void gmx::LegacySimulator::do_md()
 
         if (useGpuForUpdate)
         {
+            StatePropagatorDataGpu *stateGpu = fr->stateGpu;
             if (bNS)
             {
-                integrator->set(top.idef, *mdatoms, ekind->ngtc);
+                integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(), stateGpu->getForces(),
+                                top.idef, *mdatoms, ekind->ngtc);
                 t_pbc pbc;
                 set_pbc(&pbc, epbcXYZ, state->box);
                 integrator->setPbc(&pbc);
             }
-            integrator->copyCoordinatesToGpu(state->x.rvec_array());
-            integrator->copyVelocitiesToGpu(state->v.rvec_array());
-            integrator->copyForcesToGpu(as_rvec_array(f.data()));
+
+            stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
+            stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::All);
+            stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), StatePropagatorDataGpu::AtomLocality::All);
 
             bool doTempCouple     = (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
             bool doPressureCouple = (ir->epc == epcPARRINELLORAHMAN && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
@@ -1218,9 +1239,9 @@ void gmx::LegacySimulator::do_md()
             integrator->integrate(ir->delta_t, true, bCalcVir, shake_vir,
                                   doTempCouple, ekind->tcstat,
                                   doPressureCouple, ir->nstpcouple*ir->delta_t, M);
-
-            integrator->copyCoordinatesFromGpu(state->x.rvec_array());
-            integrator->copyVelocitiesFromGpu(state->v.rvec_array());
+            stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
+            stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::All);
+            stateGpu->synchronizeStream();
         }
         else
         {
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index 1793b91d54..7bf64f9185 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -115,6 +115,7 @@
 #include "gromacs/mdtypes/observableshistory.h"
 #include "gromacs/mdtypes/simulation_workload.h"
 #include "gromacs/mdtypes/state.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 #include "gromacs/nbnxm/nbnxm.h"
 #include "gromacs/nbnxm/pairlist_tuning.h"
@@ -1501,6 +1502,27 @@ int Mdrunner::mdrunner()
                                                          fcd->orires.nr != 0,
                                                          fcd->disres.nsystems != 0);
 
+        const void *commandStream = ((GMX_GPU == GMX_GPU_OPENCL) && thisRankHasPmeGpuTask) ? pme_gpu_get_device_stream(fr->pmedata) : nullptr;
+        const void *gpuContext    = ((GMX_GPU == GMX_GPU_OPENCL) && thisRankHasPmeGpuTask) ? pme_gpu_get_device_context(fr->pmedata) : nullptr;
+        const int   paddingSize   = pme_gpu_get_padding_size(fr->pmedata);
+
+        const bool  inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible(
+                    false,
+                    inputrec, doRerun, vsite.get(), ms, replExParams,
+                    fcd, static_cast<int>(filenames.size()), filenames.data(),
+                    &observablesHistory, membed);
+
+        const bool          useModularSimulator = inputIsCompatibleWithModularSimulator && !(getenv("GMX_DISABLE_MODULAR_SIMULATOR") != nullptr);
+        GpuApiCallBehavior  transferKind        = (inputrec->eI == eiMD && !doRerun && !useModularSimulator) ? GpuApiCallBehavior::Async : GpuApiCallBehavior::Sync;
+
+        // We initialize GPU state even for the CPU runs so we will have a more verbose
+        // error if someone will try accessing it from the CPU codepath
+        gmx::StatePropagatorDataGpu stateGpu(commandStream,
+                                             gpuContext,
+                                             transferKind,
+                                             paddingSize);
+        fr->stateGpu = &stateGpu;
+
         // TODO This is not the right place to manage the lifetime of
         // this data structure, but currently it's the easiest way to
         // make it work.
@@ -1510,11 +1532,6 @@ int Mdrunner::mdrunner()
         SimulatorBuilder simulatorBuilder;
 
         // build and run simulator object based on user-input
-        const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible(
-                    false,
-                    inputrec, doRerun, vsite.get(), ms, replExParams,
-                    fcd, static_cast<int>(filenames.size()), filenames.data(),
-                    &observablesHistory, membed);
         auto simulator = simulatorBuilder.build(
                     inputIsCompatibleWithModularSimulator,
                     fplog, cr, ms, mdlog, static_cast<int>(filenames.size()), filenames.data(),
diff --git a/src/gromacs/mdtypes/CMakeLists.txt b/src/gromacs/mdtypes/CMakeLists.txt
index 788b6cf3bc..f9adf04c23 100644
--- a/src/gromacs/mdtypes/CMakeLists.txt
+++ b/src/gromacs/mdtypes/CMakeLists.txt
@@ -32,7 +32,31 @@
 # To help us fund GROMACS development, we humbly ask that you cite
 # the research papers on the package. Check out http://www.gromacs.org.
 
-file(GLOB MDTYPES_SOURCES *.cpp)
+file(GLOB MDTYPES_SOURCES
+    df_history.cpp
+    group.cpp
+    iforceprovider.cpp
+    inputrec.cpp
+    md_enums.cpp
+    observableshistory.cpp
+    state.cpp)
+
+if(GMX_USE_CUDA OR GMX_USE_OPENCL)
+    gmx_add_libgromacs_sources(
+       state_propagator_data_gpu_impl_gpu.cpp
+       )
+    if(GMX_USE_CUDA)
+        gmx_compile_cpp_as_cuda(
+            state_propagator_data_gpu_impl_gpu.cpp
+            )
+    endif()
+else()
+    gmx_add_libgromacs_sources(
+      state_propagator_data_gpu_impl.cpp
+      )
+endif()
+
+
 set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${MDTYPES_SOURCES} PARENT_SCOPE)
 
 if(GMX_INSTALL_LEGACY_API)
diff --git a/src/gromacs/mdtypes/forcerec.h b/src/gromacs/mdtypes/forcerec.h
index ec01003d24..540d0d073e 100644
--- a/src/gromacs/mdtypes/forcerec.h
+++ b/src/gromacs/mdtypes/forcerec.h
@@ -60,6 +60,7 @@ namespace gmx
 {
 class GpuBonded;
 class ForceProviders;
+class StatePropagatorDataGpu;
 }
 
 /* macros for the cginfo data in forcerec
@@ -268,6 +269,11 @@ struct t_forcerec { // NOLINT (clang-analyzer-optin.performance.Padding)
     struct ewald_corr_thread_t *ewc_t       = nullptr;
 
     gmx::ForceProviders        *forceProviders = nullptr;
+
+    // The stateGpu object is created in runner, forcerec just keeps the copy of the pointer.
+    // TODO: This is not supposed to be here. StatePropagatorDataGpu should be a part of
+    //       general StatePropagatorData object that is passed around
+    gmx::StatePropagatorDataGpu  *stateGpu = nullptr;
 };
 
 /* Important: Starting with Gromacs-4.6, the values of c6 and c12 in the nbfp array have
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h
new file mode 100644
index 0000000000..cf20737bce
--- /dev/null
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h
@@ -0,0 +1,225 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Declaration of interfaces for GPU state data propagator object.
+ *
+ * This object stores and manages positions, velocities and forces for
+ * all particles in the system on the GPU.
+ *
+ * \todo Add cycle counters.
+ * \todo Add synchronization points.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \inlibraryapi
+ * \ingroup module_mdtypes
+ */
+#ifndef GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_H
+#define GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_H
+
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/math/vectypes.h"
+#include "gromacs/utility/arrayref.h"
+#include "gromacs/utility/classhelpers.h"
+
+namespace gmx
+{
+
+class StatePropagatorDataGpu
+{
+    public:
+
+        /*! \brief Atom locality indicator: local, non-local, all.
+         *
+         * \todo This should be managed by a separate object, since the localities
+         *       are used here and in buffer ops.
+         */
+        enum class AtomLocality : int
+        {
+            Local    = 0, //!< Local atoms
+            NonLocal = 1, //!< Non-local atoms
+            All      = 2, //!< Both local and non-local atoms
+            Count    = 3  //!< The number of atom locality types
+        };
+
+        /*! \brief Constructor
+         *
+         * The buffers are reallocated only at the reinit call, the padding is
+         * used there for the coordinates buffer. It is needed for PME and added at
+         * the end of the buffer. It is assumed that if the rank has PME duties on the
+         * GPU, all coordinates are copied to the GPU and hence, for this rank, the
+         * coordinates buffer is not split into local and non-local ranges. For other
+         * ranks, the padding size is zero. This works because only one rank ever does
+         * PME work on the GPU, and if that rank also does PP work that is the only
+         * rank. So all coordinates are always transferred.
+         *
+         * \note \p commandStream and \p gpuContext are allowed to be nullptr if
+         *       StatePropagatorDataGpu is not used in the OpenCL run (e.g. if PME
+         *       does not run on the GPU).
+         *
+         * \todo Make \p CommandStream visible in the CPU parts of the code so we
+         *       will not have to pass a void*.
+         * \todo Make \p Context visible in CPU parts of the code so we will not
+         *       have to pass a void*.
+         *
+         *  \param[in] commandStream  GPU stream, nullptr allowed.
+         *  \param[in] gpuContext     GPU context, nullptr allowed.
+         *  \param[in] transferKind   H2D/D2H transfer call behavior (synchronous or not).
+         *  \param[in] paddingSize    Padding size for coordinates buffer.
+         */
+        StatePropagatorDataGpu(const void        *commandStream,
+                               const void        *gpuContext,
+                               GpuApiCallBehavior transferKind,
+                               int                paddingSize);
+
+        ~StatePropagatorDataGpu();
+
+        /*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
+         *
+         * The coordinates buffer is reallocated with the padding added at the end. The
+         * size of padding is set by the constructor.
+         *
+         *  \param[in] numAtomsLocal  Number of atoms in local domain.
+         *  \param[in] numAtomsAll    Total number of atoms to handle.
+         */
+        void reinit(int numAtomsLocal, int numAtomsAll);
+
+        /*! \brief Returns the range of atoms to be copied based on the copy type (all, local or non-local).
+         *
+         * \todo There are at least three versions of the function with this functionality in the code:
+         *       this one and two more in NBNXM. These should be unified in a shape of a general function
+         *       in DD.
+         *
+         * \param[in]  atomLocality    If all, local or non-local ranges are needed.
+         *
+         * \returns Tuple, containing the index of the first atom in the range and the total number of atoms in the range.
+         */
+        std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality  atomLocality);
+
+
+        /*! \brief Get the positions buffer on the GPU.
+         *
+         *  \returns GPU positions buffer.
+         */
+        DeviceBuffer<float> getCoordinates();
+
+        /*! \brief Copy positions to the GPU memory.
+         *
+         *  \param[in] h_x           Positions in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyCoordinatesToGpu(gmx::ArrayRef<const gmx::RVec>  h_x,
+                                  AtomLocality                    atomLocality);
+
+        /*! \brief Copy positions from the GPU memory.
+         *
+         *  \param[in] h_x           Positions buffer in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec>  h_x,
+                                    AtomLocality              atomLocality);
+
+
+        /*! \brief Get the velocities buffer on the GPU.
+         *
+         *  \returns GPU velocities buffer.
+         */
+        DeviceBuffer<float> getVelocities();
+
+        /*! \brief Copy velocities to the GPU memory.
+         *
+         *  \param[in] h_v           Velocities in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyVelocitiesToGpu(gmx::ArrayRef<const gmx::RVec>  h_v,
+                                 AtomLocality                    atomLocality);
+
+        /*! \brief Copy velocities from the GPU memory.
+         *
+         *  \param[in] h_v           Velocities buffer in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec>  h_v,
+                                   AtomLocality              atomLocality);
+
+
+        /*! \brief Get the force buffer on the GPU.
+         *
+         *  \returns GPU force buffer.
+         */
+        DeviceBuffer<float> getForces();
+
+        /*! \brief Copy forces to the GPU memory.
+         *
+         *  \param[in] h_f           Forces in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec>  h_f,
+                             AtomLocality                    atomLocality);
+
+        /*! \brief Copy forces from the GPU memory.
+         *
+         *  \param[in] h_f           Forces buffer in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyForcesFromGpu(gmx::ArrayRef<gmx::RVec>  h_f,
+                               AtomLocality              atomLocality);
+        /*! \brief Synchronize the underlying GPU stream
+         */
+        void synchronizeStream();
+
+        /*! \brief Getter for the number of local atoms.
+         *
+         *  \returns The number of local atoms.
+         */
+        int numAtomsLocal();
+
+        /*! \brief Getter for the total number of atoms.
+         *
+         *  \returns The total number of atoms.
+         */
+        int numAtomsAll();
+
+    private:
+        class Impl;
+        gmx::PrivateImplPointer<Impl> impl_;
+
+};
+
+}      // namespace gmx
+
+#endif // GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_H
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
new file mode 100644
index 0000000000..dd0aa76f80
--- /dev/null
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
@@ -0,0 +1,156 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief The CPU stub for the state propagator data class.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdtypes
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
+
+#if GMX_GPU == GMX_GPU_NONE
+namespace gmx
+{
+
+class StatePropagatorDataGpu::Impl
+{
+};
+
+StatePropagatorDataGpu::StatePropagatorDataGpu(const void *       /* commandStream */,
+                                               const void *       /* gpuContext    */,
+                                               GpuApiCallBehavior /* transferKind  */,
+                                               int                /* paddingSize   */)
+    : impl_(nullptr)
+{
+}
+
+StatePropagatorDataGpu::~StatePropagatorDataGpu()
+{
+}
+
+void StatePropagatorDataGpu::reinit(int  /* numAtomsLocal */,
+                                    int  /* numAtomsAll   */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality  /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    return std::make_tuple(0, 0);
+}
+
+DeviceBuffer<float> StatePropagatorDataGpu::getCoordinates()
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    return DeviceBuffer<float> {};
+}
+
+void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec>  /* h_x          */,
+                                                  AtomLocality                          /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec>  /* h_x          */,
+                                                    AtomLocality              /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::getVelocities()
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    return DeviceBuffer<float> {};
+}
+
+void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec>  /* h_v          */,
+                                                 AtomLocality                          /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+void StatePropagatorDataGpu::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec>  /* h_v          */,
+                                                   AtomLocality              /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::getForces()
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    return DeviceBuffer<float> {};
+}
+
+void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec>  /* h_f          */,
+                                             AtomLocality                          /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec>  /* h_f          */,
+                                               AtomLocality              /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+void StatePropagatorDataGpu::synchronizeStream()
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+int StatePropagatorDataGpu::numAtomsLocal()
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    return 0;
+}
+
+int StatePropagatorDataGpu::numAtomsAll()
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    return 0;
+}
+
+}      // namespace gmx
+
+#endif // GMX_GPU == GMX_GPU_NONE
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
new file mode 100644
index 0000000000..fe8dca0456
--- /dev/null
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
@@ -0,0 +1,273 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Declaration of low-level functions and fields of GPU state propagator object.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdtypes
+ */
+#ifndef GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_IMPL_H
+#define GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_IMPL_H
+
+#include "gmxpre.h"
+
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/math/vectypes.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
+#include "gromacs/utility/classhelpers.h"
+
+namespace gmx
+{
+
+class StatePropagatorDataGpu::Impl
+{
+    public:
+
+        Impl();
+
+
+        /*! \brief Constructor
+         *
+         * The buffers are reallocated only at the reinit call, the padding is
+         * used there for the coordinates buffer. It is needed for PME and added at
+         * the end of the buffer. It is assumed that if the rank has PME duties on the
+         * GPU, all coordinates are copied to the GPU and hence, for this rank, the
+         * coordinates buffer is not split into local and non-local ranges. For other
+         * ranks, the padding size is zero. This works because only one rank ever does
+         * PME work on the GPU, and if that rank also does PP work that is the only
+         * rank. So all coordinates are always transferred.
+         *
+         * \note \p commandStream and \p gpuContext are allowed to be nullptr if
+         *       StatePropagatorDataGpu is not used in the OpenCL run (e.g. if PME
+         *       does not run on the GPU).
+         *
+         * \todo Make CommandStream visible in the CPU parts of the code so we
+         *       will not have to pass a void*.
+         * \todo Make a Context object visible in CPU parts of the code so we
+         *       will not have to pass a void*.
+         *
+         *  \param[in] commandStream  GPU stream, nullptr allowed.
+         *  \param[in] gpuContext     GPU context, nullptr allowed.
+         *  \param[in] transferKind   H2D/D2H transfer call behavior (synchronous or not).
+         *  \param[in] paddingSize    Padding size for coordinates buffer.
+         */
+        Impl(const void        *commandStream,
+             const void        *gpuContext,
+             GpuApiCallBehavior transferKind,
+             int                paddingSize);
+
+        ~Impl();
+
+
+        /*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
+         *
+         * The coordinates buffer is reallocated with the padding added at the end. The
+         * size of padding is set by the constructor.
+         *
+         *  \param[in] numAtomsLocal  Number of atoms in local domain.
+         *  \param[in] numAtomsAll    Total number of atoms to handle.
+         */
+        void reinit(int numAtomsLocal, int numAtomsAll);
+
+        /*! \brief Returns the range of atoms to be copied based on the copy type (all, local or non-local).
+         *
+         * \todo There are at least three versions of the function with this functionality in the code:
+         *       this one and two more in NBNXM. These should be unified in a shape of a general function
+         *       in DD.
+         *
+         * \param[in]  atomLocality    If all, local or non-local ranges are needed.
+         *
+         * \returns Tuple, containing the index of the first atom in the range and the total number of atoms in the range.
+         */
+        std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality  atomLocality);
+
+
+        /*! \brief Get the positions buffer on the GPU.
+         *
+         *  \returns GPU positions buffer.
+         */
+        DeviceBuffer<float> getCoordinates();
+
+        /*! \brief Copy positions to the GPU memory.
+         *
+         *  \param[in] h_x           Positions in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyCoordinatesToGpu(gmx::ArrayRef<const gmx::RVec>  h_x,
+                                  AtomLocality                    atomLocality);
+
+        /*! \brief Copy positions from the GPU memory.
+         *
+         *  \param[in] h_x           Positions buffer in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec>  h_x,
+                                    AtomLocality              atomLocality);
+
+
+        /*! \brief Get the velocities buffer on the GPU.
+         *
+         *  \returns GPU velocities buffer.
+         */
+        DeviceBuffer<float> getVelocities();
+
+        /*! \brief Copy velocities to the GPU memory.
+         *
+         *  \param[in] h_v           Velocities in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyVelocitiesToGpu(gmx::ArrayRef<const gmx::RVec>  h_v,
+                                 AtomLocality                    atomLocality);
+
+        /*! \brief Copy velocities from the GPU memory.
+         *
+         *  \param[in] h_v           Velocities buffer in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec>  h_v,
+                                   AtomLocality              atomLocality);
+
+
+        /*! \brief Get the force buffer on the GPU.
+         *
+         *  \returns GPU force buffer.
+         */
+        DeviceBuffer<float> getForces();
+
+        /*! \brief Copy forces to the GPU memory.
+         *
+         *  \param[in] h_f           Forces in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec>  h_f,
+                             AtomLocality                    atomLocality);
+
+        /*! \brief Copy forces from the GPU memory.
+         *
+         *  \param[in] h_f           Forces buffer in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyForcesFromGpu(gmx::ArrayRef<gmx::RVec>  h_f,
+                               AtomLocality              atomLocality);
+
+        /*! \brief Synchronize the underlying GPU stream
+         */
+        void synchronizeStream();
+
+        /*! \brief Getter for the number of local atoms.
+         *
+         *  \returns The number of local atoms.
+         */
+        int numAtomsLocal();
+
+        /*! \brief Getter for the total number of atoms.
+         *
+         *  \returns The total number of atoms.
+         */
+        int numAtomsAll();
+
+    private:
+
+        /*! \brief GPU stream.
+         * \todo The stream should be set to non-nullptr once the synchronization points are restored
+         */
+        CommandStream        commandStream_              = nullptr;
+        /*! \brief GPU context (for OpenCL builds)
+         * \todo Make a Context class usable in CPU code
+         */
+        Context              gpuContext_                 = nullptr;
+        //! Default GPU calls behavior
+        GpuApiCallBehavior   transferKind_               = GpuApiCallBehavior::Async;
+        //! Padding size for the coordinates buffer
+        int                  paddingSize_                = 0;
+
+        //! Number of local atoms
+        int                  numAtomsLocal_              = -1;
+        //! Total number of atoms
+        int                  numAtomsAll_                = -1;
+
+        //! Device positions buffer
+        DeviceBuffer<float>  d_x_;
+        //! Number of particles saved in the positions buffer
+        int                  d_xSize_                    = -1;
+        //! Allocation size for the positions buffer
+        int                  d_xCapacity_                = -1;
+
+        //! Device velocities buffer
+        DeviceBuffer<float>  d_v_;
+        //! Number of particles saved in the velocities buffer
+        int                  d_vSize_                    = -1;
+        //! Allocation size for the velocities buffer
+        int                  d_vCapacity_                = -1;
+
+        //! Device force buffer
+        DeviceBuffer<float>  d_f_;
+        //! Number of particles saved in the force buffer
+        int                  d_fSize_                    = -1;
+        //! Allocation size for the force buffer
+        int                  d_fCapacity_                = -1;
+
+        /*! \brief Performs the copy of data from host to device buffer.
+         *
+         * \todo Template on locality.
+         *
+         * \param[in,out]  d_data        Device-side buffer.
+         * \param[in,out]  h_data        Host-side buffer.
+         * \param[in]      dataSize      Device-side data allocation size.
+         * \param[in]      atomLocality  If all, local or non-local ranges should be copied.
+         */
+        void copyToDevice(DeviceBuffer<float>                   d_data,
+                          const gmx::ArrayRef<const gmx::RVec>  h_data,
+                          int                                   dataSize,
+                          AtomLocality                          atomLocality);
+
+        /*! \brief Performs the copy of data from device to host buffer.
+         *
+         * \param[in,out]  h_data        Host-side buffer.
+         * \param[in,out]  d_data        Device-side buffer.
+         * \param[in]      dataSize      Device-side data allocation size.
+         * \param[in]      atomLocality  If all, local or non-local ranges should be copied.
+         */
+        void copyFromDevice(gmx::ArrayRef<gmx::RVec>  h_data,
+                            DeviceBuffer<float>       d_data,
+                            int                       dataSize,
+                            AtomLocality              atomLocality);
+};
+
+}      // namespace gmx
+
+#endif // GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_IMPL_H
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
new file mode 100644
index 0000000000..d2c1e5d54f
--- /dev/null
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -0,0 +1,381 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Definitions of interfaces for GPU state data propagator object.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdtypes
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#if GMX_GPU != GMX_GPU_NONE
+
+#if GMX_GPU == GMX_GPU_CUDA
+#include "gromacs/gpu_utils/cudautils.cuh"
+#endif
+#include "gromacs/gpu_utils/devicebuffer.h"
+#if GMX_GPU == GMX_GPU_OPENCL
+#include "gromacs/gpu_utils/oclutils.h"
+#endif
+#include "gromacs/math/vectypes.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
+#include "gromacs/utility/classhelpers.h"
+
+#include "state_propagator_data_gpu_impl.h"
+
+namespace gmx
+{
+
+StatePropagatorDataGpu::Impl::Impl(gmx_unused const void *commandStream,
+                                   gmx_unused const void *gpuContext,
+                                   GpuApiCallBehavior     transferKind,
+                                   int                    paddingSize) :
+    transferKind_(transferKind),
+    paddingSize_(paddingSize)
+{
+
+    GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build.");
+
+    // Set the stream-context pair for the OpenCL builds,
+    // use the nullptr stream for CUDA builds
+#if GMX_GPU == GMX_GPU_OPENCL
+    if (commandStream != nullptr)
+    {
+        commandStream_ = *static_cast<const CommandStream*>(commandStream);
+    }
+    if (gpuContext != nullptr)
+    {
+        gpuContext_ = *static_cast<const Context*>(gpuContext);
+    }
+#endif
+
+}
+
+StatePropagatorDataGpu::Impl::~Impl()
+{
+}
+
+void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
+{
+#if GMX_GPU == GMX_GPU_OPENCL
+    GMX_ASSERT(gpuContext_ != nullptr, "GPU context should be set in OpenCL builds.");
+#endif
+    numAtomsLocal_ = numAtomsLocal;
+    numAtomsAll_   = numAtomsAll;
+
+    int numAtomsPadded;
+    if (paddingSize_ > 0)
+    {
+        numAtomsPadded = ((numAtomsAll_ + paddingSize_ - 1 ) / paddingSize_ )*paddingSize_;
+    }
+    else
+    {
+        numAtomsPadded = numAtomsAll_;
+    }
+
+    reallocateDeviceBuffer(&d_x_, DIM*numAtomsPadded, &d_xSize_, &d_xCapacity_, gpuContext_);
+
+    const size_t paddingAllocationSize = numAtomsPadded - numAtomsAll_;
+    if (paddingAllocationSize > 0)
+    {
+        clearDeviceBufferAsync(&d_x_, DIM*numAtomsAll_, DIM*paddingAllocationSize, commandStream_);
+    }
+
+    reallocateDeviceBuffer(&d_v_, DIM*numAtomsAll_, &d_vSize_, &d_vCapacity_, gpuContext_);
+    reallocateDeviceBuffer(&d_f_, DIM*numAtomsAll_, &d_fSize_, &d_fCapacity_, gpuContext_);
+
+}
+
+std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality  atomLocality)
+{
+    int atomsStartAt   = 0;
+    int numAtomsToCopy = 0;
+    switch (atomLocality)
+    {
+        case AtomLocality::All:
+            atomsStartAt    = 0;
+            numAtomsToCopy  = numAtomsAll_;
+            break;
+        case AtomLocality::Local:
+            atomsStartAt    = 0;
+            numAtomsToCopy  = numAtomsLocal_;
+            break;
+        case AtomLocality::NonLocal:
+            atomsStartAt    = numAtomsLocal_;
+            numAtomsToCopy  = numAtomsAll_ - numAtomsLocal_;
+            break;
+        default:
+            GMX_RELEASE_ASSERT(false, "Wrong range of atoms requested in GPU state data manager. Should be All, Local or NonLocal.");
+    }
+    GMX_ASSERT(atomsStartAt   >= 0, "The first elemtnt to copy has negative index. Probably, the GPU propagator state was not initialized.");
+    GMX_ASSERT(numAtomsToCopy >= 0, "Number of atoms to copy is negative. Probably, the GPU propagator state was not initialized.");
+    return std::make_tuple(atomsStartAt, numAtomsToCopy);
+}
+
+void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<float>                   d_data,
+                                                const gmx::ArrayRef<const gmx::RVec>  h_data,
+                                                int                                   dataSize,
+                                                AtomLocality                          atomLocality)
+{
+
+#if GMX_GPU == GMX_GPU_OPENCL
+    GMX_ASSERT(gpuContext_ != nullptr, "GPU context should be set in OpenCL builds.");
+#endif
+
+    GMX_UNUSED_VALUE(dataSize);
+
+    GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");
+
+    int atomsStartAt, numAtomsToCopy;
+    std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
+
+    int elementsStartAt   = atomsStartAt*DIM;
+    int numElementsToCopy = numAtomsToCopy*DIM;
+
+    if (numAtomsToCopy != 0)
+    {
+        GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize, "The device allocation is smaller than requested copy range.");
+        GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(), "The host buffer is smaller than the requested copy range.");
+
+        // TODO: Use the proper stream
+        copyToDeviceBuffer(&d_data, reinterpret_cast<const float *>(&h_data.data()[atomsStartAt]),
+                           elementsStartAt, numElementsToCopy,
+                           commandStream_, transferKind_, nullptr);
+    }
+}
+
+void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec>  h_data,
+                                                  DeviceBuffer<float>       d_data,
+                                                  int                       dataSize,
+                                                  AtomLocality              atomLocality)
+{
+
+#if GMX_GPU == GMX_GPU_OPENCL
+    GMX_ASSERT(gpuContext_ != nullptr, "GPU context should be set in OpenCL builds.");
+#endif
+
+    GMX_UNUSED_VALUE(dataSize);
+
+    GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");
+
+    int atomsStartAt, numAtomsToCopy;
+    std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
+
+    int elementsStartAt   = atomsStartAt*DIM;
+    int numElementsToCopy = numAtomsToCopy*DIM;
+
+    if (numAtomsToCopy != 0)
+    {
+        GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize, "The device allocation is smaller than requested copy range.");
+        GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(), "The host buffer is smaller than the requested copy range.");
+
+        // TODO: Use the proper stream
+        copyFromDeviceBuffer(reinterpret_cast<float*>(&h_data.data()[atomsStartAt]), &d_data,
+                             elementsStartAt, numElementsToCopy,
+                             commandStream_, transferKind_, nullptr);
+
+    }
+}
+
+DeviceBuffer<float> StatePropagatorDataGpu::Impl::getCoordinates()
+{
+    return d_x_;
+}
+
+void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec>  h_x,
+                                                        AtomLocality                          atomLocality)
+{
+    copyToDevice(d_x_, h_x, d_xSize_, atomLocality);
+}
+
+void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec>  h_x,
+                                                          AtomLocality              atomLocality)
+{
+    copyFromDevice(h_x, d_x_, d_xSize_, atomLocality);
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::Impl::getVelocities()
+{
+    return d_v_;
+}
+
+void StatePropagatorDataGpu::Impl::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec>  h_v,
+                                                       AtomLocality                          atomLocality)
+{
+    copyToDevice(d_v_, h_v, d_vSize_, atomLocality);
+}
+
+void StatePropagatorDataGpu::Impl::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec>  h_v,
+                                                         AtomLocality              atomLocality)
+{
+    copyFromDevice(h_v, d_v_, d_vSize_, atomLocality);
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::Impl::getForces()
+{
+    return d_f_;
+}
+
+void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec>  h_f,
+                                                   AtomLocality                          atomLocality)
+{
+    copyToDevice(d_f_, h_f, d_fSize_, atomLocality);
+}
+
+void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec>  h_f,
+                                                     AtomLocality              atomLocality)
+{
+    copyFromDevice(h_f, d_f_, d_fSize_, atomLocality);
+}
+
+void StatePropagatorDataGpu::Impl::synchronizeStream()
+{
+    gpuStreamSynchronize(commandStream_);
+}
+
+int StatePropagatorDataGpu::Impl::numAtomsLocal()
+{
+    return numAtomsLocal_;
+}
+
+int StatePropagatorDataGpu::Impl::numAtomsAll()
+{
+    return numAtomsAll_;
+}
+
+
+
+StatePropagatorDataGpu::StatePropagatorDataGpu(const void        *commandStream,
+                                               const void        *gpuContext,
+                                               GpuApiCallBehavior transferKind,
+                                               int                paddingSize)
+    : impl_(new Impl(commandStream,
+                     gpuContext,
+                     transferKind,
+                     paddingSize))
+{
+}
+
+StatePropagatorDataGpu::~StatePropagatorDataGpu() = default;
+
+
+void StatePropagatorDataGpu::reinit(int numAtomsLocal, int numAtomsAll)
+{
+    return impl_->reinit(numAtomsLocal, numAtomsAll);
+}
+
+std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality  atomLocality)
+{
+    return impl_->getAtomRangesFromAtomLocality(atomLocality);
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::getCoordinates()
+{
+    return impl_->getCoordinates();
+}
+
+void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec>  h_x,
+                                                  AtomLocality                          atomLocality)
+{
+    return impl_->copyCoordinatesToGpu(h_x, atomLocality);
+}
+
+void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<RVec>  h_x,
+                                                    AtomLocality         atomLocality)
+{
+    return impl_->copyCoordinatesFromGpu(h_x, atomLocality);
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::getVelocities()
+{
+    return impl_->getVelocities();
+}
+
+void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec>  h_v,
+                                                 AtomLocality                          atomLocality)
+{
+    return impl_->copyVelocitiesToGpu(h_v, atomLocality);
+}
+
+void StatePropagatorDataGpu::copyVelocitiesFromGpu(gmx::ArrayRef<RVec>  h_v,
+                                                   AtomLocality         atomLocality)
+{
+    return impl_->copyVelocitiesFromGpu(h_v, atomLocality);
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::getForces()
+{
+    return impl_->getForces();
+}
+
+void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec>  h_f,
+                                             AtomLocality                          atomLocality)
+{
+    return impl_->copyForcesToGpu(h_f, atomLocality);
+}
+
+void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<RVec>  h_f,
+                                               AtomLocality         atomLocality)
+{
+    return impl_->copyForcesFromGpu(h_f, atomLocality);
+}
+
+void StatePropagatorDataGpu::synchronizeStream()
+{
+    return impl_->synchronizeStream();
+}
+
+int StatePropagatorDataGpu::numAtomsLocal()
+{
+    return impl_->numAtomsLocal();
+}
+
+int StatePropagatorDataGpu::numAtomsAll()
+{
+    return impl_->numAtomsAll();
+}
+
+}      // namespace gmx
+
+#endif // GMX_GPU == GMX_GPU_NONE
diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp
index f6df6eb6b4..5df9633e27 100644
--- a/src/gromacs/nbnxm/atomdata.cpp
+++ b/src/gromacs/nbnxm/atomdata.cpp
@@ -1084,42 +1084,12 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet     &gridSet,
     }
 }
 
-void nbnxn_atomdata_copy_x_to_gpu(const Nbnxm::GridSet     &gridSet,
-                                  const Nbnxm::AtomLocality locality,
-                                  bool                      fillLocal,
-                                  nbnxn_atomdata_t         *nbat,
-                                  gmx_nbnxn_gpu_t          *gpu_nbv,
-                                  const rvec               *coordinatesHost)
-{
-    int gridBegin = 0;
-    int gridEnd   = 0;
-    getAtomRanges(gridSet, locality, &gridBegin, &gridEnd);
-
-    if (fillLocal)
-    {
-        nbat->natoms_local = gridSet.grids()[0].atomIndexEnd();
-    }
-
-    for (int g = gridBegin; g < gridEnd; g++)
-    {
-        nbnxn_gpu_copy_x_to_gpu(gridSet.grids()[g],
-                                gpu_nbv,
-                                locality,
-                                coordinatesHost);
-    }
-}
-
-DeviceBuffer<float> nbnxn_atomdata_get_x_gpu(gmx_nbnxn_gpu_t *gpu_nbv)
-{
-    return Nbnxm::nbnxn_gpu_get_x_gpu(gpu_nbv);
-}
-
 /* Copies (and reorders) the coordinates to nbnxn_atomdata_t on the GPU*/
 void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet     &gridSet,
                                     const Nbnxm::AtomLocality locality,
                                     bool                      fillLocal,
                                     gmx_nbnxn_gpu_t          *gpu_nbv,
-                                    DeviceBuffer<float>       coordinatesDevice)
+                                    DeviceBuffer<float>       d_x)
 {
 
     int gridBegin = 0;
@@ -1131,7 +1101,7 @@ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet     &gridSet,
         nbnxn_gpu_x_to_nbat_x(gridSet.grids()[g],
                               fillLocal && g == 0,
                               gpu_nbv,
-                              coordinatesDevice,
+                              d_x,
                               locality,
                               g,
                               gridSet.numColumnsMax());
@@ -1552,11 +1522,6 @@ void reduceForcesGpu(const Nbnxm::AtomLocality        locality,
                                      accumulateForce);
 }
 
-DeviceBuffer<float> nbnxn_atomdata_get_f_gpu(gmx_nbnxn_gpu_t *gpu_nbv)
-{
-    return Nbnxm::nbnxn_gpu_get_f_gpu(gpu_nbv);
-}
-
 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t   &nbat,
                                               gmx::ArrayRef<gmx::RVec>  fshift)
 {
diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h
index ba644bd436..412d328bce 100644
--- a/src/gromacs/nbnxm/atomdata.h
+++ b/src/gromacs/nbnxm/atomdata.h
@@ -58,8 +58,6 @@ struct nonbonded_verlet_t;
 struct t_mdatoms;
 struct tMPI_Atomic;
 
-enum class BufferOpsUseGpu;
-
 class GpuEventSynchronizer;
 
 namespace Nbnxm
@@ -326,46 +324,22 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet       &gridSet,
                                      const rvec                 *coordinates,
                                      nbnxn_atomdata_t           *nbat);
 
-/*! \brief Copies the coordinates to the GPU (in plain rvec format)
- *
- *  This function copied data to the gpu so that the transformation to the NBNXM format can be done on the GPU.
- *
- * \param[in] gridSet          The grids data.
- * \param[in] locality         If local or non local coordinates should be copied.
- * \param[in] fillLocal        If the local filler particle coordinates should be zeroed.
- * \param[in] nbat             Data in NBNXM format, used to zero coordinates of filler particles.
- * \param[in] gpu_nbv          The NBNXM GPU data structure.
- * \param[in] coordinatesHost  Coordinates to be copied (in plain rvec format).
- */
-void nbnxn_atomdata_copy_x_to_gpu(const Nbnxm::GridSet     &gridSet,
-                                  Nbnxm::AtomLocality       locality,
-                                  bool                      fillLocal,
-                                  nbnxn_atomdata_t         *nbat,
-                                  gmx_nbnxn_gpu_t          *gpu_nbv,
-                                  const rvec               *coordinatesHost);
-
-/*!\brief Getter for the GPU coordinates buffer
- *
- * \param[in] gpu_nbv  The NBNXM GPU data structure.
- */
-DeviceBuffer<float> nbnxn_atomdata_get_x_gpu(gmx_nbnxn_gpu_t *gpu_nbv);
-
 /*! \brief Transform coordinates to xbat layout on GPU
  *
  * Creates a GPU copy of the coordinates buffer using short-range ordering.
  * As input, uses coordinates in plain rvec format in GPU memory.
  *
- * \param[in]     gridSet            The grids data.
- * \param[in]     locality           If the transformation should be applied to local or non local coordinates.
- * \param[in]     fillLocal          Tells if the local filler particle coordinates should be zeroed.
- * \param[in,out] gpu_nbv            The NBNXM GPU data structure.
- * \param[in]     coordinatesDevice  Coordinates to be copied (in plain rvec format).
+ * \param[in]     gridSet    The grids data.
+ * \param[in]     locality   If the transformation should be applied to local or non local coordinates.
+ * \param[in]     fillLocal  Tells if the local filler particle coordinates should be zeroed.
+ * \param[in,out] gpu_nbv    The NBNXM GPU data structure.
+ * \param[in]     d_x        Coordinates to be copied (in plain rvec format).
  */
 void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet     &gridSet,
                                     Nbnxm::AtomLocality       locality,
                                     bool                      fillLocal,
                                     gmx_nbnxn_gpu_t          *gpu_nbv,
-                                    DeviceBuffer<float>       coordinatesDevice);
+                                    DeviceBuffer<float>       d_x);
 
 /*! \brief Add the computed forces to \p f, an internal reduction might be performed as well
  *
@@ -399,16 +373,6 @@ void reduceForcesGpu(Nbnxm::AtomLocality                 locality,
                      bool                                useGpuFPmeReduction,
                      bool                                accumulateForce);
 
-/*!\brief Getter for the GPU forces buffer
- *
- * \todo Will be removed when the buffer management is lifted out of the NBNXM
- *
- * \param[in] gpu_nbv  The NBNXM GPU data structure.
- *
- * \returns Device forces buffer
- */
-DeviceBuffer<float> nbnxn_atomdata_get_f_gpu(gmx_nbnxn_gpu_t *gpu_nbv);
-
 /* Add the fshift force stored in nbat to fshift */
 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t   &nbat,
                                               gmx::ArrayRef<gmx::RVec>  fshift);
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 10a6f0a0d3..dba1ca8c82 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -743,60 +743,11 @@ void cuda_set_cacheconfig()
     }
 }
 
-/* X buffer operations on GPU: copies coordinates to the GPU in rvec format. */
-void nbnxn_gpu_copy_x_to_gpu(const Nbnxm::Grid               &grid,
-                             gmx_nbnxn_gpu_t                 *nb,
-                             const Nbnxm::AtomLocality        locality,
-                             const rvec                      *coordinatesHost)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    bool                       bDoTime = nb->bDoTime;
-
-    Nbnxm::InteractionLocality interactionLoc            = gpuAtomToInteractionLocality(locality);
-    int                        numCopyAtoms              = grid.srcAtomEnd() - grid.srcAtomBegin();
-    int                        copyAtomStart             = grid.srcAtomBegin();
-
-    cudaStream_t               stream  = nb->stream[interactionLoc];
-
-    // empty domain avoid launching zero-byte copy
-    if (numCopyAtoms == 0)
-    {
-        return;
-    }
-    GMX_ASSERT(coordinatesHost,  "Need a valid host pointer");
-
-    if (bDoTime)
-    {
-        nb->timers->xf[locality].nb_h2d.openTimingRegion(stream);
-    }
-
-    rvec       *devicePtrDest = reinterpret_cast<rvec *> (nb->xrvec[copyAtomStart]);
-    const rvec *devicePtrSrc  = reinterpret_cast<const rvec *> (coordinatesHost[copyAtomStart]);
-    copyToDeviceBuffer(&devicePtrDest, devicePtrSrc, 0, numCopyAtoms,
-                       stream, GpuApiCallBehavior::Async, nullptr);
-
-    if (interactionLoc == Nbnxm::InteractionLocality::Local)
-    {
-        nb->xAvailableOnDevice->markEvent(stream);
-    }
-
-    if (bDoTime)
-    {
-        nb->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
-    }
-}
-
-DeviceBuffer<float> nbnxn_gpu_get_x_gpu(gmx_nbnxn_gpu_t *nb)
-{
-    return reinterpret_cast< DeviceBuffer<float> >(nb->xrvec);
-}
-
 /* X buffer operations on GPU: performs conversion from rvec to nb format. */
 void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
                            bool                             setFillerCoords,
                            gmx_nbnxn_gpu_t                 *nb,
-                           DeviceBuffer<float>              coordinatesDevice,
+                           DeviceBuffer<float>              d_x,
                            const Nbnxm::AtomLocality        locality,
                            int                              gridId,
                            int                              numColumnsMax)
@@ -817,7 +768,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
     if (numAtoms != 0)
     {
         // TODO: This will only work with CUDA
-        GMX_ASSERT(coordinatesDevice, "Need a valid device pointer");
+        GMX_ASSERT(d_x, "Need a valid device pointer");
 
         KernelLaunchConfig config;
         config.blockSize[0]     = c_bufOpsThreadsPerBlock;
@@ -839,7 +790,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
                                                               &numColumns,
                                                               &xqPtr,
                                                               &setFillerCoords,
-                                                              &coordinatesDevice,
+                                                              &d_x,
                                                               &d_atomIndices,
                                                               &d_cxy_na,
                                                               &d_cxy_ind,
@@ -920,142 +871,6 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality               atomLocality,
 
 }
 
-DeviceBuffer<float> nbnxn_gpu_get_f_gpu(gmx_nbnxn_gpu_t *nb)
-{
-    return reinterpret_cast< DeviceBuffer<float> >(nb->frvec);
-}
-
-void nbnxn_launch_copy_f_to_gpu(const AtomLocality               atomLocality,
-                                const Nbnxm::GridSet            &gridSet,
-                                gmx_nbnxn_gpu_t                 *nb,
-                                rvec                            *f)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
-    cudaStream_t              stream    = nb->stream[iLocality];
-
-    bool                      bDoTime = nb->bDoTime;
-    cu_timers_t              *t       = nb->timers;
-
-    int                       atomStart = 0, numCopyAtoms = 0;
-
-    nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &numCopyAtoms);
-
-    // Avoiding launching copy with no work
-    if (numCopyAtoms == 0)
-    {
-        return;
-    }
-    GMX_ASSERT(f, "Need a valid f pointer");
-
-    if (bDoTime)
-    {
-        t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
-    }
-
-    rvec       *ptrDest  = reinterpret_cast<rvec *> (nb->frvec[atomStart]);
-    rvec       *ptrSrc   = reinterpret_cast<rvec *> (f[atomStart]);
-    //copyToDeviceBuffer(&ptrDest, ptrSrc, 0, numCopyAtoms,
-    //                   stream, GpuApiCallBehavior::Async, nullptr);
-    //TODO use above API call rather than direct memcpy when force has been implemented in a hostvector
-    cudaMemcpyAsync(ptrDest, ptrSrc, numCopyAtoms*sizeof(rvec), cudaMemcpyHostToDevice,
-                    stream);
-
-    if (bDoTime)
-    {
-        t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
-    }
-
-    return;
-}
-
-void nbnxn_launch_copy_f_from_gpu(const AtomLocality               atomLocality,
-                                  const Nbnxm::GridSet            &gridSet,
-                                  gmx_nbnxn_gpu_t                 *nb,
-                                  rvec                            *f)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
-    cudaStream_t              stream    = nb->stream[iLocality];
-
-    bool                      bDoTime = nb->bDoTime;
-    cu_timers_t              *t       = nb->timers;
-    int                       atomStart, numCopyAtoms;
-
-    nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &numCopyAtoms);
-
-    // Avoiding launching copy with no work
-    if (numCopyAtoms == 0)
-    {
-        return;
-    }
-    GMX_ASSERT(f, "Need a valid f pointer");
-
-    if (bDoTime)
-    {
-        t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
-    }
-
-    GMX_ASSERT(nb->frvec,  "Need a valid nb->frvec pointer");
-    rvec       *ptrDest = reinterpret_cast<rvec *> (f[atomStart]);
-    rvec       *ptrSrc  = reinterpret_cast<rvec *> (nb->frvec[atomStart]);
-    //copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, numCopyAtoms,
-    //                   stream, GpuApiCallBehavior::Async, nullptr);
-    //TODO use above API call rather than direct memcpy when force has been implemented in a hostvector
-    cudaMemcpyAsync(ptrDest, ptrSrc, numCopyAtoms*sizeof(rvec), cudaMemcpyDeviceToHost,
-                    stream);
-
-    if (bDoTime)
-    {
-        t->xf[atomLocality].nb_d2h.closeTimingRegion(stream);
-    }
-
-    return;
-}
-
-void nbnxn_launch_copy_x_from_gpu(const AtomLocality               atomLocality,
-                                  const Nbnxm::GridSet            &gridSet,
-                                  gmx_nbnxn_gpu_t                 *nb,
-                                  rvec                            *x)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-    GMX_ASSERT(x,  "Need a valid x pointer");
-
-    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
-    cudaStream_t              stream    = nb->stream[iLocality];
-
-    bool                      bDoTime = nb->bDoTime;
-    cu_timers_t              *t       = nb->timers;
-    int                       atomStart, nAtoms;
-
-    nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms);
-
-    if (bDoTime)
-    {
-        t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
-    }
-
-    GMX_ASSERT(nb->xrvec,  "Need a valid nb->xrvec pointer");
-    rvec       *ptrDest = reinterpret_cast<rvec *> (x[atomStart]);
-    rvec       *ptrSrc  = reinterpret_cast<rvec *> (nb->xrvec[atomStart]);
-    copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, nAtoms,
-                         stream, GpuApiCallBehavior::Async, stream);
-
-    if (atomLocality == AtomLocality::NonLocal)
-    {
-        nb->xNonLocalCopyD2HDone->markEvent(stream);
-    }
-
-    if (bDoTime)
-    {
-        t->xf[atomLocality].nb_d2h.closeTimingRegion(stream);
-    }
-
-    return;
-}
-
 void nbnxn_wait_for_gpu_force_reduction(const AtomLocality      gmx_unused atomLocality,
                                         gmx_nbnxn_gpu_t                   *nb)
 {
@@ -1069,16 +884,6 @@ void nbnxn_wait_for_gpu_force_reduction(const AtomLocality      gmx_unused atomL
 
 }
 
-void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t *gpu_nbv)
-{
-    return static_cast<void *> (gpu_nbv->xrvec);
-}
-
-void* nbnxn_get_gpu_frvec(gmx_nbnxn_gpu_t *gpu_nbv)
-{
-    return static_cast<void *> (gpu_nbv->frvec);
-}
-
 void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t   *nb)
 {
     return static_cast<void*> (nb->xAvailableOnDevice);
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index 1df63f01db..3641d5eb5f 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -502,16 +502,12 @@ gpu_init(const gmx_device_info_t   *deviceInfo,
 
     cuda_init_const(nb, ic, listParams, nbat->params());
 
-    nb->natoms                   = 0;
-    nb->natoms_alloc             = 0;
     nb->atomIndicesSize          = 0;
     nb->atomIndicesSize_alloc    = 0;
     nb->ncxy_na                  = 0;
     nb->ncxy_na_alloc            = 0;
     nb->ncxy_ind                 = 0;
     nb->ncxy_ind_alloc           = 0;
-    nb->nfrvec                   = 0;
-    nb->nfrvec_alloc             = 0;
     nb->ncell                    = 0;
     nb->ncell_alloc              = 0;
 
@@ -903,10 +899,7 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
         const int           atomIndicesSize   = gridSet.atomIndices().size();
         const int          *cxy_na            = grid.cxy_na().data();
         const int          *cxy_ind           = grid.cxy_ind().data();
-        // TODO Should be done once per gridset
-        const int           numRealAtomsTotal = gridSet.numRealAtomsTotal();
 
-        reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
         reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize, &gpu_nbv->atomIndicesSize_alloc, nullptr);
 
         if (atomIndicesSize > 0)
@@ -977,8 +970,6 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int                *cell,
 
     cudaStream_t         stream  = gpu_nbv->stream[InteractionLocality::Local];
 
-    reallocateDeviceBuffer(&gpu_nbv->frvec, natoms_total, &gpu_nbv->nfrvec, &gpu_nbv->nfrvec_alloc, nullptr);
-
     if (natoms_total > 0)
     {
         reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, nullptr);
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
index ec4b2b8d33..f3fc0e8852 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -219,18 +219,6 @@ struct gmx_nbnxn_cuda_t
     bool                                                            bUseTwoStreams;
     //! atom data
     cu_atomdata_t                                                  *atdat;
-    //! coordinates in rvec format
-    rvec                                                           *xrvec;
-    //! number of atoms
-    int                                                             natoms;
-    //! number of atoms allocated in device buffer
-    int                                                             natoms_alloc;
-    //! force in rvec format
-    rvec                                                           *frvec;
-    //! number of atoms in force buffer
-    int                                                             nfrvec;
-    //! number of atoms allocated in force buffer
-    int                                                             nfrvec_alloc;
     //! f buf ops cell index mapping
     int                                                            *cell;
     //! number of indices in cell buffer
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 892098ae4a..5c6967bb65 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -147,37 +147,16 @@ void nonbonded_verlet_t::convertCoordinates(const Nbnxm::AtomLocality       loca
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
 }
 
-
-void nonbonded_verlet_t::copyCoordinatesToGpu(const Nbnxm::AtomLocality       locality,
-                                              const bool                      fillLocal,
-                                              gmx::ArrayRef<const gmx::RVec>  coordinatesHost)
-{
-    wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
-    wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);
-
-    nbnxn_atomdata_copy_x_to_gpu(pairSearch_->gridSet(), locality, fillLocal,
-                                 nbat.get(), gpu_nbv,
-                                 as_rvec_array(coordinatesHost.data()));
-
-    wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
-    wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
-}
-
-DeviceBuffer<float> nonbonded_verlet_t::getDeviceCoordinates()
-{
-    return nbnxn_atomdata_get_x_gpu(gpu_nbv);
-}
-
 void nonbonded_verlet_t::convertCoordinatesGpu(const Nbnxm::AtomLocality       locality,
                                                const bool                      fillLocal,
-                                               DeviceBuffer<float>             coordinatesDevice)
+                                               DeviceBuffer<float>             d_x)
 {
     wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
     wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);
 
     nbnxn_atomdata_x_to_nbat_x_gpu(pairSearch_->gridSet(), locality, fillLocal,
                                    gpu_nbv,
-                                   coordinatesDevice);
+                                   d_x);
 
     wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
@@ -254,11 +233,6 @@ nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu()
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
 }
 
-DeviceBuffer<float> nonbonded_verlet_t::getDeviceForces()
-{
-    return nbnxn_atomdata_get_f_gpu(gpu_nbv);
-}
-
 real nonbonded_verlet_t::pairlistInnerRadius() const
 {
     return pairlistSets_->params().rlistInner;
@@ -286,40 +260,11 @@ void nonbonded_verlet_t::insertNonlocalGpuDependency(const Nbnxm::InteractionLoc
     Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
 }
 
-void nonbonded_verlet_t::launch_copy_f_to_gpu(rvec *f, const Nbnxm::AtomLocality locality)
-{
-    nbnxn_launch_copy_f_to_gpu(locality,
-                               pairSearch_->gridSet(),
-                               gpu_nbv,
-                               f);
-}
-
-void nonbonded_verlet_t::launch_copy_f_from_gpu(rvec *f, const Nbnxm::AtomLocality locality)
-{
-    nbnxn_launch_copy_f_from_gpu(locality,
-                                 pairSearch_->gridSet(),
-                                 gpu_nbv,
-                                 f);
-}
-
-void nonbonded_verlet_t::launch_copy_x_from_gpu(rvec *x, const Nbnxm::AtomLocality locality)
-{
-    nbnxn_launch_copy_x_from_gpu(locality,
-                                 pairSearch_->gridSet(),
-                                 gpu_nbv,
-                                 x);
-}
-
 void nonbonded_verlet_t::wait_for_gpu_force_reduction(const Nbnxm::AtomLocality locality)
 {
     nbnxn_wait_for_gpu_force_reduction(locality, gpu_nbv);
 }
 
-void* nonbonded_verlet_t::get_gpu_xrvec()
-{
-    return Nbnxm::nbnxn_get_gpu_xrvec(gpu_nbv);
-}
-
 void* nonbonded_verlet_t::get_x_on_device_event()
 {
     return Nbnxm::nbnxn_get_x_on_device_event(gpu_nbv);
@@ -330,11 +275,6 @@ void nonbonded_verlet_t::wait_nonlocal_x_copy_D2H_done()
     Nbnxm::nbnxn_wait_nonlocal_x_copy_D2H_done(gpu_nbv);
 }
 
-void* nonbonded_verlet_t::get_gpu_frvec()
-{
-    return Nbnxm::nbnxn_get_gpu_frvec(gpu_nbv);
-}
-
 void nonbonded_verlet_t::stream_local_wait_for_nonlocal()
 {
     Nbnxm::nbnxn_stream_local_wait_for_nonlocal(gpu_nbv);
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index c6d12d2e87..991f21692e 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -264,42 +264,17 @@ struct nonbonded_verlet_t
                                 bool                            fillLocal,
                                 gmx::ArrayRef<const gmx::RVec>  coordinates);
 
-        /*!\brief Copy coordinates to the GPU memory.
-         *
-         * This function uses the internal NBNXM GPU pointer to copy coordinates in the plain rvec format
-         * into the GPU memory.
-         *
-         * \todo This function will be removed in future patches as the management of the device buffers
-         *       is moved to a separate object.
-         *
-         * \param[in] locality         Whether coordinates for local or non-local atoms should be transformed.
-         * \param[in] fillLocal        If the coordinates for filler particles should be zeroed.
-         * \param[in] coordinatesHost  Coordinates in plain rvec format to be transformed.
-         */
-        void copyCoordinatesToGpu(Nbnxm::AtomLocality             locality,
-                                  bool                            fillLocal,
-                                  gmx::ArrayRef<const gmx::RVec>  coordinatesHost);
-
-        /*!\brief Getter for the GPU coordinates buffer.
-         *
-         * \todo This function will be removed in future patches as the management of the device buffers
-         *       is moved to a separate object.
-         *
-         * \returns The coordinates buffer in plain rvec format.
-         */
-        DeviceBuffer<float> getDeviceCoordinates();
-
         /*!\brief Convert the coordinates to NBNXM format on the GPU for the given locality
          *
          * The API function for the transformation of the coordinates from one layout to another in the GPU memory.
          *
-         * \param[in] locality           Whether coordinates for local or non-local atoms should be transformed.
-         * \param[in] fillLocal          If the coordinates for filler particles should be zeroed.
-         * \param[in] coordinatesDevice  GPU coordinates buffer in plain rvec format to be transformed.
+         * \param[in] locality   Whether coordinates for local or non-local atoms should be transformed.
+         * \param[in] fillLocal  If the coordinates for filler particles should be zeroed.
+         * \param[in] d_x        GPU coordinates buffer in plain rvec format to be transformed.
          */
         void convertCoordinatesGpu(Nbnxm::AtomLocality             locality,
                                    bool                            fillLocal,
-                                   DeviceBuffer<float>             coordinatesDevice);
+                                   DeviceBuffer<float>             d_x);
 
         //! Init for GPU version of setup coordinates in Nbnxm
         void atomdata_init_copy_x_to_nbat_x_gpu();
@@ -370,36 +345,15 @@ struct nonbonded_verlet_t
                                           bool                                useGpuFPmeReduction,
                                           bool                                accumulateForce);
 
-        /*!\brief Getter for the GPU force buffer.
-         *
-         * \todo This function will be removed in future patches as the management of the device buffers
-         *       is moved to a separate object.
-         *
-         * \returns The force buffer in plain rvec format.
-         */
-        DeviceBuffer<float> getDeviceForces();
-
         /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
         void atomdata_init_add_nbat_f_to_f_gpu();
 
-        /*! \brief H2D transfer of force buffer*/
-        void launch_copy_f_to_gpu(rvec *f, Nbnxm::AtomLocality locality);
-
-        /*! \brief D2H transfer of force buffer*/
-        void launch_copy_f_from_gpu(rvec *f, Nbnxm::AtomLocality locality);
-
-        /*! \brief D2H transfer of coordinate buffer*/
-        void launch_copy_x_from_gpu(rvec *f, Nbnxm::AtomLocality locality);
-
         /*! \brief Wait for GPU force reduction task and D2H transfer of its results to complete
          *
          * FIXME: need more details: when should be called / after which operation, etc.
          */
         void wait_for_gpu_force_reduction(Nbnxm::AtomLocality locality);
 
-        /*! \brief return GPU pointer to x in rvec format */
-        void* get_gpu_xrvec();
-
         /*! \brief return pointer to GPU event recorded when coordinates have been copied to device */
         void* get_x_on_device_event();
 
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index f300281802..f661a19ca1 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -229,48 +229,21 @@ CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
                                 gmx_nbnxn_gpu_t    gmx_unused *gpu_nbv) CUDA_FUNC_TERM;
 
-/*! \brief Copy coordinates from host to device memory.
- *
- * \todo This will be removed as the management of the buffers is taken out of the NBNXM module.
+/*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
  *
- * \param[in]     grid             Grid to be copied.
+ * \param[in]     grid             Grid to be converted.
+ * \param[in]     setFillerCoords  If the filler coordinates are used.
  * \param[in,out] gpu_nbv          The nonbonded data GPU structure.
+ * \param[in]     d_x              Device-side coordinates in plain rvec format.
  * \param[in]     locality         Copy coordinates for local or non-local atoms.
- * \param[in]     coordinatesHost  Host-side coordinates in plain rvec format.
- */
-CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_copy_x_to_gpu(const Nbnxm::Grid   gmx_unused &grid,
-                             gmx_nbnxn_gpu_t     gmx_unused *gpu_nbv,
-                             Nbnxm::AtomLocality gmx_unused  locality,
-                             const rvec          gmx_unused *coordinatesHost) CUDA_FUNC_TERM;
-
-/*! \brief Getter for the device coordinates buffer.
- *
- * \todo This will be removed as the management of the buffers is taken out of the NBNXM module.
- *
- * \param[in]  gpu_nbv  The nonbonded data GPU structure.
- *
- * \returns Device coordinates buffer in plain rvec format.
- */
-CUDA_FUNC_QUALIFIER
-DeviceBuffer<float> nbnxn_gpu_get_x_gpu(gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer<float> {});
-
-
-/*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
- *
- * \param[in]     grid               Grid to be converted.
- * \param[in]     setFillerCoords    If the filler coordinates are used.
- * \param[in,out] gpu_nbv            The nonbonded data GPU structure.
- * \param[in]     coordinatesDevice  Device-side coordinates in plain rvec format.
- * \param[in]     locality           Copy coordinates for local or non-local atoms.
- * \param[in]     gridId             Index of the grid being converted.
- * \param[in]     numColumnsMax      Maximum number of columns in the grid.
+ * \param[in]     gridId           Index of the grid being converted.
+ * \param[in]     numColumnsMax    Maximum number of columns in the grid.
  */
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid   gmx_unused &grid,
                            bool                gmx_unused  setFillerCoords,
                            gmx_nbnxn_gpu_t     gmx_unused *gpu_nbv,
-                           DeviceBuffer<float> gmx_unused  coordinatesDevice,
+                           DeviceBuffer<float> gmx_unused  d_x,
                            Nbnxm::AtomLocality gmx_unused  locality,
                            int                 gmx_unused  gridId,
                            int                 gmx_unused  numColumnsMax) CUDA_FUNC_TERM;
@@ -346,43 +319,6 @@ void nbnxn_gpu_add_nbat_f_to_f(AtomLocality                 gmx_unused  atomLoca
                                bool                         gmx_unused  useGpuFPmeReduction,
                                bool                         gmx_unused  accumulateForce) CUDA_FUNC_TERM;
 
-/*! \brief Getter for the device coordinates buffer.
- *
- * \todo This will be removed as the management of the buffers is taken out of the NBNXM module.
- *
- * \param[in]  gpu_nbv  The nonbonded data GPU structure.
- *
- * \returns Device coordinates buffer in plain rvec format.
- */
-CUDA_FUNC_QUALIFIER
-DeviceBuffer<float> nbnxn_gpu_get_f_gpu(gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer<float> {});
-
-/*! \brief Copy force buffer from CPU to GPU */
-CUDA_FUNC_QUALIFIER
-void nbnxn_launch_copy_f_to_gpu(AtomLocality            gmx_unused  atomLocality,
-                                const Nbnxm::GridSet    gmx_unused &gridSet,
-                                gmx_nbnxn_gpu_t         gmx_unused *nb,
-                                rvec                    gmx_unused *f) CUDA_FUNC_TERM;
-
-/*! \brief Copy force buffer from GPU to CPU */
-CUDA_FUNC_QUALIFIER
-void nbnxn_launch_copy_f_from_gpu(AtomLocality            gmx_unused  atomLocality,
-                                  const Nbnxm::GridSet    gmx_unused &gridSet,
-                                  gmx_nbnxn_gpu_t         gmx_unused *nb,
-                                  rvec                    gmx_unused *f) CUDA_FUNC_TERM;
-
-/*! \brief Asynchronous launch of copying coordinate buffer from GPU to CPU
- * \param[in]  atomLocality  Locality for data trasnfer
- * \param[in]  gridSet       The Grid Set data object
- * \param[in]  nb            The nonbonded data GPU structure
- * \param[out] x             Coordinate buffer on CPU
- */
-CUDA_FUNC_QUALIFIER
-void nbnxn_launch_copy_x_from_gpu(AtomLocality            gmx_unused  atomLocality,
-                                  const Nbnxm::GridSet    gmx_unused &gridSet,
-                                  gmx_nbnxn_gpu_t         gmx_unused *nb,
-                                  rvec                    gmx_unused *x) CUDA_FUNC_TERM;
-
 /*! \brief Wait for GPU stream to complete */
 CUDA_FUNC_QUALIFIER
 void nbnxn_wait_for_gpu_force_reduction(AtomLocality            gmx_unused  atomLocality,
@@ -400,24 +336,12 @@ void nbnxn_wait_x_on_device(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM;
 CUDA_FUNC_QUALIFIER
 void* nbnxn_get_x_on_device_event(const gmx_nbnxn_gpu_t gmx_unused    *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
 
-/*! \brief return GPU pointer to x in rvec format
- * \param[in] nb                   The nonbonded data GPU structure
- */
-CUDA_FUNC_QUALIFIER
-void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
-
 /*! \brief Wait for non-local copy of coordinate buffer from device to host
  * \param[in] nb                   The nonbonded data GPU structure
  */
 CUDA_FUNC_QUALIFIER
 void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM;
 
-/*! \brief return GPU pointer to f in rvec format
- * \param[in] nb                   The nonbonded data GPU structure
- */
-CUDA_FUNC_QUALIFIER
-void* nbnxn_get_gpu_frvec(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
-
 /*! \brief Ensure local stream waits for non-local stream
  * \param[in] nb                   The nonbonded data GPU structure
  */
-- 
2.22.0