From af60a72fe677d8fe5de06c2e5dbd8f4d129df0ba Mon Sep 17 00:00:00 2001
From: Andrey Alekseenko <al42and@gmail.com>
Date: Wed, 10 Mar 2021 09:06:41 +0000
Subject: [PATCH] Revert "Wrap more device pointers in DeviceBuffer" (!1244)

This reverts commit 850429f3ebe34d27dabed6d8c31c08968befd1f5,
which broke some MPI functionality, as revealed in post-merge tests.
---
 src/gromacs/ewald/pme.h                       |  6 +--
 src/gromacs/ewald/pme_force_sender_gpu.h      |  3 +-
 .../ewald/pme_force_sender_gpu_impl.cpp       |  2 +-
 .../ewald/pme_force_sender_gpu_impl.cu        |  5 +--
 src/gromacs/ewald/pme_force_sender_gpu_impl.h |  3 +-
 src/gromacs/ewald/pme_gather.cu               |  2 +-
 src/gromacs/ewald/pme_gpu.cpp                 |  4 +-
 src/gromacs/ewald/pme_gpu_internal.cpp        | 16 ++++----
 src/gromacs/ewald/pme_gpu_internal.h          |  4 +-
 src/gromacs/ewald/pme_gpu_types.h             |  2 +-
 src/gromacs/ewald/pme_only.cpp                | 12 ++----
 src/gromacs/ewald/pme_pp_comm_gpu.h           |  3 +-
 src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp    |  4 +-
 src/gromacs/ewald/pme_pp_comm_gpu_impl.cu     |  6 +--
 src/gromacs/ewald/pme_pp_comm_gpu_impl.h      |  9 ++---
 src/gromacs/mdlib/gpuforcereduction.h         |  8 ++--
 src/gromacs/mdlib/gpuforcereduction_impl.cpp  |  4 +-
 src/gromacs/mdlib/gpuforcereduction_impl.cu   | 39 +++++++++----------
 ...tion_impl.h => gpuforcereduction_impl.cuh} | 16 ++++----
 src/gromacs/mdlib/sim_util.cpp                |  7 ++--
 .../mdlib/update_constrain_gpu_impl.cu        |  1 -
 src/gromacs/mdlib/update_constrain_gpu_impl.h | 30 ++++++--------
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu          |  2 +-
 src/gromacs/nbnxm/nbnxm.cpp                   |  2 +-
 src/gromacs/nbnxm/nbnxm.h                     |  2 +-
 src/gromacs/nbnxm/nbnxm_gpu.h                 |  3 +-
 26 files changed, 88 insertions(+), 107 deletions(-)
 rename src/gromacs/mdlib/{gpuforcereduction_impl.h => gpuforcereduction_impl.cuh} (93%)
diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h
index d4a591b7bf..3f2d33f6c6 100644
--- a/src/gromacs/ewald/pme.h
+++ b/src/gromacs/ewald/pme.h
@@ -467,12 +467,12 @@ GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t*        GPU_FUNC_AR
  * \param[in] pme            The PME data structure.
  * \returns                  Pointer to force data
  */
-GPU_FUNC_QUALIFIER DeviceBuffer<gmx::RVec> pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
-        GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<gmx::RVec>{});
+GPU_FUNC_QUALIFIER void* pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
+        GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
 /*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion
  * \param[in] pme            The PME data structure.
- * \returns                  Pointer to synchronizer
+ * \returns                  Pointer to sychronizer
  */
 GPU_FUNC_QUALIFIER GpuEventSynchronizer* pme_gpu_get_f_ready_synchronizer(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
         GPU_FUNC_TERM_WITH_RETURN(nullptr);
diff --git a/src/gromacs/ewald/pme_force_sender_gpu.h b/src/gromacs/ewald/pme_force_sender_gpu.h
index ec88e8569b..bcc3b1e393 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu.h
+++ b/src/gromacs/ewald/pme_force_sender_gpu.h
@@ -45,7 +45,6 @@
 #include <memory>
 
 #include "gromacs/math/vectypes.h"
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/utility/gmxmpi.h"
 
 class DeviceStream;
@@ -84,7 +83,7 @@ public:
      * Initialization of GPU PME Force sender
      * \param[in] d_f   force buffer in GPU memory
      */
-    void sendForceBufferAddressToPpRanks(DeviceBuffer<RVec> d_f);
+    void sendForceBufferAddressToPpRanks(rvec* d_f);
 
     /*! \brief
      * Send PP data to PP rank
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
index 88cc5ca11c..915d0953ed 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
+++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
@@ -75,7 +75,7 @@ PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream& /*pmeStream */,
 PmeForceSenderGpu::~PmeForceSenderGpu() = default;
 
 /*!\brief init PME-PP GPU communication stub */
-void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(DeviceBuffer<gmx::RVec> /* d_f */)
+void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(rvec* /* d_f */)
 {
     GMX_ASSERT(!impl_,
                "A CPU stub for PME-PP GPU communication initialization was called instead of the "
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
index 972c1c7d57..07d37dcd7e 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
+++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
@@ -48,7 +48,6 @@
 #include "config.h"
 
 #include "gromacs/gpu_utils/cudautils.cuh"
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/utility/gmxmpi.h"
 
@@ -69,7 +68,7 @@ PmeForceSenderGpu::Impl::Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx:
 PmeForceSenderGpu::Impl::~Impl() = default;
 
 /*! \brief  sends force buffer address to PP ranks */
-void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(DeviceBuffer<RVec> d_f)
+void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(rvec* d_f)
 {
     int ind_start = 0;
     int ind_end   = 0;
@@ -116,7 +115,7 @@ PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream&    pmeStream,
 
 PmeForceSenderGpu::~PmeForceSenderGpu() = default;
 
-void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(DeviceBuffer<RVec> d_f)
+void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(rvec* d_f)
 {
     impl_->sendForceBufferAddressToPpRanks(d_f);
 }
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.h b/src/gromacs/ewald/pme_force_sender_gpu_impl.h
index f0b0f74bad..70be40cc7f 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu_impl.h
+++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.h
@@ -44,7 +44,6 @@
 #define GMX_PMEFORCESENDERGPU_IMPL_H
 
 #include "gromacs/ewald/pme_force_sender_gpu.h"
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/utility/arrayref.h"
 
@@ -69,7 +68,7 @@ public:
      * sends force buffer address to PP rank
      * \param[in] d_f   force buffer in GPU memory
      */
-    void sendForceBufferAddressToPpRanks(DeviceBuffer<RVec> d_f);
+    void sendForceBufferAddressToPpRanks(rvec* d_f);
 
     /*! \brief
      * Send PP data to PP rank
diff --git a/src/gromacs/ewald/pme_gather.cu b/src/gromacs/ewald/pme_gather.cu
index 82eeed85a1..35111b52ec 100644
--- a/src/gromacs/ewald/pme_gather.cu
+++ b/src/gromacs/ewald/pme_gather.cu
@@ -351,7 +351,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
     const float* __restrict__ gm_coefficientsB = kernelParams.atoms.d_coefficients[1];
     const float* __restrict__ gm_gridA         = kernelParams.grid.d_realGrid[0];
     const float* __restrict__ gm_gridB         = kernelParams.grid.d_realGrid[1];
-    float* __restrict__ gm_forces = reinterpret_cast<float*>(kernelParams.atoms.d_forces);
+    float* __restrict__ gm_forces              = kernelParams.atoms.d_forces;
 
     /* Global memory pointers for readGlobal */
     const float* __restrict__ gm_theta         = kernelParams.atoms.d_theta;
diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp
index 225fb1050a..e0c32e207d 100644
--- a/src/gromacs/ewald/pme_gpu.cpp
+++ b/src/gromacs/ewald/pme_gpu.cpp
@@ -440,11 +440,11 @@ void pme_gpu_reinit_computation(const gmx_pme_t* pme, gmx_wallcycle* wcycle)
     wallcycle_stop(wcycle, ewcLAUNCH_GPU);
 }
 
-DeviceBuffer<gmx::RVec> pme_gpu_get_device_f(const gmx_pme_t* pme)
+void* pme_gpu_get_device_f(const gmx_pme_t* pme)
 {
     if (!pme || !pme_gpu_active(pme))
     {
-        return DeviceBuffer<gmx::RVec>{};
+        return nullptr;
     }
     return pme_gpu_get_kernelparam_forces(pme->gpu);
 }
diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp
index c0d422d572..028a66a35a 100644
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -229,7 +229,7 @@ void pme_gpu_free_bspline_values(const PmeGpu* pmeGpu)
 
 void pme_gpu_realloc_forces(PmeGpu* pmeGpu)
 {
-    const size_t newForcesSize = pmeGpu->nAtomsAlloc;
+    const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
     GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces,
                            newForcesSize,
@@ -248,10 +248,11 @@ void pme_gpu_free_forces(const PmeGpu* pmeGpu)
 void pme_gpu_copy_input_forces(PmeGpu* pmeGpu)
 {
     GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
+    float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
     copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces,
-                       pmeGpu->staging.h_forces.data(),
+                       h_forcesFloat,
                        0,
-                       pmeGpu->kernelParams->atoms.nAtoms,
+                       DIM * pmeGpu->kernelParams->atoms.nAtoms,
                        pmeGpu->archSpecific->pmeStream_,
                        pmeGpu->settings.transferKind,
                        nullptr);
@@ -260,10 +261,11 @@ void pme_gpu_copy_input_forces(PmeGpu* pmeGpu)
 void pme_gpu_copy_output_forces(PmeGpu* pmeGpu)
 {
     GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
-    copyFromDeviceBuffer(pmeGpu->staging.h_forces.data(),
+    float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
+    copyFromDeviceBuffer(h_forcesFloat,
                          &pmeGpu->kernelParams->atoms.d_forces,
                          0,
-                         pmeGpu->kernelParams->atoms.nAtoms,
+                         DIM * pmeGpu->kernelParams->atoms.nAtoms,
                          pmeGpu->archSpecific->pmeStream_,
                          pmeGpu->settings.transferKind,
                          nullptr);
@@ -1704,7 +1706,7 @@ void pme_gpu_gather(PmeGpu* pmeGpu, real** h_grids, const float lambda)
     }
 }
 
-DeviceBuffer<gmx::RVec> pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
+void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
 {
     if (pmeGpu && pmeGpu->kernelParams)
     {
@@ -1712,7 +1714,7 @@ DeviceBuffer<gmx::RVec> pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
     }
     else
     {
-        return DeviceBuffer<gmx::RVec>{};
+        return nullptr;
     }
 }
 
diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h
index 7baa6bd347..41b912e2b6 100644
--- a/src/gromacs/ewald/pme_gpu_internal.h
+++ b/src/gromacs/ewald/pme_gpu_internal.h
@@ -405,8 +405,8 @@ GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu* GPU_FU
  * \param[in] pmeGpu         The PME GPU structure.
  * \returns                  Pointer to force data
  */
-GPU_FUNC_QUALIFIER DeviceBuffer<gmx::RVec> pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
-        GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<gmx::RVec>{});
+GPU_FUNC_QUALIFIER void* pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
+        GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
 /*! \brief Return pointer to the sync object triggered after the PME force calculation completion
  * \param[in] pmeGpu         The PME GPU structure.
diff --git a/src/gromacs/ewald/pme_gpu_types.h b/src/gromacs/ewald/pme_gpu_types.h
index e2c067390a..abf7a17ed0 100644
--- a/src/gromacs/ewald/pme_gpu_types.h
+++ b/src/gromacs/ewald/pme_gpu_types.h
@@ -171,7 +171,7 @@ struct PmeGpuAtomParams
      * The forces change and need to be copied from (and possibly to) the GPU for every PME
      * computation, but reallocation happens only at DD.
      */
-    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<gmx::RVec>) d_forces;
+    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_forces;
     /*! \brief Global GPU memory array handle with ivec atom gridline indices.
      * Computed on GPU in the spline calculation part.
      */
diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp
index 5a01c898c8..c895975144 100644
--- a/src/gromacs/ewald/pme_only.cpp
+++ b/src/gromacs/ewald/pme_only.cpp
@@ -445,7 +445,8 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
                     // This rank will have its data accessed directly by PP rank, so needs to send the remote addresses.
                     pme_pp->pmeCoordinateReceiverGpu->sendCoordinateBufferAddressToPpRanks(
                             stateGpu->getCoordinates());
-                    pme_pp->pmeForceSenderGpu->sendForceBufferAddressToPpRanks(pme_gpu_get_device_f(pme));
+                    pme_pp->pmeForceSenderGpu->sendForceBufferAddressToPpRanks(
+                            reinterpret_cast<rvec*>(pme_gpu_get_device_f(pme)));
                 }
             }
 
@@ -582,13 +583,8 @@ static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
         if (pme_pp->useGpuDirectComm)
         {
             // Data will be transferred directly from GPU.
-            DeviceBuffer<gmx::RVec> gmx_unused d_f = pme_gpu_get_device_f(&pme);
-#    if GMX_GPU_CUDA
-            // OpenCL does not allow host-side pointer arithmetic on buffers. Neither does SYCL.
-            sendbuf = reinterpret_cast<void*>(&d_f[ind_start]);
-#    else
-            GMX_RELEASE_ASSERT(false, "Can only use GPU Direct Communications with CUDA");
-#    endif
+            rvec* d_f = reinterpret_cast<rvec*>(pme_gpu_get_device_f(&pme));
+            sendbuf   = reinterpret_cast<void*>(&d_f[ind_start]);
         }
         sendFToPP(sendbuf, receiver, pme_pp, &messages);
     }
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu.h b/src/gromacs/ewald/pme_pp_comm_gpu.h
index 886e0c221b..3e56da9af3 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu.h
+++ b/src/gromacs/ewald/pme_pp_comm_gpu.h
@@ -44,7 +44,6 @@
 
 #include <memory>
 
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/utility/gmxmpi.h"
 
 class DeviceContext;
@@ -100,7 +99,7 @@ public:
     /*! \brief
      * Return pointer to buffer used for staging PME force on GPU
      */
-    DeviceBuffer<gmx::RVec> getGpuForceStagingPtr();
+    void* getGpuForceStagingPtr();
 
     /*! \brief
      * Return pointer to event recorded when forces are ready
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
index d31b976c10..c5f92aa53f 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
@@ -102,12 +102,12 @@ void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void* /* sendPtr */,
                "implementation.");
 }
 
-DeviceBuffer<gmx::RVec> PmePpCommGpu::getGpuForceStagingPtr()
+void* PmePpCommGpu::getGpuForceStagingPtr()
 {
     GMX_ASSERT(!impl_,
                "A CPU stub for PME-PP GPU communication was called instead of the correct "
                "implementation.");
-    return DeviceBuffer<gmx::RVec>{};
+    return nullptr;
 }
 
 GpuEventSynchronizer* PmePpCommGpu::getForcesReadySynchronizer()
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
index 0d66883979..0ecf028133 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
@@ -155,9 +155,9 @@ void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr,
     GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
 #endif
 }
-DeviceBuffer<gmx::RVec> PmePpCommGpu::Impl::getGpuForceStagingPtr()
+void* PmePpCommGpu::Impl::getGpuForceStagingPtr()
 {
-    return d_pmeForces_;
+    return static_cast<void*>(d_pmeForces_);
 }
 
 GpuEventSynchronizer* PmePpCommGpu::Impl::getForcesReadySynchronizer()
@@ -194,7 +194,7 @@ void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void*                 sendPtr,
             sendPtr, sendSize, sendPmeCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
 }
 
-DeviceBuffer<gmx::RVec> PmePpCommGpu::getGpuForceStagingPtr()
+void* PmePpCommGpu::getGpuForceStagingPtr()
 {
     return impl_->getGpuForceStagingPtr();
 }
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
index 58e22c2e46..042891a04d 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
@@ -44,7 +44,6 @@
 #define GMX_PME_PP_COMM_GPU_IMPL_H
 
 #include "gromacs/ewald/pme_pp_comm_gpu.h"
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/math/vectypes.h"
 #include "gromacs/utility/gmxmpi.h"
@@ -111,7 +110,7 @@ public:
     /*! \brief
      * Return pointer to buffer used for staging PME force on GPU
      */
-    DeviceBuffer<gmx::RVec> getGpuForceStagingPtr();
+    void* getGpuForceStagingPtr();
 
     /*! \brief
      * Return pointer to event recorded when forces are ready
@@ -124,15 +123,15 @@ private:
     //! Handle for CUDA stream used for the communication operations in this class
     const DeviceStream& pmePpCommStream_;
     //! Remote location of PME coordinate data buffer
-    DeviceBuffer<gmx::RVec> remotePmeXBuffer_ = nullptr;
+    void* remotePmeXBuffer_ = nullptr;
     //! Remote location of PME force data buffer
-    DeviceBuffer<gmx::RVec> remotePmeFBuffer_ = nullptr;
+    void* remotePmeFBuffer_ = nullptr;
     //! communicator for simulation
     MPI_Comm comm_;
     //! Rank of PME task
     int pmeRank_ = -1;
     //! Buffer for staging PME force on GPU
-    DeviceBuffer<gmx::RVec> d_pmeForces_ = nullptr;
+    rvec* d_pmeForces_ = nullptr;
     //! number of atoms in PME force staging array
     int d_pmeForcesSize_ = -1;
     //! number of atoms allocated in recvbuf array
diff --git a/src/gromacs/mdlib/gpuforcereduction.h b/src/gromacs/mdlib/gpuforcereduction.h
index 82e8db33f5..157c4c7eca 100644
--- a/src/gromacs/mdlib/gpuforcereduction.h
+++ b/src/gromacs/mdlib/gpuforcereduction.h
@@ -86,13 +86,13 @@ public:
      *
      * \param [in] forcePtr  Pointer to force to be reduced
      */
-    void registerNbnxmForce(DeviceBuffer<RVec> forcePtr);
+    void registerNbnxmForce(void* forcePtr);
 
     /*! \brief Register a rvec-format force to be reduced
      *
      * \param [in] forcePtr  Pointer to force to be reduced
      */
-    void registerRvecForce(DeviceBuffer<RVec> forcePtr);
+    void registerRvecForce(void* forcePtr);
 
     /*! \brief Add a dependency for this force reduction
      *
@@ -102,14 +102,14 @@ public:
 
     /*! \brief Reinitialize the GPU force reduction
      *
-     * \param [in] baseForce        Pointer to force to be used as a base
+     * \param [in] baseForcePtr     Pointer to force to be used as a base
      * \param [in] numAtoms         The number of atoms
      * \param [in] cell             Pointer to the cell array
      * \param [in] atomStart        The start atom for the reduction
      * \param [in] accumulate       Whether reduction should be accumulated
      * \param [in] completionMarker Event to be marked when launch of reduction is complete
      */
-    void reinit(DeviceBuffer<RVec>    baseForce,
+    void reinit(DeviceBuffer<RVec>    baseForcePtr,
                 int                   numAtoms,
                 ArrayRef<const int>   cell,
                 int                   atomStart,
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl.cpp
index b69759b848..1e0a30b2b6 100644
--- a/src/gromacs/mdlib/gpuforcereduction_impl.cpp
+++ b/src/gromacs/mdlib/gpuforcereduction_impl.cpp
@@ -76,13 +76,13 @@ void GpuForceReduction::reinit(DeviceBuffer<RVec> /*baseForcePtr*/,
 }
 
 // NOLINTNEXTLINE readability-convert-member-functions-to-static
-void GpuForceReduction::registerNbnxmForce(DeviceBuffer<RVec> /* forcePtr */)
+void GpuForceReduction::registerNbnxmForce(void* /* forcePtr */)
 {
     GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
 }
 
 // NOLINTNEXTLINE readability-convert-member-functions-to-static
-void GpuForceReduction::registerRvecForce(DeviceBuffer<RVec> /* forcePtr */)
+void GpuForceReduction::registerRvecForce(void* /* forcePtr */)
 {
     GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
 }
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cu b/src/gromacs/mdlib/gpuforcereduction_impl.cu
index ac89b47d24..6e1e7e920a 100644
--- a/src/gromacs/mdlib/gpuforcereduction_impl.cu
+++ b/src/gromacs/mdlib/gpuforcereduction_impl.cu
@@ -43,7 +43,7 @@
 
 #include "gmxpre.h"
 
-#include "gpuforcereduction_impl.h"
+#include "gpuforcereduction_impl.cuh"
 
 #include <stdio.h>
 
@@ -112,15 +112,15 @@ GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext,
     deviceStream_(deviceStream),
     wcycle_(wcycle){};
 
-void GpuForceReduction::Impl::reinit(DeviceBuffer<gmx::RVec> baseForce,
-                                     const int               numAtoms,
-                                     ArrayRef<const int>     cell,
-                                     const int               atomStart,
-                                     const bool              accumulate,
-                                     GpuEventSynchronizer*   completionMarker)
+void GpuForceReduction::Impl::reinit(float3*               baseForcePtr,
+                                     const int             numAtoms,
+                                     ArrayRef<const int>   cell,
+                                     const int             atomStart,
+                                     const bool            accumulate,
+                                     GpuEventSynchronizer* completionMarker)
 {
-    GMX_ASSERT((baseForce != nullptr), "Input base force for reduction has no data");
-    baseForce_        = baseForce;
+    GMX_ASSERT((baseForcePtr != nullptr), "Input base force for reduction has no data");
+    baseForce_        = &(baseForcePtr[atomStart]);
     numAtoms_         = numAtoms;
     atomStart_        = atomStart;
     accumulate_       = accumulate;
@@ -144,13 +144,13 @@ void GpuForceReduction::Impl::reinit(DeviceBuffer<gmx::RVec> baseForce,
 
 void GpuForceReduction::Impl::registerNbnxmForce(DeviceBuffer<RVec> forcePtr)
 {
-    GMX_ASSERT((forcePtr), "Input force for reduction has no data");
+    GMX_ASSERT((forcePtr != nullptr), "Input force for reduction has no data");
     nbnxmForceToAdd_ = forcePtr;
 };
 
 void GpuForceReduction::Impl::registerRvecForce(DeviceBuffer<RVec> forcePtr)
 {
-    GMX_ASSERT((forcePtr), "Input force for reduction has no data");
+    GMX_ASSERT((forcePtr != nullptr), "Input force for reduction has no data");
     rvecForceToAdd_ = forcePtr;
 };
 
@@ -172,12 +172,11 @@ void GpuForceReduction::Impl::execute()
     GMX_ASSERT((nbnxmForceToAdd_ != nullptr), "Nbnxm force for reduction has no data");
 
     // Enqueue wait on all dependencies passed
-    for (const auto& synchronizer : dependencyList_)
+    for (auto const synchronizer : dependencyList_)
     {
         synchronizer->enqueueWaitEvent(deviceStream_);
     }
 
-    float3* d_baseForce      = &(asFloat3(baseForce_)[atomStart_]);
     float3* d_nbnxmForce     = asFloat3(nbnxmForceToAdd_);
     float3* d_rvecForceToAdd = &(asFloat3(rvecForceToAdd_)[atomStart_]);
 
@@ -196,7 +195,7 @@ void GpuForceReduction::Impl::execute()
                             : (accumulate_ ? reduceKernel<false, true> : reduceKernel<false, false>);
 
     const auto kernelArgs = prepareGpuKernelArguments(
-            kernelFn, config, &d_nbnxmForce, &d_rvecForceToAdd, &d_baseForce, &cellInfo_.d_cell, &numAtoms_);
+            kernelFn, config, &d_nbnxmForce, &d_rvecForceToAdd, &baseForce_, &cellInfo_.d_cell, &numAtoms_);
 
     launchGpuKernel(kernelFn, config, deviceStream_, nullptr, "Force Reduction", kernelArgs);
 
@@ -219,14 +218,14 @@ GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext,
 {
 }
 
-void GpuForceReduction::registerNbnxmForce(DeviceBuffer<RVec> forcePtr)
+void GpuForceReduction::registerNbnxmForce(void* forcePtr)
 {
-    impl_->registerNbnxmForce(std::move(forcePtr));
+    impl_->registerNbnxmForce(reinterpret_cast<DeviceBuffer<RVec>>(forcePtr));
 }
 
-void GpuForceReduction::registerRvecForce(DeviceBuffer<RVec> forcePtr)
+void GpuForceReduction::registerRvecForce(void* forcePtr)
 {
-    impl_->registerRvecForce(std::move(forcePtr));
+    impl_->registerRvecForce(reinterpret_cast<DeviceBuffer<RVec>>(forcePtr));
 }
 
 void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency)
@@ -234,14 +233,14 @@ void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency)
     impl_->addDependency(dependency);
 }
 
-void GpuForceReduction::reinit(DeviceBuffer<RVec>    baseForce,
+void GpuForceReduction::reinit(DeviceBuffer<RVec>    baseForcePtr,
                                const int             numAtoms,
                                ArrayRef<const int>   cell,
                                const int             atomStart,
                                const bool            accumulate,
                                GpuEventSynchronizer* completionMarker)
 {
-    impl_->reinit(baseForce, numAtoms, cell, atomStart, accumulate, completionMarker);
+    impl_->reinit(asFloat3(baseForcePtr), numAtoms, cell, atomStart, accumulate, completionMarker);
 }
 void GpuForceReduction::execute()
 {
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.h b/src/gromacs/mdlib/gpuforcereduction_impl.cuh
similarity index 93%
rename from src/gromacs/mdlib/gpuforcereduction_impl.h
rename to src/gromacs/mdlib/gpuforcereduction_impl.cuh
index 491fd95f21..bd222e40a6 100644
--- a/src/gromacs/mdlib/gpuforcereduction_impl.h
+++ b/src/gromacs/mdlib/gpuforcereduction_impl.cuh
@@ -58,7 +58,7 @@ struct cellInfo
     //! cell index mapping for any nbat-format forces
     const int* cell = nullptr;
     //! device copy of cell index mapping for any nbat-format forces
-    DeviceBuffer<int> d_cell;
+    int* d_cell = nullptr;
     //! number of atoms in cell array
     int cellSize = -1;
     //! number of atoms allocated in cell array
@@ -75,7 +75,7 @@ public:
      * \param [in] deviceContext GPU device context
      * \param [in] wcycle        The wallclock counter
      */
-    Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream, gmx_wallcycle* wcycle);
+    Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStreami, gmx_wallcycle* wcycle);
     ~Impl();
 
     /*! \brief Register a nbnxm-format force to be reduced
@@ -98,14 +98,14 @@ public:
 
     /*! \brief Reinitialize the GPU force reduction
      *
-     * \param [in] baseForce        Pointer to force to be used as a base
+     * \param [in] baseForcePtr     Pointer to force to be used as a base
      * \param [in] numAtoms         The number of atoms
      * \param [in] cell             Pointer to the cell array
      * \param [in] atomStart        The start atom for the reduction
      * \param [in] accumulate       Whether reduction should be accumulated
      * \param [in] completionMarker Event to be marked when launch of reduction is complete
      */
-    void reinit(DeviceBuffer<RVec>    baseForce,
+    void reinit(float3*               baseForcePtr,
                 const int             numAtoms,
                 ArrayRef<const int>   cell,
                 const int             atomStart,
@@ -117,13 +117,13 @@ public:
 
 private:
     //! force to be used as a base for this reduction
-    DeviceBuffer<RVec> baseForce_;
+    float3* baseForce_ = nullptr;
     //! starting atom
     int atomStart_ = 0;
     //! number of atoms
     int numAtoms_ = 0;
     //! whether reduction is accumulated into base force buffer
-    bool accumulate_ = true;
+    int accumulate_ = true;
     //! cell information for any nbat-format forces
     struct cellInfo cellInfo_;
     //! GPU context object
@@ -133,9 +133,9 @@ private:
     //! stream to be used for this reduction
     const DeviceStream& deviceStream_;
     //! Nbnxm force to be added in this reduction
-    DeviceBuffer<RVec> nbnxmForceToAdd_;
+    DeviceBuffer<RVec> nbnxmForceToAdd_ = nullptr;
     //! Rvec-format force to be added in this reduction
-    DeviceBuffer<RVec> rvecForceToAdd_;
+    DeviceBuffer<RVec> rvecForceToAdd_ = nullptr;
     //! event to be marked when redcution launch has been completed
     GpuEventSynchronizer* completionMarker_ = nullptr;
     //! The wallclock counter
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index f309e82c26..0bcf0c5451 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1115,10 +1115,9 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
     if (runScheduleWork->simulationWork.useGpuPme
         && (thisRankHasDuty(cr, DUTY_PME) || runScheduleWork->simulationWork.useGpuPmePpCommunication))
     {
-        DeviceBuffer<gmx::RVec> forcePtr =
-                thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata)
-                                              :                    // PME force buffer on same GPU
-                        fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU
+        void* forcePtr = thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata)
+                                                       : // PME force buffer on same GPU
+                                 fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU
         fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr);
 
         GpuEventSynchronizer* const pmeSynchronizer =
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
index 831bbfb3c3..f03ab778c3 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
@@ -60,7 +60,6 @@
 #include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/devicebuffer.h"
-#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"
 #include "gromacs/mdlib/leapfrog_gpu.h"
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h
index 34712d4941..8e101b8cd0 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.h
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h
@@ -46,22 +46,14 @@
 #ifndef GMX_MDLIB_UPDATE_CONSTRAIN_GPU_IMPL_H
 #define GMX_MDLIB_UPDATE_CONSTRAIN_GPU_IMPL_H
 
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gmxpre.h"
+
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/mdlib/leapfrog_gpu.h"
+#include "gromacs/mdlib/lincs_gpu.cuh"
+#include "gromacs/mdlib/settle_gpu.cuh"
 #include "gromacs/mdlib/update_constrain_gpu.h"
 #include "gromacs/mdtypes/inputrec.h"
-#include "gromacs/pbcutil/pbc_aiuc.h"
-
-#if GMX_GPU_CUDA
-#    include "gromacs/gpu_utils/gputraits.cuh"
-#endif
-
-class GpuEventSynchronizer;
-namespace gmx
-{
-class LincsGpu;
-class SettleGpu;
-class LeapFrogGpu;
-} // namespace gmx
 
 namespace gmx
 {
@@ -201,14 +193,14 @@ private:
     int numAtoms_;
 
     //! Local copy of the pointer to the device positions buffer
-    DeviceBuffer<float3> d_x_;
+    float3* d_x_;
     //! Local copy of the pointer to the device velocities buffer
-    DeviceBuffer<float3> d_v_;
+    float3* d_v_;
     //! Local copy of the pointer to the device forces buffer
-    DeviceBuffer<float3> d_f_;
+    float3* d_f_;
 
     //! Device buffer for intermediate positions (maintained internally)
-    DeviceBuffer<float3> d_xp_;
+    float3* d_xp_;
     //! Number of elements in shifted coordinates buffer
     int numXp_ = -1;
     //! Allocation size for the shifted coordinates buffer
@@ -216,7 +208,7 @@ private:
 
 
     //! 1/mass for all atoms (GPU)
-    DeviceBuffer<real> d_inverseMasses_;
+    real* d_inverseMasses_;
     //! Number of elements in reciprocal masses buffer
     int numInverseMasses_ = -1;
     //! Allocation size for the reciprocal masses buffer
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index e30f8e8c87..6ed6c1ff78 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -907,7 +907,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
     nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
 }
 
-DeviceBuffer<gmx::RVec> getGpuForces(NbnxmGpu* nb)
+void* getGpuForces(NbnxmGpu* nb)
 {
     return nb->atdat->f;
 }
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index c96291a1c2..96714bb154 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -215,7 +215,7 @@ int nonbonded_verlet_t::getNumAtoms(const gmx::AtomLocality locality) const
     return numAtoms;
 }
 
-DeviceBuffer<gmx::RVec> nonbonded_verlet_t::getGpuForces() const
+void* nonbonded_verlet_t::getGpuForces() const
 {
     return Nbnxm::getGpuForces(gpu_nbv);
 }
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 9732b01a39..7ea59e4e5a 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -398,7 +398,7 @@ public:
      *
      * \returns A pointer to the force buffer in GPU memory
      */
-    DeviceBuffer<gmx::RVec> getGpuForces() const;
+    void* getGpuForces() const;
 
     //! Return the kernel setup
     const Nbnxm::KernelSetup& kernelSetup() const { return kernelSetup_; }
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index c3fdb36a19..fe83696934 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -315,8 +315,7 @@ void nbnxn_wait_x_on_device(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM;
  * \returns       A pointer to the force buffer in GPU memory
  */
 CUDA_FUNC_QUALIFIER
-DeviceBuffer<gmx::RVec> getGpuForces(NbnxmGpu gmx_unused* nb)
-        CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer<gmx::RVec>{});
+void* getGpuForces(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
 
 } // namespace Nbnxm
 #endif
-- 
2.22.0