From 03a138de53c34f55fca2c3f998f0a85d520f3b0a Mon Sep 17 00:00:00 2001
From: Andrey Alekseenko <al42and@gmail.com>
Date: Thu, 11 Mar 2021 13:48:06 +0300
Subject: [PATCH] Use DeviceBuffer in GPU update and NBNXM code
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

... instead of raw device pointers and DeviceBuffer<float3>.

We try to use DeviceBuffer<Float3>, but in some places we have to use
DeviceVector<gmx::RVec>, until we can define FloatN types without
including any backend-specific headers. Currently, Float3 is defined as
gmx::RVec, so this should not cause any issues.

Also added some helper functions to convert RVec Ì<-> Float3 <-> float3.

Preparation for #3932 and #3941.
---
 src/gromacs/gpu_utils/gputraits.h             | 52 ++++++++++++++++++-
 src/gromacs/gpu_utils/typecasts.cuh           | 23 +++++++-
 src/gromacs/mdlib/gpuforcereduction.h         |  2 +-
 src/gromacs/mdlib/gpuforcereduction_impl.cpp  |  2 +-
 src/gromacs/mdlib/gpuforcereduction_impl.cu   |  4 +-
 src/gromacs/mdlib/gpuforcereduction_impl.cuh  |  5 +-
 src/gromacs/mdlib/leapfrog_gpu.cu             | 27 +++++-----
 src/gromacs/mdlib/leapfrog_gpu.h              | 16 +++---
 src/gromacs/mdlib/leapfrog_gpu_sycl.cpp       | 34 ++++++------
 src/gromacs/mdlib/lincs_gpu.cu                | 28 ++++++----
 src/gromacs/mdlib/lincs_gpu.cuh               | 17 +++---
 src/gromacs/mdlib/settle_gpu.cu               | 32 ++++++++----
 src/gromacs/mdlib/settle_gpu.cuh              | 21 ++++----
 src/gromacs/mdlib/tests/constrtestrunners.cu  | 29 ++++++-----
 .../mdlib/tests/leapfrogtestrunners_gpu.cpp   | 13 +++--
 src/gromacs/mdlib/tests/settletestrunners.cu  | 22 ++++----
 .../mdlib/update_constrain_gpu_impl.cu        | 18 +++----
 src/gromacs/mdlib/update_constrain_gpu_impl.h | 16 +++---
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu          |  2 +-
 src/gromacs/nbnxm/nbnxm.cpp                   |  2 +-
 src/gromacs/nbnxm/nbnxm.h                     |  2 +-
 src/gromacs/nbnxm/nbnxm_gpu.h                 |  3 +-
 22 files changed, 230 insertions(+), 140 deletions(-)
diff --git a/src/gromacs/gpu_utils/gputraits.h b/src/gromacs/gpu_utils/gputraits.h
index 38c5edf8a9..344b0427c1 100644
--- a/src/gromacs/gpu_utils/gputraits.h
+++ b/src/gromacs/gpu_utils/gputraits.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -66,6 +66,56 @@ using DeviceTexture = void*;
 //! \brief Single GPU call timing event
 using CommandEvent = void*;
 
+// Stubs for CPU-only build. Might be changed in #3312.
+struct Float2
+{
+};
+struct Float3
+{
+};
+struct Float4
+{
+};
+
 #endif // GMX_GPU
 
+namespace gmx
+{
+template<typename T>
+static inline Float3* asGenericFloat3Pointer(T* in)
+{
+    static_assert(sizeof(T) == sizeof(Float3),
+                  "Size of the host-side data-type is different from the size of the generic "
+                  "device-side counterpart.");
+    return reinterpret_cast<Float3*>(in);
+}
+
+template<typename T>
+static inline const Float3* asGenericFloat3Pointer(const T* in)
+{
+    static_assert(sizeof(T) == sizeof(Float3),
+                  "Size of the host-side data-type is different from the size of the generic "
+                  "device-side counterpart.");
+    return reinterpret_cast<const Float3*>(in);
+}
+
+template<typename C>
+static inline Float3* asGenericFloat3Pointer(C& in)
+{
+    static_assert(sizeof(*in.data()) == sizeof(Float3),
+                  "Size of the host-side data-type is different from the size of the device-side "
+                  "counterpart.");
+    return reinterpret_cast<Float3*>(in.data());
+}
+
+template<typename C>
+static inline const Float3* asGenericFloat3Pointer(const C& in)
+{
+    static_assert(sizeof(*in.data()) == sizeof(Float3),
+                  "Size of the host-side data-type is different from the size of the device-side "
+                  "counterpart.");
+    return reinterpret_cast<const Float3*>(in.data());
+}
+} // namespace gmx
+
 #endif // GMX_GPU_UTILS_GPUTRAITS_H
diff --git a/src/gromacs/gpu_utils/typecasts.cuh b/src/gromacs/gpu_utils/typecasts.cuh
index 1dd63b7193..d98c587703 100644
--- a/src/gromacs/gpu_utils/typecasts.cuh
+++ b/src/gromacs/gpu_utils/typecasts.cuh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2020, by the GROMACS development team, led by
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -60,4 +60,25 @@ static inline __host__ __device__ float3* asFloat3(gmx::RVec* in)
     return reinterpret_cast<float3*>(in);
 }
 
+/*! \brief Cast pointer RVec buffer to a pointer to float3 buffer.
+ *
+ * \param[in] in The Pointer to RVec buffer to cast.
+ *
+ * \returns Buffer pointer, casted to float3*.
+ */
+static inline __host__ __device__ float3** asFloat3Pointer(gmx::RVec** in)
+{
+    static_assert(sizeof((*in)[0]) == sizeof(float3),
+                  "Size of the host-side data-type is different from the size of the device-side "
+                  "counterpart.");
+    return reinterpret_cast<float3**>(in);
+}
+static inline __host__ __device__ const float3* const* asFloat3Pointer(const gmx::RVec* const* in)
+{
+    static_assert(sizeof((*in)[0]) == sizeof(float3),
+                  "Size of the host-side data-type is different from the size of the device-side "
+                  "counterpart.");
+    return reinterpret_cast<const float3* const*>(in);
+}
+
 #endif // GMX_GPU_UTILS_TYPECASTS_CUH
diff --git a/src/gromacs/mdlib/gpuforcereduction.h b/src/gromacs/mdlib/gpuforcereduction.h
index 157c4c7eca..b23df660ed 100644
--- a/src/gromacs/mdlib/gpuforcereduction.h
+++ b/src/gromacs/mdlib/gpuforcereduction.h
@@ -86,7 +86,7 @@ public:
      *
      * \param [in] forcePtr  Pointer to force to be reduced
      */
-    void registerNbnxmForce(void* forcePtr);
+    void registerNbnxmForce(DeviceBuffer<RVec> forcePtr);
 
     /*! \brief Register a rvec-format force to be reduced
      *
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl.cpp
index 1e0a30b2b6..b431fbad49 100644
--- a/src/gromacs/mdlib/gpuforcereduction_impl.cpp
+++ b/src/gromacs/mdlib/gpuforcereduction_impl.cpp
@@ -76,7 +76,7 @@ void GpuForceReduction::reinit(DeviceBuffer<RVec> /*baseForcePtr*/,
 }
 
 // NOLINTNEXTLINE readability-convert-member-functions-to-static
-void GpuForceReduction::registerNbnxmForce(void* /* forcePtr */)
+void GpuForceReduction::registerNbnxmForce(DeviceBuffer<RVec> /* forcePtr */)
 {
     GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
 }
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cu b/src/gromacs/mdlib/gpuforcereduction_impl.cu
index 6e1e7e920a..dab7d4da0c 100644
--- a/src/gromacs/mdlib/gpuforcereduction_impl.cu
+++ b/src/gromacs/mdlib/gpuforcereduction_impl.cu
@@ -218,9 +218,9 @@ GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext,
 {
 }
 
-void GpuForceReduction::registerNbnxmForce(void* forcePtr)
+void GpuForceReduction::registerNbnxmForce(DeviceBuffer<Float3> forcePtr)
 {
-    impl_->registerNbnxmForce(reinterpret_cast<DeviceBuffer<RVec>>(forcePtr));
+    impl_->registerNbnxmForce(forcePtr);
 }
 
 void GpuForceReduction::registerRvecForce(void* forcePtr)
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cuh b/src/gromacs/mdlib/gpuforcereduction_impl.cuh
index bd222e40a6..c7d9493c82 100644
--- a/src/gromacs/mdlib/gpuforcereduction_impl.cuh
+++ b/src/gromacs/mdlib/gpuforcereduction_impl.cuh
@@ -45,6 +45,7 @@
 
 #include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gputraits.h"
 #include "gromacs/math/vectypes.h"
 
 #include "gpuforcereduction.h"
@@ -82,13 +83,13 @@ public:
      *
      * \param [in] forcePtr  Pointer to force to be reduced
      */
-    void registerNbnxmForce(DeviceBuffer<RVec> forcePtr);
+    void registerNbnxmForce(DeviceBuffer<Float3> forcePtr);
 
     /*! \brief Register a rvec-format force to be reduced
      *
      * \param [in] forcePtr  Pointer to force to be reduced
      */
-    void registerRvecForce(DeviceBuffer<RVec> forcePtr);
+    void registerRvecForce(DeviceBuffer<Float3> forcePtr);
 
     /*! \brief Add a dependency for this force reduction
      *
diff --git a/src/gromacs/mdlib/leapfrog_gpu.cu b/src/gromacs/mdlib/leapfrog_gpu.cu
index 2f5b589870..75b1026cfc 100644
--- a/src/gromacs/mdlib/leapfrog_gpu.cu
+++ b/src/gromacs/mdlib/leapfrog_gpu.cu
@@ -57,6 +57,7 @@
 
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/typecasts.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"
 #include "gromacs/math/vec.h"
 #include "gromacs/mdtypes/group.h"
@@ -237,10 +238,10 @@ inline auto selectLeapFrogKernelPtr(bool                doTemperatureScaling,
     return kernelPtr;
 }
 
-void LeapFrogGpu::integrate(const DeviceBuffer<float3>        d_x,
-                            DeviceBuffer<float3>              d_xp,
-                            DeviceBuffer<float3>              d_v,
-                            const DeviceBuffer<float3>        d_f,
+void LeapFrogGpu::integrate(DeviceBuffer<Float3>              d_x,
+                            DeviceBuffer<Float3>              d_xp,
+                            DeviceBuffer<Float3>              d_v,
+                            const DeviceBuffer<Float3>        d_f,
                             const real                        dt,
                             const bool                        doTemperatureScaling,
                             gmx::ArrayRef<const t_grp_tcstat> tcstat,
@@ -283,20 +284,22 @@ void LeapFrogGpu::integrate(const DeviceBuffer<float3>        d_x,
                        "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
                        "in GPU version of Leap-Frog integrator.");
             prVelocityScalingMatrixDiagonal_ =
-                    make_float3(dtPressureCouple * prVelocityScalingMatrix[XX][XX],
-                                dtPressureCouple * prVelocityScalingMatrix[YY][YY],
-                                dtPressureCouple * prVelocityScalingMatrix[ZZ][ZZ]);
+                    Float3{ dtPressureCouple * prVelocityScalingMatrix[XX][XX],
+                            dtPressureCouple * prVelocityScalingMatrix[YY][YY],
+                            dtPressureCouple * prVelocityScalingMatrix[ZZ][ZZ] };
         }
         kernelPtr = selectLeapFrogKernelPtr(doTemperatureScaling, numTempScaleValues_, prVelocityScalingType);
     }
 
+    // Checking the buffer types against the kernel argument types
+    static_assert(sizeof(*d_inverseMasses_) == sizeof(float));
     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr,
                                                       kernelLaunchConfig_,
                                                       &numAtoms_,
-                                                      &d_x,
-                                                      &d_xp,
-                                                      &d_v,
-                                                      &d_f,
+                                                      asFloat3Pointer(&d_x),
+                                                      asFloat3Pointer(&d_xp),
+                                                      asFloat3Pointer(&d_v),
+                                                      asFloat3Pointer(&d_f),
                                                       &d_inverseMasses_,
                                                       &dt,
                                                       &d_lambdas_,
@@ -345,7 +348,7 @@ void LeapFrogGpu::set(const int numAtoms, const real* inverseMasses, const unsig
     reallocateDeviceBuffer(
             &d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, deviceContext_);
     copyToDeviceBuffer(
-            &d_inverseMasses_, (float*)inverseMasses, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
+            &d_inverseMasses_, inverseMasses, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
 
     // Temperature scale group map only used if there are more then one group
     if (numTempScaleValues_ > 1)
diff --git a/src/gromacs/mdlib/leapfrog_gpu.h b/src/gromacs/mdlib/leapfrog_gpu.h
index d7c77ff756..738437aa0d 100644
--- a/src/gromacs/mdlib/leapfrog_gpu.h
+++ b/src/gromacs/mdlib/leapfrog_gpu.h
@@ -47,20 +47,16 @@
 #include "config.h"
 
 #if GMX_GPU_CUDA
-#    include "gromacs/gpu_utils/devicebuffer.cuh"
 #    include "gromacs/gpu_utils/gputraits.cuh"
 #endif
 #if GMX_GPU_SYCL
-#    include "gromacs/gpu_utils/devicebuffer_sycl.h"
 #    include "gromacs/gpu_utils/gputraits_sycl.h"
-using float3 = Float3;
 #endif
 
 #include <memory>
 
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/hostallocator.h"
-#include "gromacs/pbcutil/pbc.h"
-#include "gromacs/pbcutil/pbc_aiuc.h"
 #include "gromacs/utility/arrayref.h"
 
 class DeviceContext;
@@ -125,10 +121,10 @@ public:
      * \param[in]     dtPressureCouple         Period between pressure coupling steps
      * \param[in]     prVelocityScalingMatrix  Parrinello-Rahman velocity scaling matrix
      */
-    void integrate(const DeviceBuffer<float3>        d_x,
-                   DeviceBuffer<float3>              d_xp,
-                   DeviceBuffer<float3>              d_v,
-                   const DeviceBuffer<float3>        d_f,
+    void integrate(DeviceBuffer<Float3>              d_x,
+                   DeviceBuffer<Float3>              d_xp,
+                   DeviceBuffer<Float3>              d_v,
+                   const DeviceBuffer<Float3>        d_f,
                    const real                        dt,
                    const bool                        doTemperatureScaling,
                    gmx::ArrayRef<const t_grp_tcstat> tcstat,
@@ -192,7 +188,7 @@ private:
     int numTempScaleGroupsAlloc_ = -1;
 
     //! Vector with diagonal elements of the Parrinello-Rahman pressure coupling velocity rescale factors
-    float3 prVelocityScalingMatrixDiagonal_;
+    Float3 prVelocityScalingMatrixDiagonal_;
 };
 
 } // namespace gmx
diff --git a/src/gromacs/mdlib/leapfrog_gpu_sycl.cpp b/src/gromacs/mdlib/leapfrog_gpu_sycl.cpp
index 9afb0320fb..b0e2583a3f 100644
--- a/src/gromacs/mdlib/leapfrog_gpu_sycl.cpp
+++ b/src/gromacs/mdlib/leapfrog_gpu_sycl.cpp
@@ -84,15 +84,15 @@ using cl::sycl::access::mode;
 template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
 auto leapFrogKernel(
         cl::sycl::handler&                          cgh,
-        DeviceAccessor<float3, mode::read_write>    a_x,
-        DeviceAccessor<float3, mode::discard_write> a_xp,
-        DeviceAccessor<float3, mode::read_write>    a_v,
-        DeviceAccessor<float3, mode::read>          a_f,
+        DeviceAccessor<Float3, mode::read_write>    a_x,
+        DeviceAccessor<Float3, mode::discard_write> a_xp,
+        DeviceAccessor<Float3, mode::read_write>    a_v,
+        DeviceAccessor<Float3, mode::read>          a_f,
         DeviceAccessor<float, mode::read>           a_inverseMasses,
         float                                       dt,
         OptionalAccessor<float, mode::read, numTempScaleValues != NumTempScaleValues::None> a_lambdas,
         OptionalAccessor<unsigned short, mode::read, numTempScaleValues == NumTempScaleValues::Multiple> a_tempScaleGroups,
-        float3 prVelocityScalingMatrixDiagonal)
+        Float3 prVelocityScalingMatrixDiagonal)
 {
     cgh.require(a_x);
     cgh.require(a_xp);
@@ -109,9 +109,9 @@ auto leapFrogKernel(
     }
 
     return [=](cl::sycl::id<1> itemIdx) {
-        const float3 x    = a_x[itemIdx];
-        const float3 v    = a_v[itemIdx];
-        const float3 f    = a_f[itemIdx];
+        const Float3 x    = a_x[itemIdx];
+        const Float3 v    = a_v[itemIdx];
+        const Float3 f    = a_f[itemIdx];
         const float  im   = a_inverseMasses[itemIdx];
         const float  imdt = im * dt;
 
@@ -137,20 +137,20 @@ auto leapFrogKernel(
             }
         }();
 
-        const float3 prVelocityDelta = [=]() {
+        const Float3 prVelocityDelta = [=]() {
             if constexpr (velocityScaling == VelocityScalingType::Diagonal)
             {
-                return float3{ prVelocityScalingMatrixDiagonal[0] * v[0],
+                return Float3{ prVelocityScalingMatrixDiagonal[0] * v[0],
                                prVelocityScalingMatrixDiagonal[1] * v[1],
                                prVelocityScalingMatrixDiagonal[2] * v[2] };
             }
             else if constexpr (velocityScaling == VelocityScalingType::None)
             {
-                return float3{ 0, 0, 0 };
+                return Float3{ 0, 0, 0 };
             }
         }();
 
-        const float3 v_new = v * lambda - prVelocityDelta + f * imdt;
+        const Float3 v_new = v * lambda - prVelocityDelta + f * imdt;
         a_v[itemIdx]       = v_new;
         a_x[itemIdx]       = x + v_new * dt;
     };
@@ -216,10 +216,10 @@ static inline cl::sycl::event launchLeapFrogKernel(NumTempScaleValues  tempScali
             prVelocityScalingType);
 }
 
-void LeapFrogGpu::integrate(DeviceBuffer<float3>              d_x,
-                            DeviceBuffer<float3>              d_xp,
-                            DeviceBuffer<float3>              d_v,
-                            DeviceBuffer<float3>              d_f,
+void LeapFrogGpu::integrate(DeviceBuffer<Float3>              d_x,
+                            DeviceBuffer<Float3>              d_xp,
+                            DeviceBuffer<Float3>              d_v,
+                            DeviceBuffer<Float3>              d_f,
                             const real                        dt,
                             const bool                        doTemperatureScaling,
                             gmx::ArrayRef<const t_grp_tcstat> tcstat,
@@ -253,7 +253,7 @@ void LeapFrogGpu::integrate(DeviceBuffer<float3>              d_x,
                    "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
                    "in GPU version of Leap-Frog integrator.");
         prVelocityScalingMatrixDiagonal_ = dtPressureCouple
-                                           * float3{ prVelocityScalingMatrix[XX][XX],
+                                           * Float3{ prVelocityScalingMatrix[XX][XX],
                                                      prVelocityScalingMatrix[YY][YY],
                                                      prVelocityScalingMatrix[ZZ][ZZ] };
     }
diff --git a/src/gromacs/mdlib/lincs_gpu.cu b/src/gromacs/mdlib/lincs_gpu.cu
index 0967c20781..466c250f4c 100644
--- a/src/gromacs/mdlib/lincs_gpu.cu
+++ b/src/gromacs/mdlib/lincs_gpu.cu
@@ -59,7 +59,8 @@
 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/devicebuffer.cuh"
-#include "gromacs/gpu_utils/gputraits.cuh"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/gpu_utils/typecasts.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"
 #include "gromacs/math/functions.h"
 #include "gromacs/math/vec.h"
@@ -427,14 +428,14 @@ inline auto getLincsKernelPtr(const bool updateVelocities, const bool computeVir
     return kernelPtr;
 }
 
-void LincsGpu::apply(const float3* d_x,
-                     float3*       d_xp,
-                     const bool    updateVelocities,
-                     float3*       d_v,
-                     const real    invdt,
-                     const bool    computeVirial,
-                     tensor        virialScaled,
-                     const PbcAiuc pbcAiuc)
+void LincsGpu::apply(const DeviceBuffer<Float3> d_x,
+                     DeviceBuffer<Float3>       d_xp,
+                     const bool                 updateVelocities,
+                     DeviceBuffer<Float3>       d_v,
+                     const real                 invdt,
+                     const bool                 computeVirial,
+                     tensor                     virialScaled,
+                     const PbcAiuc              pbcAiuc)
 {
     ensureNoPendingDeviceError("In CUDA version of LINCS");
 
@@ -479,8 +480,13 @@ void LincsGpu::apply(const float3* d_x,
 
     kernelParams_.pbcAiuc = pbcAiuc;
 
-    const auto kernelArgs =
-            prepareGpuKernelArguments(kernelPtr, config, &kernelParams_, &d_x, &d_xp, &d_v, &invdt);
+    const auto kernelArgs = prepareGpuKernelArguments(kernelPtr,
+                                                      config,
+                                                      &kernelParams_,
+                                                      asFloat3Pointer(&d_x),
+                                                      asFloat3Pointer(&d_xp),
+                                                      asFloat3Pointer(&d_v),
+                                                      &invdt);
 
     launchGpuKernel(kernelPtr,
                     config,
diff --git a/src/gromacs/mdlib/lincs_gpu.cuh b/src/gromacs/mdlib/lincs_gpu.cuh
index 40433efcb7..6507892649 100644
--- a/src/gromacs/mdlib/lincs_gpu.cuh
+++ b/src/gromacs/mdlib/lincs_gpu.cuh
@@ -46,6 +46,7 @@
 
 #include <memory>
 
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/gputraits.cuh"
@@ -134,14 +135,14 @@ public:
      * \param[in,out] virialScaled      Scaled virial tensor to be updated.
      * \param[in]     pbcAiuc           PBC data.
      */
-    void apply(const float3* d_x,
-               float3*       d_xp,
-               const bool    updateVelocities,
-               float3*       d_v,
-               const real    invdt,
-               const bool    computeVirial,
-               tensor        virialScaled,
-               const PbcAiuc pbcAiuc);
+    void apply(const DeviceBuffer<Float3> d_x,
+               DeviceBuffer<Float3>       d_xp,
+               const bool                 updateVelocities,
+               DeviceBuffer<Float3>       d_v,
+               const real                 invdt,
+               const bool                 computeVirial,
+               tensor                     virialScaled,
+               const PbcAiuc              pbcAiuc);
 
     /*! \brief
      * Update data-structures (e.g. after NB search step).
diff --git a/src/gromacs/mdlib/settle_gpu.cu b/src/gromacs/mdlib/settle_gpu.cu
index eb4b495772..0cc25c50ce 100644
--- a/src/gromacs/mdlib/settle_gpu.cu
+++ b/src/gromacs/mdlib/settle_gpu.cu
@@ -59,7 +59,8 @@
 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/devicebuffer.h"
-#include "gromacs/gpu_utils/gputraits.cuh"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/gpu_utils/typecasts.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"
 #include "gromacs/math/functions.h"
 #include "gromacs/math/vec.h"
@@ -396,14 +397,14 @@ inline auto getSettleKernelPtr(const bool updateVelocities, const bool computeVi
     return kernelPtr;
 }
 
-void SettleGpu::apply(const float3* d_x,
-                      float3*       d_xp,
-                      const bool    updateVelocities,
-                      float3*       d_v,
-                      const real    invdt,
-                      const bool    computeVirial,
-                      tensor        virialScaled,
-                      const PbcAiuc pbcAiuc)
+void SettleGpu::apply(const DeviceBuffer<Float3> d_x,
+                      DeviceBuffer<Float3>       d_xp,
+                      const bool                 updateVelocities,
+                      DeviceBuffer<Float3>       d_v,
+                      const real                 invdt,
+                      const bool                 computeVirial,
+                      tensor                     virialScaled,
+                      const PbcAiuc              pbcAiuc)
 {
 
     ensureNoPendingDeviceError("In CUDA version SETTLE");
@@ -440,8 +441,17 @@ void SettleGpu::apply(const float3* d_x,
         config.sharedMemorySize = 0;
     }
 
-    const auto kernelArgs = prepareGpuKernelArguments(
-            kernelPtr, config, &numSettles_, &d_atomIds_, &settleParameters_, &d_x, &d_xp, &invdt, &d_v, &d_virialScaled_, &pbcAiuc);
+    const auto kernelArgs = prepareGpuKernelArguments(kernelPtr,
+                                                      config,
+                                                      &numSettles_,
+                                                      &d_atomIds_,
+                                                      &settleParameters_,
+                                                      asFloat3Pointer(&d_x),
+                                                      asFloat3Pointer(&d_xp),
+                                                      &invdt,
+                                                      asFloat3Pointer(&d_v),
+                                                      &d_virialScaled_,
+                                                      &pbcAiuc);
 
     launchGpuKernel(kernelPtr,
                     config,
diff --git a/src/gromacs/mdlib/settle_gpu.cuh b/src/gromacs/mdlib/settle_gpu.cuh
index 3a96ec4d39..f09fbd344f 100644
--- a/src/gromacs/mdlib/settle_gpu.cuh
+++ b/src/gromacs/mdlib/settle_gpu.cuh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -45,9 +45,10 @@
 
 #include "gmxpre.h"
 
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/device_stream.h"
-#include "gromacs/gpu_utils/gputraits.cuh"
+#include "gromacs/gpu_utils/gputraits.h"
 #include "gromacs/math/functions.h"
 #include "gromacs/math/invertmatrix.h"
 #include "gromacs/math/vec.h"
@@ -101,14 +102,14 @@ public:
      * \param[in,out] virialScaled      Scaled virial tensor to be updated.
      * \param[in]     pbcAiuc           PBC data.
      */
-    void apply(const float3* d_x,
-               float3*       d_xp,
-               const bool    updateVelocities,
-               float3*       d_v,
-               const real    invdt,
-               const bool    computeVirial,
-               tensor        virialScaled,
-               const PbcAiuc pbcAiuc);
+    void apply(const DeviceBuffer<Float3> d_x,
+               DeviceBuffer<Float3>       d_xp,
+               const bool                 updateVelocities,
+               DeviceBuffer<Float3>       d_v,
+               const real                 invdt,
+               const bool                 computeVirial,
+               tensor                     virialScaled,
+               const PbcAiuc              pbcAiuc);
 
     /*! \brief
      * Update data-structures (e.g. after NB search step).
diff --git a/src/gromacs/mdlib/tests/constrtestrunners.cu b/src/gromacs/mdlib/tests/constrtestrunners.cu
index 94be8b35f9..c22b91f351 100644
--- a/src/gromacs/mdlib/tests/constrtestrunners.cu
+++ b/src/gromacs/mdlib/tests/constrtestrunners.cu
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -52,6 +52,7 @@
 #include <vector>
 
 #include "gromacs/gpu_utils/devicebuffer.cuh"
+#include "gromacs/gpu_utils/gputraits.h"
 #include "gromacs/hardware/device_information.h"
 #include "gromacs/mdlib/lincs_gpu.cuh"
 #include "gromacs/pbcutil/pbc.h"
@@ -71,9 +72,14 @@ void LincsDeviceConstraintsRunner::applyConstraints(ConstraintsTestData* testDat
     auto lincsGpu = std::make_unique<LincsGpu>(
             testData->ir_.nLincsIter, testData->ir_.nProjOrder, deviceContext, deviceStream);
 
-    bool    updateVelocities = true;
-    int     numAtoms         = testData->numAtoms_;
-    float3 *d_x, *d_xp, *d_v;
+    bool updateVelocities = true;
+    int  numAtoms         = testData->numAtoms_;
+
+    Float3* h_x  = gmx::asGenericFloat3Pointer(testData->x_);
+    Float3* h_xp = gmx::asGenericFloat3Pointer(testData->xPrime_);
+    Float3* h_v  = gmx::asGenericFloat3Pointer(testData->v_);
+
+    DeviceBuffer<Float3> d_x, d_xp, d_v;
 
     lincsGpu->set(*testData->idef_, testData->numAtoms_, testData->invmass_.data());
     PbcAiuc pbcAiuc;
@@ -83,24 +89,19 @@ void LincsDeviceConstraintsRunner::applyConstraints(ConstraintsTestData* testDat
     allocateDeviceBuffer(&d_xp, numAtoms, deviceContext);
     allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
 
-    copyToDeviceBuffer(
-            &d_x, (float3*)(testData->x_.data()), 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
-    copyToDeviceBuffer(
-            &d_xp, (float3*)(testData->xPrime_.data()), 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+    copyToDeviceBuffer(&d_x, h_x, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+    copyToDeviceBuffer(&d_xp, h_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
     if (updateVelocities)
     {
-        copyToDeviceBuffer(
-                &d_v, (float3*)(testData->v_.data()), 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+        copyToDeviceBuffer(&d_v, h_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
     }
     lincsGpu->apply(
             d_x, d_xp, updateVelocities, d_v, testData->invdt_, testData->computeVirial_, testData->virialScaled_, pbcAiuc);
 
-    copyFromDeviceBuffer(
-            (float3*)(testData->xPrime_.data()), &d_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+    copyFromDeviceBuffer(h_xp, &d_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
     if (updateVelocities)
     {
-        copyFromDeviceBuffer(
-                (float3*)(testData->v_.data()), &d_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+        copyFromDeviceBuffer(h_v, &d_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
     }
 
     freeDeviceBuffer(&d_x);
diff --git a/src/gromacs/mdlib/tests/leapfrogtestrunners_gpu.cpp b/src/gromacs/mdlib/tests/leapfrogtestrunners_gpu.cpp
index b85ed94b52..97c9c29481 100644
--- a/src/gromacs/mdlib/tests/leapfrogtestrunners_gpu.cpp
+++ b/src/gromacs/mdlib/tests/leapfrogtestrunners_gpu.cpp
@@ -59,6 +59,7 @@
 #    include "gromacs/mdlib/leapfrog_gpu.h"
 #endif
 
+#include "gromacs/gpu_utils/gputraits.h"
 #include "gromacs/hardware/device_information.h"
 #include "gromacs/mdlib/stat.h"
 
@@ -76,14 +77,12 @@ void LeapFrogDeviceTestRunner::integrate(LeapFrogTestData* testData, int numStep
 
     int numAtoms = testData->numAtoms_;
 
-    static_assert(sizeof(float3) == sizeof(*testData->x_.data()), "Incompatible types");
+    Float3* h_x  = gmx::asGenericFloat3Pointer(testData->x_);
+    Float3* h_xp = gmx::asGenericFloat3Pointer(testData->xPrime_);
+    Float3* h_v  = gmx::asGenericFloat3Pointer(testData->v_);
+    Float3* h_f  = gmx::asGenericFloat3Pointer(testData->f_);
 
-    float3* h_x  = reinterpret_cast<float3*>(testData->x_.data());
-    float3* h_xp = reinterpret_cast<float3*>(testData->xPrime_.data());
-    float3* h_v  = reinterpret_cast<float3*>(testData->v_.data());
-    float3* h_f  = reinterpret_cast<float3*>(testData->f_.data());
-
-    DeviceBuffer<float3> d_x, d_xp, d_v, d_f;
+    DeviceBuffer<Float3> d_x, d_xp, d_v, d_f;
 
     allocateDeviceBuffer(&d_x, numAtoms, deviceContext);
     allocateDeviceBuffer(&d_xp, numAtoms, deviceContext);
diff --git a/src/gromacs/mdlib/tests/settletestrunners.cu b/src/gromacs/mdlib/tests/settletestrunners.cu
index 0d2a3e116b..7aab750547 100644
--- a/src/gromacs/mdlib/tests/settletestrunners.cu
+++ b/src/gromacs/mdlib/tests/settletestrunners.cu
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -52,6 +52,7 @@
 #include <vector>
 
 #include "gromacs/gpu_utils/devicebuffer.cuh"
+#include "gromacs/gpu_utils/gputraits.h"
 #include "gromacs/hardware/device_information.h"
 #include "gromacs/mdlib/settle_gpu.cuh"
 #include "gromacs/utility/unique_cptr.h"
@@ -85,30 +86,29 @@ void SettleDeviceTestRunner::applySettle(SettleTestData* testData,
 
     int numAtoms = testData->numAtoms_;
 
-    float3 *d_x, *d_xp, *d_v;
+    DeviceBuffer<Float3> d_x, d_xp, d_v;
 
-    float3* h_x  = (float3*)(as_rvec_array(testData->x_.data()));
-    float3* h_xp = (float3*)(as_rvec_array(testData->xPrime_.data()));
-    float3* h_v  = (float3*)(as_rvec_array(testData->v_.data()));
+    Float3* h_x  = gmx::asGenericFloat3Pointer(testData->x_);
+    Float3* h_xp = gmx::asGenericFloat3Pointer(testData->xPrime_);
+    Float3* h_v  = gmx::asGenericFloat3Pointer(testData->v_);
 
     allocateDeviceBuffer(&d_x, numAtoms, deviceContext);
     allocateDeviceBuffer(&d_xp, numAtoms, deviceContext);
     allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
 
-    copyToDeviceBuffer(&d_x, (float3*)h_x, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
-    copyToDeviceBuffer(&d_xp, (float3*)h_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+    copyToDeviceBuffer(&d_x, h_x, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+    copyToDeviceBuffer(&d_xp, h_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
     if (updateVelocities)
     {
-        copyToDeviceBuffer(&d_v, (float3*)h_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+        copyToDeviceBuffer(&d_v, h_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
     }
     settleGpu->apply(
             d_x, d_xp, updateVelocities, d_v, testData->reciprocalTimeStep_, calcVirial, testData->virial_, pbcAiuc);
 
-    copyFromDeviceBuffer((float3*)h_xp, &d_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+    copyFromDeviceBuffer(h_xp, &d_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
     if (updateVelocities)
     {
-        copyFromDeviceBuffer(
-                (float3*)h_v, &d_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+        copyFromDeviceBuffer(h_v, &d_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
     }
 
     freeDeviceBuffer(&d_x);
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
index f03ab778c3..b561856511 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
@@ -240,9 +240,9 @@ UpdateConstrainGpu::Impl::Impl(const t_inputrec&     ir,
 
 UpdateConstrainGpu::Impl::~Impl() {}
 
-void UpdateConstrainGpu::Impl::set(DeviceBuffer<RVec>            d_x,
-                                   DeviceBuffer<RVec>            d_v,
-                                   const DeviceBuffer<RVec>      d_f,
+void UpdateConstrainGpu::Impl::set(DeviceBuffer<Float3>          d_x,
+                                   DeviceBuffer<Float3>          d_v,
+                                   const DeviceBuffer<Float3>    d_f,
                                    const InteractionDefinitions& idef,
                                    const t_mdatoms&              md)
 {
@@ -254,9 +254,9 @@ void UpdateConstrainGpu::Impl::set(DeviceBuffer<RVec>            d_x,
     GMX_ASSERT(d_v != nullptr, "Velocities device buffer should not be null.");
     GMX_ASSERT(d_f != nullptr, "Forces device buffer should not be null.");
 
-    d_x_ = reinterpret_cast<float3*>(d_x);
-    d_v_ = reinterpret_cast<float3*>(d_v);
-    d_f_ = reinterpret_cast<float3*>(d_f);
+    d_x_ = d_x;
+    d_v_ = d_v;
+    d_f_ = d_f;
 
     numAtoms_ = md.nr;
 
@@ -334,9 +334,9 @@ void UpdateConstrainGpu::scaleVelocities(const matrix scalingMatrix)
     impl_->scaleVelocities(scalingMatrix);
 }
 
-void UpdateConstrainGpu::set(DeviceBuffer<RVec>            d_x,
-                             DeviceBuffer<RVec>            d_v,
-                             const DeviceBuffer<RVec>      d_f,
+void UpdateConstrainGpu::set(DeviceBuffer<Float3>          d_x,
+                             DeviceBuffer<Float3>          d_v,
+                             const DeviceBuffer<Float3>    d_f,
                              const InteractionDefinitions& idef,
                              const t_mdatoms&              md)
 {
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h
index 8e101b8cd0..76e41398e7 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.h
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h
@@ -150,9 +150,9 @@ public:
      * \param[in] idef                System topology
      * \param[in] md                  Atoms data.
      */
-    void set(DeviceBuffer<RVec>            d_x,
-             DeviceBuffer<RVec>            d_v,
-             const DeviceBuffer<RVec>      d_f,
+    void set(DeviceBuffer<Float3>          d_x,
+             DeviceBuffer<Float3>          d_v,
+             const DeviceBuffer<Float3>    d_f,
              const InteractionDefinitions& idef,
              const t_mdatoms&              md);
 
@@ -193,14 +193,14 @@ private:
     int numAtoms_;
 
     //! Local copy of the pointer to the device positions buffer
-    float3* d_x_;
+    DeviceBuffer<Float3> d_x_;
     //! Local copy of the pointer to the device velocities buffer
-    float3* d_v_;
+    DeviceBuffer<Float3> d_v_;
     //! Local copy of the pointer to the device forces buffer
-    float3* d_f_;
+    DeviceBuffer<Float3> d_f_;
 
     //! Device buffer for intermediate positions (maintained internally)
-    float3* d_xp_;
+    DeviceBuffer<Float3> d_xp_;
     //! Number of elements in shifted coordinates buffer
     int numXp_ = -1;
     //! Allocation size for the shifted coordinates buffer
@@ -208,7 +208,7 @@ private:
 
 
     //! 1/mass for all atoms (GPU)
-    real* d_inverseMasses_;
+    DeviceBuffer<real> d_inverseMasses_;
     //! Number of elements in reciprocal masses buffer
     int numInverseMasses_ = -1;
     //! Allocation size for the reciprocal masses buffer
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index bd5fa8de5d..76341b7a97 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -778,7 +778,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
     nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
 }
 
-void* getGpuForces(NbnxmGpu* nb)
+DeviceBuffer<Float3> getGpuForces(NbnxmGpu* nb)
 {
     return nb->atdat->f;
 }
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index a8e919ea01..196befc92d 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -213,7 +213,7 @@ int nonbonded_verlet_t::getNumAtoms(const gmx::AtomLocality locality) const
     return numAtoms;
 }
 
-void* nonbonded_verlet_t::getGpuForces() const
+DeviceBuffer<gmx::RVec> nonbonded_verlet_t::getGpuForces() const
 {
     return Nbnxm::getGpuForces(gpu_nbv);
 }
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 30380ebeb2..a36efad023 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -395,7 +395,7 @@ public:
      *
      * \returns A pointer to the force buffer in GPU memory
      */
-    void* getGpuForces() const;
+    DeviceBuffer<gmx::RVec> getGpuForces() const;
 
     //! Return the kernel setup
     const Nbnxm::KernelSetup& kernelSetup() const { return kernelSetup_; }
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index fec64bbe68..ebdba45b9d 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -313,7 +313,8 @@ void nbnxn_wait_x_on_device(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM;
  * \returns       A pointer to the force buffer in GPU memory
  */
 CUDA_FUNC_QUALIFIER
-void* getGpuForces(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+DeviceBuffer<gmx::RVec> getGpuForces(NbnxmGpu gmx_unused* nb)
+        CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer<gmx::RVec>{});
 
 } // namespace Nbnxm
 #endif
-- 
2.22.0