From c5c220a03663d975e31e8573c1849247ce3f8ad0 Mon Sep 17 00:00:00 2001
From: Artem Zhmurov <zhmurov@gmail.com>
Date: Mon, 20 Jan 2020 18:32:46 +0100
Subject: [PATCH] Use RVec instead of float for x, v and f device buffers

Using RVec instead of float for coordinates data-types allows to
remove multiplications by DIM when the adresses, offsets and sizes
are computed. Since the native device types are not used in CPU
part of the code, the type casting remains.

Refs #3312 and #2936

Change-Id: Iaea914a474195f214ca860f7345f6878b9a04813
---
 src/gromacs/domdec/gpuhaloexchange.h          |  4 +-
 src/gromacs/domdec/gpuhaloexchange_impl.cpp   |  6 +-
 src/gromacs/domdec/gpuhaloexchange_impl.cu    |  8 +-
 src/gromacs/ewald/pme.h                       |  4 +-
 src/gromacs/ewald/pme_calculate_splines.cuh   | 28 +++++-
 .../ewald/pme_coordinate_receiver_gpu.h       |  3 +-
 .../pme_coordinate_receiver_gpu_impl.cpp      |  2 +-
 .../ewald/pme_coordinate_receiver_gpu_impl.cu |  6 +-
 .../ewald/pme_coordinate_receiver_gpu_impl.h  |  2 +-
 src/gromacs/ewald/pme_gather.cu               | 20 ++---
 src/gromacs/ewald/pme_gpu.cpp                 |  2 +-
 src/gromacs/ewald/pme_gpu_internal.cpp        |  2 +-
 src/gromacs/ewald/pme_gpu_internal.h          |  2 +-
 src/gromacs/ewald/pme_gpu_types.h             |  2 +-
 src/gromacs/ewald/pme_spread.cu               | 17 ++--
 src/gromacs/gpu_utils/cudautils.cuh           | 13 +++
 src/gromacs/gpu_utils/devicebuffer_ocl.h      |  2 +-
 src/gromacs/gpu_utils/tests/CMakeLists.txt    |  3 +-
 src/gromacs/gpu_utils/tests/gpu_utils.cpp     | 89 +++++++++++++++++++
 src/gromacs/mdlib/update_constrain_gpu.h      | 12 +--
 .../mdlib/update_constrain_gpu_impl.cpp       |  6 +-
 .../mdlib/update_constrain_gpu_impl.cu        | 24 ++---
 src/gromacs/mdlib/update_constrain_gpu_impl.h | 12 +--
 .../mdtypes/state_propagator_data_gpu.h       |  6 +-
 .../state_propagator_data_gpu_impl.cpp        | 12 +--
 .../mdtypes/state_propagator_data_gpu_impl.h  | 16 ++--
 .../state_propagator_data_gpu_impl_gpu.cpp    | 42 ++++-----
 src/gromacs/nbnxm/atomdata.cpp                |  4 +-
 src/gromacs/nbnxm/atomdata.h                  | 14 +--
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu          | 15 ++--
 src/gromacs/nbnxm/nbnxm.cpp                   |  4 +-
 src/gromacs/nbnxm/nbnxm.h                     | 10 +--
 src/gromacs/nbnxm/nbnxm_gpu.h                 |  6 +-
 33 files changed, 257 insertions(+), 141 deletions(-)
 create mode 100644 src/gromacs/gpu_utils/tests/gpu_utils.cpp
diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h
index b7e6ff54fa..d32b1800c4 100644
--- a/src/gromacs/domdec/gpuhaloexchange.h
+++ b/src/gromacs/domdec/gpuhaloexchange.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -92,7 +92,7 @@ public:
      * \param [in] d_coordinateBuffer   pointer to coordinates buffer in GPU memory
      * \param [in] d_forcesBuffer   pointer to coordinates buffer in GPU memory
      */
-    void reinitHalo(DeviceBuffer<float> d_coordinateBuffer, DeviceBuffer<float> d_forcesBuffer);
+    void reinitHalo(DeviceBuffer<RVec> d_coordinateBuffer, DeviceBuffer<RVec> d_forcesBuffer);
 
 
     /*! \brief GPU halo exchange of coordinates buffer.
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
index 2511673218..a17c550c6c 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -73,8 +73,8 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */,
 GpuHaloExchange::~GpuHaloExchange() = default;
 
 /*!\brief init halo exhange stub. */
-void GpuHaloExchange::reinitHalo(DeviceBuffer<float> /* d_coordinatesBuffer */,
-                                 DeviceBuffer<float> /* d_forcesBuffer */)
+void GpuHaloExchange::reinitHalo(DeviceBuffer<RVec> /* d_coordinatesBuffer */,
+                                 DeviceBuffer<RVec> /* d_forcesBuffer */)
 {
     GMX_ASSERT(false,
                "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu
index 660566a9dd..4313ffacb0 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cu
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -57,6 +57,7 @@
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"
+#include "gromacs/math/vectypes.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/utility/gmxmpi.h"
 
@@ -435,10 +436,9 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void
 
 GpuHaloExchange::~GpuHaloExchange() = default;
 
-void GpuHaloExchange::reinitHalo(DeviceBuffer<float> d_coordinatesBuffer, DeviceBuffer<float> d_forcesBuffer)
+void GpuHaloExchange::reinitHalo(DeviceBuffer<RVec> d_coordinatesBuffer, DeviceBuffer<RVec> d_forcesBuffer)
 {
-    impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer),
-                      reinterpret_cast<float3*>(d_forcesBuffer));
+    impl_->reinitHalo(asFloat3(d_coordinatesBuffer), asFloat3(d_forcesBuffer));
 }
 
 void GpuHaloExchange::communicateHaloCoordinates(const matrix          box,
diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h
index edbe283523..f1f829e445 100644
--- a/src/gromacs/ewald/pme.h
+++ b/src/gromacs/ewald/pme.h
@@ -451,8 +451,8 @@ GPU_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t* GPU_FUNC_ARG
  * \param[in] pme            The PME data structure.
  * \param[in] d_x            The pointer to the positions buffer to be set
  */
-GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t*    GPU_FUNC_ARGUMENT(pme),
-                                             DeviceBuffer<float> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t*        GPU_FUNC_ARGUMENT(pme),
+                                             DeviceBuffer<gmx::RVec> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
 
 /*! \brief Get pointer to device copy of force data.
  * \param[in] pme            The PME data structure.
diff --git a/src/gromacs/ewald/pme_calculate_splines.cuh b/src/gromacs/ewald/pme_calculate_splines.cuh
index f52c81c599..4e5dc6e888 100644
--- a/src/gromacs/ewald/pme_calculate_splines.cuh
+++ b/src/gromacs/ewald/pme_calculate_splines.cuh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -52,6 +52,30 @@
 //! Controls if the atom and charge data is prefeched into shared memory or loaded per thread from global
 static const bool c_useAtomDataPrefetch = true;
 
+/*! \brief Asserts if the argument is finite.
+ *
+ *  The function works for any data type, that can be casted to float. Note that there is also
+ *  a specialized implementation for float3 data type.
+ *
+ * \param[in] arg  Argument to check.
+ */
+template<typename T>
+__device__ inline void assertIsFinite(T arg);
+
+template<>
+__device__ inline void assertIsFinite(float3 arg)
+{
+    assert(isfinite(float(arg.x)));
+    assert(isfinite(float(arg.y)));
+    assert(isfinite(float(arg.z)));
+}
+
+template<typename T>
+__device__ inline void assertIsFinite(T arg)
+{
+    assert(isfinite(float(arg)));
+}
+
 /*! \brief
  * General purpose function for loading atom-related data from global to shared memory.
  *
@@ -79,7 +103,7 @@ __device__ __forceinline__ void pme_gpu_stage_atom_data(const PmeGpuCudaKernelPa
             pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
     if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
     {
-        assert(isfinite(float(gm_source[globalIndex])));
+        assertIsFinite(gm_source[globalIndex]);
         sm_destination[localIndex] = gm_source[globalIndex];
     }
 }
diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h
index 5e4c4967ba..b5d02a719e 100644
--- a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h
+++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h
@@ -43,6 +43,7 @@
 #define GMX_PMECOORDINATERECEIVERGPU_H
 
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/math/vectypes.h"
 #include "gromacs/utility/classhelpers.h"
 #include "gromacs/utility/gmxmpi.h"
 
@@ -70,7 +71,7 @@ public:
      * send coordinates buffer address to PP rank
      * \param[in] d_x   coordinates buffer in GPU memory
      */
-    void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> d_x);
+    void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x);
 
 
     /*! \brief
diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp
index c026d41b31..0cb848e6c0 100644
--- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp
+++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp
@@ -75,7 +75,7 @@ PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const void* /* pmeStream */,
 PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default;
 
 /*!\brief init PME-PP GPU communication stub */
-void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> /* d_x */)
+void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> /* d_x */)
 {
     GMX_ASSERT(false,
                "A CPU stub for PME-PP GPU communication initialization was called instead of the "
diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu
index 1584a9e844..b2e7fa009d 100644
--- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu
+++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu
@@ -69,7 +69,7 @@ PmeCoordinateReceiverGpu::Impl::Impl(const void* pmeStream, MPI_Comm comm, gmx::
 
 PmeCoordinateReceiverGpu::Impl::~Impl() = default;
 
-void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> d_x)
+void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x)
 {
 
     int ind_start = 0;
@@ -80,7 +80,7 @@ void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(Device
         ind_end   = ind_start + receiver.numAtoms;
 
         // Data will be transferred directly from GPU.
-        void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start * DIM]);
+        void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start]);
 
 #if GMX_MPI
         MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_);
@@ -131,7 +131,7 @@ PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const void*            pmeStr
 
 PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default;
 
-void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> d_x)
+void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x)
 {
     impl_->sendCoordinateBufferAddressToPpRanks(d_x);
 }
diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h
index 874fb2747e..4f3bbe2e4e 100644
--- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h
+++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h
@@ -69,7 +69,7 @@ public:
      * send coordinates buffer address to PP rank
      * \param[in] d_x   coordinates buffer in GPU memory
      */
-    void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> d_x);
+    void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x);
 
     /*! \brief
      * launch receive of coordinate data from PP rank
diff --git a/src/gromacs/ewald/pme_gather.cu b/src/gromacs/ewald/pme_gather.cu
index 8b2ff5f80e..616516df23 100644
--- a/src/gromacs/ewald/pme_gather.cu
+++ b/src/gromacs/ewald/pme_gather.cu
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -44,6 +44,7 @@
 #include <cassert>
 
 #include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+#include "gromacs/gpu_utils/cudautils.cuh"
 
 #include "pme.cuh"
 #include "pme_calculate_splines.cuh"
@@ -321,32 +322,27 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
     }
     else
     {
+        const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates);
         /* Recaclulate  Splines  */
         if (c_useAtomDataPrefetch)
         {
             // charges
             __shared__ float sm_coefficients[atomsPerBlock];
             // Coordinates
-            __shared__ float sm_coordinates[DIM * atomsPerBlock];
+            __shared__ float3 sm_coordinates[atomsPerBlock];
             /* Staging coefficients/charges */
-            pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients,
-                                                             kernelParams.atoms.d_coefficients);
+            pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, gm_coefficients);
 
             /* Staging coordinates */
-            pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates,
-                                                               kernelParams.atoms.d_coordinates);
+            pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(kernelParams, sm_coordinates, gm_coordinates);
             __syncthreads();
-            atomX.x    = sm_coordinates[atomIndexLocal * DIM + XX];
-            atomX.y    = sm_coordinates[atomIndexLocal * DIM + YY];
-            atomX.z    = sm_coordinates[atomIndexLocal * DIM + ZZ];
+            atomX      = sm_coordinates[atomIndexLocal];
             atomCharge = sm_coefficients[atomIndexLocal];
         }
         else
         {
+            atomX      = gm_coordinates[atomIndexGlobal];
             atomCharge = gm_coefficients[atomIndexGlobal];
-            atomX.x    = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + XX];
-            atomX.y    = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + YY];
-            atomX.z    = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + ZZ];
         }
         calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false>(
                 kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices);
diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp
index aafd5c1a4d..44fafc205c 100644
--- a/src/gromacs/ewald/pme_gpu.cpp
+++ b/src/gromacs/ewald/pme_gpu.cpp
@@ -440,7 +440,7 @@ void* pme_gpu_get_device_f(const gmx_pme_t* pme)
     return pme_gpu_get_kernelparam_forces(pme->gpu);
 }
 
-void pme_gpu_set_device_x(const gmx_pme_t* pme, DeviceBuffer<float> d_x)
+void pme_gpu_set_device_x(const gmx_pme_t* pme, DeviceBuffer<gmx::RVec> d_x)
 {
     GMX_ASSERT(pme != nullptr, "Null pointer is passed as a PME to the set coordinates function.");
     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp
index 9c98402525..1aa2052907 100644
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -1523,7 +1523,7 @@ void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
     }
 }
 
-void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<float> d_x)
+void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<gmx::RVec> d_x)
 {
     GMX_ASSERT(pmeGpu && pmeGpu->kernelParams,
                "PME GPU device buffer can not be set in non-GPU builds or before the GPU PME was "
diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h
index cc7e9d1f34..2816a22add 100644
--- a/src/gromacs/ewald/pme_gpu_internal.h
+++ b/src/gromacs/ewald/pme_gpu_internal.h
@@ -395,7 +395,7 @@ GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu*                GPU_FUNC_ARGUMENT(
  * \param[in] d_x            Pointer to coordinate data
  */
 GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
-                                                            DeviceBuffer<float> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
+                                                            DeviceBuffer<gmx::RVec> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
 
 /*! \brief Return pointer to device copy of force data.
  * \param[in] pmeGpu         The PME GPU structure.
diff --git a/src/gromacs/ewald/pme_gpu_types.h b/src/gromacs/ewald/pme_gpu_types.h
index be501f2cfa..5ddd79fb5a 100644
--- a/src/gromacs/ewald/pme_gpu_types.h
+++ b/src/gromacs/ewald/pme_gpu_types.h
@@ -157,7 +157,7 @@ struct PmeGpuAtomParams
      * The coordinates themselves change and need to be copied to the GPU for every PME computation,
      * but reallocation happens only at DD.
      */
-    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_coordinates;
+    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<gmx::RVec>) d_coordinates;
     /*! \brief Global GPU memory array handle with input atom charges.
      * The charges only need to be reallocated and copied to the GPU at DD step.
      */
diff --git a/src/gromacs/ewald/pme_spread.cu b/src/gromacs/ewald/pme_spread.cu
index 3d02e43d0d..99f7828c86 100644
--- a/src/gromacs/ewald/pme_spread.cu
+++ b/src/gromacs/ewald/pme_spread.cu
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013-2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2013-2016,2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -47,6 +47,7 @@
 #include <cassert>
 
 #include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+#include "gromacs/gpu_utils/cudautils.cuh"
 
 #include "pme.cuh"
 #include "pme_calculate_splines.cuh"
@@ -228,24 +229,20 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU
 
     if (computeSplines)
     {
+        const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates);
         if (c_useAtomDataPrefetch)
         {
             // Coordinates
-            __shared__ float sm_coordinates[DIM * atomsPerBlock];
+            __shared__ float3 sm_coordinates[atomsPerBlock];
 
             /* Staging coordinates */
-            pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates,
-                                                               kernelParams.atoms.d_coordinates);
+            pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(kernelParams, sm_coordinates, gm_coordinates);
             __syncthreads();
-            atomX.x = sm_coordinates[atomIndexLocal * DIM + XX];
-            atomX.y = sm_coordinates[atomIndexLocal * DIM + YY];
-            atomX.z = sm_coordinates[atomIndexLocal * DIM + ZZ];
+            atomX = sm_coordinates[atomIndexLocal];
         }
         else
         {
-            atomX.x = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + XX];
-            atomX.y = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + YY];
-            atomX.z = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + ZZ];
+            atomX = gm_coordinates[atomIndexGlobal];
         }
         calculate_splines<order, atomsPerBlock, atomsPerWarp, false, writeGlobal>(
                 kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, &dtheta, sm_gridlineIndices);
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh
index 71d9b7dac4..6f41e8589c 100644
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -215,6 +215,19 @@ static inline void rvec_inc(rvec a, const float3 b)
     rvec tmp = { b.x, b.y, b.z };
     rvec_inc(a, tmp);
 }
+/*! \brief Cast RVec buffer to float3 buffer.
+ *
+ * \param[in] in The RVec buffer to cast.
+ *
+ * \returns Buffer, casted to float3*.
+ */
+static inline __host__ __device__ float3* asFloat3(gmx::RVec* in)
+{
+    static_assert(sizeof(in[0]) == sizeof(float3),
+                  "Size of the host-side data-type is different from the size of the device-side "
+                  "counterpart.");
+    return reinterpret_cast<float3*>(in);
+}
 
 /*! \brief Wait for all taks in stream \p s to complete.
  *
diff --git a/src/gromacs/gpu_utils/devicebuffer_ocl.h b/src/gromacs/gpu_utils/devicebuffer_ocl.h
index 2c92c81c1b..40f1e12941 100644
--- a/src/gromacs/gpu_utils/devicebuffer_ocl.h
+++ b/src/gromacs/gpu_utils/devicebuffer_ocl.h
@@ -228,7 +228,7 @@ void clearDeviceBufferAsync(DeviceBuffer<ValueType>* buffer, size_t startingOffs
     GMX_ASSERT(buffer, "needs a buffer pointer");
     const size_t    offset        = startingOffset * sizeof(ValueType);
     const size_t    bytes         = numValues * sizeof(ValueType);
-    const ValueType pattern       = 0;
+    const int       pattern       = 0;
     const cl_uint   numWaitEvents = 0;
     const cl_event* waitEvents    = nullptr;
     cl_event        commandEvent;
diff --git a/src/gromacs/gpu_utils/tests/CMakeLists.txt b/src/gromacs/gpu_utils/tests/CMakeLists.txt
index c2fc3426bb..e9a62b792b 100644
--- a/src/gromacs/gpu_utils/tests/CMakeLists.txt
+++ b/src/gromacs/gpu_utils/tests/CMakeLists.txt
@@ -1,7 +1,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
+# Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -48,6 +48,7 @@ if(GMX_USE_CUDA)
     # CUDA-only test
     list(APPEND SOURCES_FROM_CXX
         pinnedmemorychecker.cpp
+        gpu_utils.cpp
         )
     # TODO Making a separate library is heavy handed, but nothing else
     # seems to work. Also don't use a hyphen in its name, because nvcc
diff --git a/src/gromacs/gpu_utils/tests/gpu_utils.cpp b/src/gromacs/gpu_utils/tests/gpu_utils.cpp
new file mode 100644
index 0000000000..5adf0a72a7
--- /dev/null
+++ b/src/gromacs/gpu_utils/tests/gpu_utils.cpp
@@ -0,0 +1,89 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Tests for CUDA float3 type layout.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include <vector>
+
+#ifndef __CUDA_ARCH__
+/*! \brief Dummy definition to avoid compiler error
+ *
+ * \todo Find a better solution. Probably, move asFloat3(...) function to different header.
+ */
+#    define __CUDA_ARCH__ -1
+#    include <cuda_runtime.h>
+#    undef __CUDA_ARCH__
+#else
+#    include <cuda_runtime.h>
+#endif
+#include <gtest/gtest.h>
+
+#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/math/vectypes.h"
+#include "gromacs/utility/real.h"
+
+#if GMX_GPU == GMX_GPU_CUDA
+
+namespace gmx
+{
+
+namespace test
+{
+
+TEST(GpuDataTypesCompatibilityTest, RVecAndFloat3)
+{
+    std::vector<RVec> dataRVec;
+    dataRVec.emplace_back(1.0, 2.0, 3.0);
+    dataRVec.emplace_back(4.0, 5.0, 6.0);
+    float3* dataFloat3 = asFloat3(dataRVec.data());
+    EXPECT_EQ(dataFloat3[0].x, dataRVec[0][XX]);
+    EXPECT_EQ(dataFloat3[0].y, dataRVec[0][YY]);
+    EXPECT_EQ(dataFloat3[0].z, dataRVec[0][ZZ]);
+    EXPECT_EQ(dataFloat3[1].x, dataRVec[1][XX]);
+    EXPECT_EQ(dataFloat3[1].y, dataRVec[1][YY]);
+    EXPECT_EQ(dataFloat3[1].z, dataRVec[1][ZZ]);
+}
+
+} // namespace test
+} // namespace gmx
+
+#endif // GMX_GPU == GMX_GPU_CUDA
\ No newline at end of file
diff --git a/src/gromacs/mdlib/update_constrain_gpu.h b/src/gromacs/mdlib/update_constrain_gpu.h
index e0190c367f..359ecea5c1 100644
--- a/src/gromacs/mdlib/update_constrain_gpu.h
+++ b/src/gromacs/mdlib/update_constrain_gpu.h
@@ -133,12 +133,12 @@ public:
      * \param[in]      md                  Atoms data.
      * \param[in]      numTempScaleValues  Number of temperature scaling groups. Zero for no temperature scaling.
      */
-    void set(DeviceBuffer<float> d_x,
-             DeviceBuffer<float> d_v,
-             DeviceBuffer<float> d_f,
-             const t_idef&       idef,
-             const t_mdatoms&    md,
-             int                 numTempScaleValues);
+    void set(DeviceBuffer<RVec> d_x,
+             DeviceBuffer<RVec> d_v,
+             DeviceBuffer<RVec> d_f,
+             const t_idef&      idef,
+             const t_mdatoms&   md,
+             int                numTempScaleValues);
 
     /*! \brief
      * Update PBC data.
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp
index 919ba9b36b..47671ef7de 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp
@@ -88,9 +88,9 @@ void UpdateConstrainGpu::scaleCoordinates(const matrix /* scalingMatrix */)
                "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
 }
 
-void UpdateConstrainGpu::set(DeviceBuffer<float> /* d_x */,
-                             DeviceBuffer<float> /* d_v */,
-                             const DeviceBuffer<float> /* d_f */,
+void UpdateConstrainGpu::set(DeviceBuffer<RVec> /* d_x */,
+                             DeviceBuffer<RVec> /* d_v */,
+                             const DeviceBuffer<RVec> /* d_f */,
                              const t_idef& /* idef */,
                              const t_mdatoms& /* md */,
                              const int /* numTempScaleValues */)
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
index ebbe15a02f..6991ef0dc3 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
@@ -188,12 +188,12 @@ UpdateConstrainGpu::Impl::Impl(const t_inputrec&     ir,
 
 UpdateConstrainGpu::Impl::~Impl() {}
 
-void UpdateConstrainGpu::Impl::set(DeviceBuffer<float>       d_x,
-                                   DeviceBuffer<float>       d_v,
-                                   const DeviceBuffer<float> d_f,
-                                   const t_idef&             idef,
-                                   const t_mdatoms&          md,
-                                   const int                 numTempScaleValues)
+void UpdateConstrainGpu::Impl::set(DeviceBuffer<RVec>       d_x,
+                                   DeviceBuffer<RVec>       d_v,
+                                   const DeviceBuffer<RVec> d_f,
+                                   const t_idef&            idef,
+                                   const t_mdatoms&         md,
+                                   const int                numTempScaleValues)
 {
     GMX_ASSERT(d_x != nullptr, "Coordinates device buffer should not be null.");
     GMX_ASSERT(d_v != nullptr, "Velocities device buffer should not be null.");
@@ -259,12 +259,12 @@ void UpdateConstrainGpu::scaleCoordinates(const matrix scalingMatrix)
     impl_->scaleCoordinates(scalingMatrix);
 }
 
-void UpdateConstrainGpu::set(DeviceBuffer<float>       d_x,
-                             DeviceBuffer<float>       d_v,
-                             const DeviceBuffer<float> d_f,
-                             const t_idef&             idef,
-                             const t_mdatoms&          md,
-                             const int                 numTempScaleValues)
+void UpdateConstrainGpu::set(DeviceBuffer<RVec>       d_x,
+                             DeviceBuffer<RVec>       d_v,
+                             const DeviceBuffer<RVec> d_f,
+                             const t_idef&            idef,
+                             const t_mdatoms&         md,
+                             const int                numTempScaleValues)
 {
     impl_->set(d_x, d_v, d_f, idef, md, numTempScaleValues);
 }
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h
index 5a28045afd..0009112dc6 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.h
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h
@@ -133,12 +133,12 @@ public:
      * \param[in] md                  Atoms data.
      * \param[in] numTempScaleValues  Number of temperature scaling groups. Set zero for no temperature coupling.
      */
-    void set(DeviceBuffer<float>       d_x,
-             DeviceBuffer<float>       d_v,
-             const DeviceBuffer<float> d_f,
-             const t_idef&             idef,
-             const t_mdatoms&          md,
-             const int                 numTempScaleValues);
+    void set(DeviceBuffer<RVec>       d_x,
+             DeviceBuffer<RVec>       d_v,
+             const DeviceBuffer<RVec> d_f,
+             const t_idef&            idef,
+             const t_mdatoms&         md,
+             const int                numTempScaleValues);
 
     /*! \brief
      * Update PBC data.
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h
index 44cbc8d7e5..d75cd78ea1 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h
@@ -184,7 +184,7 @@ public:
      *
      *  \returns GPU positions buffer.
      */
-    DeviceBuffer<float> getCoordinates();
+    DeviceBuffer<RVec> getCoordinates();
 
     /*! \brief Copy positions to the GPU memory.
      *
@@ -245,7 +245,7 @@ public:
      *
      *  \returns GPU velocities buffer.
      */
-    DeviceBuffer<float> getVelocities();
+    DeviceBuffer<RVec> getVelocities();
 
     /*! \brief Copy velocities to the GPU memory.
      *
@@ -280,7 +280,7 @@ public:
      *
      *  \returns GPU force buffer.
      */
-    DeviceBuffer<float> getForces();
+    DeviceBuffer<RVec> getForces();
 
     /*! \brief Copy forces to the GPU memory.
      *
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
index ae6bcd6c71..15f054eafa 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
@@ -95,12 +95,12 @@ std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomL
     return std::make_tuple(0, 0);
 }
 
-DeviceBuffer<float> StatePropagatorDataGpu::getCoordinates()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getCoordinates()
 {
     GMX_ASSERT(false,
                "A CPU stub method from GPU state propagator data was called instead of one from "
                "GPU implementation.");
-    return DeviceBuffer<float>{};
+    return {};
 }
 
 GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(
@@ -153,12 +153,12 @@ void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> /*
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::getVelocities()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getVelocities()
 {
     GMX_ASSERT(false,
                "A CPU stub method from GPU state propagator data was called instead of one from "
                "GPU implementation.");
-    return DeviceBuffer<float>{};
+    return {};
 }
 
 void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> /* h_v */,
@@ -193,12 +193,12 @@ void StatePropagatorDataGpu::waitVelocitiesReadyOnHost(AtomLocality /* atomLocal
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::getForces()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getForces()
 {
     GMX_ASSERT(false,
                "A CPU stub method from GPU state propagator data was called instead of one from "
                "GPU implementation.");
-    return DeviceBuffer<float>{};
+    return {};
 }
 
 void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> /* h_f          */,
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
index 1423277722..a138bcbb62 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
@@ -181,7 +181,7 @@ public:
      *
      *  \returns GPU positions buffer.
      */
-    DeviceBuffer<float> getCoordinates();
+    DeviceBuffer<RVec> getCoordinates();
 
     /*! \brief Copy positions to the GPU memory.
      *
@@ -242,7 +242,7 @@ public:
      *
      *  \returns GPU velocities buffer.
      */
-    DeviceBuffer<float> getVelocities();
+    DeviceBuffer<RVec> getVelocities();
 
     /*! \brief Copy velocities to the GPU memory.
      *
@@ -277,7 +277,7 @@ public:
      *
      *  \returns GPU force buffer.
      */
-    DeviceBuffer<float> getForces();
+    DeviceBuffer<RVec> getForces();
 
     /*! \brief Copy forces to the GPU memory.
      *
@@ -395,21 +395,21 @@ private:
     int numAtomsAll_ = -1;
 
     //! Device positions buffer
-    DeviceBuffer<float> d_x_;
+    DeviceBuffer<RVec> d_x_;
     //! Number of particles saved in the positions buffer
     int d_xSize_ = -1;
     //! Allocation size for the positions buffer
     int d_xCapacity_ = -1;
 
     //! Device velocities buffer
-    DeviceBuffer<float> d_v_;
+    DeviceBuffer<RVec> d_v_;
     //! Number of particles saved in the velocities buffer
     int d_vSize_ = -1;
     //! Allocation size for the velocities buffer
     int d_vCapacity_ = -1;
 
     //! Device force buffer
-    DeviceBuffer<float> d_f_;
+    DeviceBuffer<RVec> d_f_;
     //! Number of particles saved in the force buffer
     int d_fSize_ = -1;
     //! Allocation size for the force buffer
@@ -428,7 +428,7 @@ private:
      *  \param[in]  atomLocality   If all, local or non-local ranges should be copied.
      *  \param[in]  commandStream  GPU stream to execute copy in.
      */
-    void copyToDevice(DeviceBuffer<float>            d_data,
+    void copyToDevice(DeviceBuffer<RVec>             d_data,
                       gmx::ArrayRef<const gmx::RVec> h_data,
                       int                            dataSize,
                       AtomLocality                   atomLocality,
@@ -443,7 +443,7 @@ private:
      *  \param[in]  commandStream  GPU stream to execute copy in.
      */
     void copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
-                        DeviceBuffer<float>      d_data,
+                        DeviceBuffer<RVec>       d_data,
                         int                      dataSize,
                         AtomLocality             atomLocality,
                         CommandStream            commandStream);
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
index 031327e80d..4b385a5a7b 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -193,18 +193,18 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
         numAtomsPadded = numAtomsAll_;
     }
 
-    reallocateDeviceBuffer(&d_x_, DIM * numAtomsPadded, &d_xSize_, &d_xCapacity_, deviceContext_);
+    reallocateDeviceBuffer(&d_x_, numAtomsPadded, &d_xSize_, &d_xCapacity_, deviceContext_);
 
     const size_t paddingAllocationSize = numAtomsPadded - numAtomsAll_;
     if (paddingAllocationSize > 0)
     {
         // The PME stream is used here because the padding region of d_x_ is only in the PME task.
-        clearDeviceBufferAsync(&d_x_, DIM * numAtomsAll_, DIM * paddingAllocationSize, pmeStream_);
+        clearDeviceBufferAsync(&d_x_, numAtomsAll_, paddingAllocationSize, pmeStream_);
     }
 
-    reallocateDeviceBuffer(&d_v_, DIM * numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
+    reallocateDeviceBuffer(&d_v_, numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
     const int d_fOldCapacity = d_fCapacity_;
-    reallocateDeviceBuffer(&d_f_, DIM * numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
+    reallocateDeviceBuffer(&d_f_, numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
     // Clearing of the forces can be done in local stream since the nonlocal stream cannot reach
     // the force accumulation stage before syncing with the local stream. Only done in CUDA,
     // since the force buffer ops are not implemented in OpenCL.
@@ -249,7 +249,7 @@ std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality
     return std::make_tuple(atomsStartAt, numAtomsToCopy);
 }
 
-void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<float>                  d_data,
+void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<RVec>                   d_data,
                                                 const gmx::ArrayRef<const gmx::RVec> h_data,
                                                 int                                  dataSize,
                                                 AtomLocality                         atomLocality,
@@ -269,18 +269,15 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<float>
     int atomsStartAt, numAtomsToCopy;
     std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
 
-    int elementsStartAt   = atomsStartAt * DIM;
-    int numElementsToCopy = numAtomsToCopy * DIM;
-
     if (numAtomsToCopy != 0)
     {
-        GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize,
+        GMX_ASSERT(atomsStartAt + numAtomsToCopy <= dataSize,
                    "The device allocation is smaller than requested copy range.");
         GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(),
                    "The host buffer is smaller than the requested copy range.");
 
-        copyToDeviceBuffer(&d_data, reinterpret_cast<const float*>(&h_data.data()[atomsStartAt]),
-                           elementsStartAt, numElementsToCopy, commandStream, transferKind_, nullptr);
+        copyToDeviceBuffer(&d_data, reinterpret_cast<const RVec*>(&h_data.data()[atomsStartAt]),
+                           atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr);
     }
 
     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
@@ -288,7 +285,7 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<float>
 }
 
 void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
-                                                  DeviceBuffer<float>      d_data,
+                                                  DeviceBuffer<RVec>       d_data,
                                                   int                      dataSize,
                                                   AtomLocality             atomLocality,
                                                   CommandStream            commandStream)
@@ -307,25 +304,22 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_dat
     int atomsStartAt, numAtomsToCopy;
     std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
 
-    int elementsStartAt   = atomsStartAt * DIM;
-    int numElementsToCopy = numAtomsToCopy * DIM;
-
     if (numAtomsToCopy != 0)
     {
-        GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize,
+        GMX_ASSERT(atomsStartAt + numAtomsToCopy <= dataSize,
                    "The device allocation is smaller than requested copy range.");
         GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(),
                    "The host buffer is smaller than the requested copy range.");
 
-        copyFromDeviceBuffer(reinterpret_cast<float*>(&h_data.data()[atomsStartAt]), &d_data,
-                             elementsStartAt, numElementsToCopy, commandStream, transferKind_, nullptr);
+        copyFromDeviceBuffer(reinterpret_cast<RVec*>(&h_data.data()[atomsStartAt]), &d_data,
+                             atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr);
     }
 
     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
 }
 
-DeviceBuffer<float> StatePropagatorDataGpu::Impl::getCoordinates()
+DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getCoordinates()
 {
     return d_x_;
 }
@@ -422,7 +416,7 @@ void StatePropagatorDataGpu::Impl::waitCoordinatesReadyOnHost(AtomLocality atomL
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::Impl::getVelocities()
+DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getVelocities()
 {
     return d_v_;
 }
@@ -476,7 +470,7 @@ void StatePropagatorDataGpu::Impl::waitVelocitiesReadyOnHost(AtomLocality atomLo
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::Impl::getForces()
+DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getForces()
 {
     return d_f_;
 }
@@ -595,7 +589,7 @@ std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomL
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::getCoordinates()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getCoordinates()
 {
     return impl_->getCoordinates();
 }
@@ -635,7 +629,7 @@ void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality atomLocalit
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::getVelocities()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getVelocities()
 {
     return impl_->getVelocities();
 }
@@ -662,7 +656,7 @@ void StatePropagatorDataGpu::waitVelocitiesReadyOnHost(AtomLocality atomLocality
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::getForces()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getForces()
 {
     return impl_->getForces();
 }
diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp
index 89d2e762b8..cd5e346303 100644
--- a/src/gromacs/nbnxm/atomdata.cpp
+++ b/src/gromacs/nbnxm/atomdata.cpp
@@ -1075,7 +1075,7 @@ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet&   gridSet,
                                     const gmx::AtomLocality locality,
                                     bool                    fillLocal,
                                     NbnxmGpu*               gpu_nbv,
-                                    DeviceBuffer<float>     d_x,
+                                    DeviceBuffer<RVec>      d_x,
                                     GpuEventSynchronizer*   xReadyOnDevice)
 {
 
@@ -1459,7 +1459,7 @@ void reduceForces(nbnxn_atomdata_t* nbat, const gmx::AtomLocality locality, cons
 
 /* Add the force array(s) from nbnxn_atomdata_t to f */
 void reduceForcesGpu(const gmx::AtomLocality                    locality,
-                     DeviceBuffer<float>                        totalForcesDevice,
+                     DeviceBuffer<RVec>                         totalForcesDevice,
                      const Nbnxm::GridSet&                      gridSet,
                      void*                                      pmeForcesDevice,
                      gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h
index ceb87f71f0..22d115706e 100644
--- a/src/gromacs/nbnxm/atomdata.h
+++ b/src/gromacs/nbnxm/atomdata.h
@@ -376,12 +376,12 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet& gridSet,
  * \param[in]     d_x        Coordinates to be copied (in plain rvec format).
  * \param[in]     xReadyOnDevice   Event synchronizer indicating that the coordinates are ready in the device memory.
  */
-void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet& gridSet,
-                                    gmx::AtomLocality     locality,
-                                    bool                  fillLocal,
-                                    NbnxmGpu*             gpu_nbv,
-                                    DeviceBuffer<float>   d_x,
-                                    GpuEventSynchronizer* xReadyOnDevice);
+void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet&   gridSet,
+                                    gmx::AtomLocality       locality,
+                                    bool                    fillLocal,
+                                    NbnxmGpu*               gpu_nbv,
+                                    DeviceBuffer<gmx::RVec> d_x,
+                                    GpuEventSynchronizer*   xReadyOnDevice);
 
 /*! \brief Add the computed forces to \p f, an internal reduction might be performed as well
  *
@@ -404,7 +404,7 @@ void reduceForces(nbnxn_atomdata_t* nbat, gmx::AtomLocality locality, const Nbnx
  * \param[in]  accumulateForce      Whether there are usefull data already in the total force buffer.
  */
 void reduceForcesGpu(gmx::AtomLocality                          locality,
-                     DeviceBuffer<float>                        totalForcesDevice,
+                     DeviceBuffer<gmx::RVec>                    totalForcesDevice,
                      const Nbnxm::GridSet&                      gridSet,
                      void*                                      pmeForcesDevice,
                      gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 94e99879db..2defa174d6 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -818,7 +818,7 @@ void cuda_set_cacheconfig()
 void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
                            bool                      setFillerCoords,
                            NbnxmGpu*                 nb,
-                           DeviceBuffer<float>       d_x,
+                           DeviceBuffer<gmx::RVec>   d_x,
                            GpuEventSynchronizer*     xReadyOnDevice,
                            const Nbnxm::AtomLocality locality,
                            int                       gridId,
@@ -862,12 +862,13 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
         auto kernelFn = setFillerCoords ? nbnxn_gpu_x_to_nbat_x_kernel<true>
                                         : nbnxn_gpu_x_to_nbat_x_kernel<false>;
         float4*    d_xq          = adat->xq;
+        float3*    d_xFloat3     = asFloat3(d_x);
         const int* d_atomIndices = nb->atomIndices;
         const int* d_cxy_na      = &nb->cxy_na[numColumnsMax * gridId];
         const int* d_cxy_ind     = &nb->cxy_ind[numColumnsMax * gridId];
-        const auto kernelArgs =
-                prepareGpuKernelArguments(kernelFn, config, &numColumns, &d_xq, &d_x, &d_atomIndices,
-                                          &d_cxy_na, &d_cxy_ind, &cellOffset, &numAtomsPerCell);
+        const auto kernelArgs    = prepareGpuKernelArguments(kernelFn, config, &numColumns, &d_xq,
+                                                          &d_xFloat3, &d_atomIndices, &d_cxy_na,
+                                                          &d_cxy_ind, &cellOffset, &numAtomsPerCell);
         launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs);
     }
 
@@ -884,7 +885,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
  *       forces only after Local stream already done so.
  */
 void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLocality,
-                               DeviceBuffer<float>                        totalForcesDevice,
+                               DeviceBuffer<gmx::RVec>                    totalForcesDevice,
                                NbnxmGpu*                                  nb,
                                void*                                      pmeForcesDevice,
                                gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
@@ -935,8 +936,8 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLo
     }
 
     const float3* d_fNB    = adat->f;
-    const float3* d_fPme   = (float3*)pmeForcesDevice;
-    float3*       d_fTotal = (float3*)totalForcesDevice;
+    const float3* d_fPme   = static_cast<float3*>(pmeForcesDevice);
+    float3*       d_fTotal = asFloat3(totalForcesDevice);
     const int*    d_cell   = nb->cell;
 
     const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &d_fNB, &d_fPme, &d_fTotal,
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 07b24fc923..ab79a8cf5b 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -140,7 +140,7 @@ void nonbonded_verlet_t::convertCoordinates(const gmx::AtomLocality        local
 
 void nonbonded_verlet_t::convertCoordinatesGpu(const gmx::AtomLocality locality,
                                                const bool              fillLocal,
-                                               DeviceBuffer<float>     d_x,
+                                               DeviceBuffer<gmx::RVec> d_x,
                                                GpuEventSynchronizer*   xReadyOnDevice)
 {
     wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
@@ -178,7 +178,7 @@ void nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const gmx::AtomLocality  local
 }
 
 void nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const gmx::AtomLocality locality,
-                                                      DeviceBuffer<float>     totalForcesDevice,
+                                                      DeviceBuffer<gmx::RVec> totalForcesDevice,
                                                       void*                   forcesPmeDevice,
                                                       gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
                                                       bool useGpuFPmeReduction,
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 50fed0fe5b..2fa353a848 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -286,10 +286,10 @@ public:
      * \param[in] d_x             GPU coordinates buffer in plain rvec format to be transformed.
      * \param[in] xReadyOnDevice  Event synchronizer indicating that the coordinates are ready in the device memory.
      */
-    void convertCoordinatesGpu(gmx::AtomLocality     locality,
-                               bool                  fillLocal,
-                               DeviceBuffer<float>   d_x,
-                               GpuEventSynchronizer* xReadyOnDevice);
+    void convertCoordinatesGpu(gmx::AtomLocality       locality,
+                               bool                    fillLocal,
+                               DeviceBuffer<gmx::RVec> d_x,
+                               GpuEventSynchronizer*   xReadyOnDevice);
 
     //! Init for GPU version of setup coordinates in Nbnxm
     void atomdata_init_copy_x_to_nbat_x_gpu();
@@ -349,7 +349,7 @@ public:
      * \param [in]     accumulateForce      If the total force buffer already contains data
      */
     void atomdata_add_nbat_f_to_f_gpu(gmx::AtomLocality                          locality,
-                                      DeviceBuffer<float>                        totalForcesDevice,
+                                      DeviceBuffer<gmx::RVec>                    totalForcesDevice,
                                       void*                                      forcesPmeDevice,
                                       gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
                                       bool useGpuFPmeReduction,
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index 7b9e4b80f9..2370fc836b 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -242,8 +242,8 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused& gridSet,
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid,
                            bool gmx_unused setFillerCoords,
-                           NbnxmGpu gmx_unused* gpu_nbv,
-                           DeviceBuffer<float> gmx_unused d_x,
+                           NbnxmGpu gmx_unused*    gpu_nbv,
+                           DeviceBuffer<gmx::RVec> gmx_unused d_x,
                            GpuEventSynchronizer gmx_unused* xReadyOnDevice,
                            gmx::AtomLocality gmx_unused locality,
                            int gmx_unused gridId,
@@ -312,7 +312,7 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused* cell,
  */
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_add_nbat_f_to_f(gmx::AtomLocality gmx_unused atomLocality,
-                               DeviceBuffer<float> gmx_unused totalForcesDevice,
+                               DeviceBuffer<gmx::RVec> gmx_unused totalForcesDevice,
                                NbnxmGpu gmx_unused* gpu_nbv,
                                void gmx_unused*                           pmeForcesDevice,
                                gmx::ArrayRef<GpuEventSynchronizer* const> gmx_unused dependencyList,
-- 
2.22.0