Use RVec instead of float for x, v and f device buffers
authorArtem Zhmurov <zhmurov@gmail.com>
Mon, 20 Jan 2020 17:32:46 +0000 (18:32 +0100)
committerPaul Bauer <paul.bauer.q@gmail.com>
Thu, 6 Feb 2020 07:49:39 +0000 (08:49 +0100)
Using RVec instead of float as the data type for the coordinate,
velocity and force device buffers makes it possible to remove
multiplications by DIM when addresses, offsets and sizes are
computed. Since the native device types are not used in the CPU
part of the code, the type casting remains.
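
For example, in pme_coordinate_receiver_gpu_impl.cu the address of
the buffer section to be sent to a PP rank changes from

    void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start * DIM]);

to

    void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start]);

because d_x is now indexed per atom rather than per float component.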

Refs #3312 and #2936

Change-Id: Iaea914a474195f214ca860f7345f6878b9a04813

33 files changed:
src/gromacs/domdec/gpuhaloexchange.h
src/gromacs/domdec/gpuhaloexchange_impl.cpp
src/gromacs/domdec/gpuhaloexchange_impl.cu
src/gromacs/ewald/pme.h
src/gromacs/ewald/pme_calculate_splines.cuh
src/gromacs/ewald/pme_coordinate_receiver_gpu.h
src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp
src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu
src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h
src/gromacs/ewald/pme_gather.cu
src/gromacs/ewald/pme_gpu.cpp
src/gromacs/ewald/pme_gpu_internal.cpp
src/gromacs/ewald/pme_gpu_internal.h
src/gromacs/ewald/pme_gpu_types.h
src/gromacs/ewald/pme_spread.cu
src/gromacs/gpu_utils/cudautils.cuh
src/gromacs/gpu_utils/devicebuffer_ocl.h
src/gromacs/gpu_utils/tests/CMakeLists.txt
src/gromacs/gpu_utils/tests/gpu_utils.cpp [new file with mode: 0644]
src/gromacs/mdlib/update_constrain_gpu.h
src/gromacs/mdlib/update_constrain_gpu_impl.cpp
src/gromacs/mdlib/update_constrain_gpu_impl.cu
src/gromacs/mdlib/update_constrain_gpu_impl.h
src/gromacs/mdtypes/state_propagator_data_gpu.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
src/gromacs/nbnxm/atomdata.cpp
src/gromacs/nbnxm/atomdata.h
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/nbnxm.cpp
src/gromacs/nbnxm/nbnxm.h
src/gromacs/nbnxm/nbnxm_gpu.h

index b7e6ff54fa70c0aa273aa3300ac177e137384a26..d32b1800c4568bc32c3bcae91c8e244066e0b1ed 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -92,7 +92,7 @@ public:
      * \param [in] d_coordinateBuffer   pointer to coordinates buffer in GPU memory
     * \param [in] d_forcesBuffer   pointer to forces buffer in GPU memory
      */
-    void reinitHalo(DeviceBuffer<float> d_coordinateBuffer, DeviceBuffer<float> d_forcesBuffer);
+    void reinitHalo(DeviceBuffer<RVec> d_coordinateBuffer, DeviceBuffer<RVec> d_forcesBuffer);
 
 
     /*! \brief GPU halo exchange of coordinates buffer.
index 2511673218b3eac56b681e808c80da36989b17c6..a17c550c6c9044e77bad079c615a1e12e5ac7f39 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -73,8 +73,8 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */,
 GpuHaloExchange::~GpuHaloExchange() = default;
 
 /*!\brief init halo exchange stub. */
-void GpuHaloExchange::reinitHalo(DeviceBuffer<float> /* d_coordinatesBuffer */,
-                                 DeviceBuffer<float> /* d_forcesBuffer */)
+void GpuHaloExchange::reinitHalo(DeviceBuffer<RVec> /* d_coordinatesBuffer */,
+                                 DeviceBuffer<RVec> /* d_forcesBuffer */)
 {
     GMX_ASSERT(false,
                "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
index 660566a9dd9117343497c5d6308c8c6e255f840e..4313ffacb086e2109f68a735788bfa75b4ebe429 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -57,6 +57,7 @@
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"
+#include "gromacs/math/vectypes.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/utility/gmxmpi.h"
 
@@ -435,10 +436,9 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void
 
 GpuHaloExchange::~GpuHaloExchange() = default;
 
-void GpuHaloExchange::reinitHalo(DeviceBuffer<float> d_coordinatesBuffer, DeviceBuffer<float> d_forcesBuffer)
+void GpuHaloExchange::reinitHalo(DeviceBuffer<RVec> d_coordinatesBuffer, DeviceBuffer<RVec> d_forcesBuffer)
 {
-    impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer),
-                      reinterpret_cast<float3*>(d_forcesBuffer));
+    impl_->reinitHalo(asFloat3(d_coordinatesBuffer), asFloat3(d_forcesBuffer));
 }
 
 void GpuHaloExchange::communicateHaloCoordinates(const matrix          box,
index edbe2835230fb635368bfa04af33cce0a5215b48..f1f829e445690537f14b3811c74877ed51121a95 100644 (file)
@@ -451,8 +451,8 @@ GPU_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t* GPU_FUNC_ARG
  * \param[in] pme            The PME data structure.
  * \param[in] d_x            The pointer to the positions buffer to be set
  */
-GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t*    GPU_FUNC_ARGUMENT(pme),
-                                             DeviceBuffer<float> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t*        GPU_FUNC_ARGUMENT(pme),
+                                             DeviceBuffer<gmx::RVec> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
 
 /*! \brief Get pointer to device copy of force data.
  * \param[in] pme            The PME data structure.
index f52c81c59925aad2147f5c48a14fd5f21e6101c3..4e5dc6e8882356a9ba4ad81b83a6068a4b91312b 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 //! Controls whether the atom and charge data is prefetched into shared memory or loaded per thread from global
 static const bool c_useAtomDataPrefetch = true;
 
+/*! \brief Asserts that the argument is finite.
+ *
+ *  The function works for any data type that can be cast to float. Note that there is also
+ *  a specialized implementation for the float3 data type.
+ *
+ * \param[in] arg  Argument to check.
+ */
+template<typename T>
+__device__ inline void assertIsFinite(T arg);
+
+template<>
+__device__ inline void assertIsFinite(float3 arg)
+{
+    assert(isfinite(float(arg.x)));
+    assert(isfinite(float(arg.y)));
+    assert(isfinite(float(arg.z)));
+}
+
+template<typename T>
+__device__ inline void assertIsFinite(T arg)
+{
+    assert(isfinite(float(arg)));
+}
+
 /*! \brief
  * General purpose function for loading atom-related data from global to shared memory.
  *
@@ -79,7 +103,7 @@ __device__ __forceinline__ void pme_gpu_stage_atom_data(const PmeGpuCudaKernelPa
             pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
     if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
     {
-        assert(isfinite(float(gm_source[globalIndex])));
+        assertIsFinite(gm_source[globalIndex]);
         sm_destination[localIndex] = gm_source[globalIndex];
     }
 }
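
Note on the templated assert above: with the new types, the two staging
calls in the spread and gather kernels are instantiated as

    pme_gpu_stage_atom_data<float,  atomsPerBlock, 1>(kernelParams, sm_coefficients, gm_coefficients);
    pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(kernelParams, sm_coordinates, gm_coordinates);

so assertIsFinite(gm_source[globalIndex]) resolves to the generic
template (scalar check) for the charges and to the float3
specialization (per-component check) for the coordinates.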
index 5e4c4967ba40cd0496c22dc3c6196f7f6487749e..b5d02a719e2f813ae3f228ff3dc659d256a89667 100644 (file)
@@ -43,6 +43,7 @@
 #define GMX_PMECOORDINATERECEIVERGPU_H
 
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/math/vectypes.h"
 #include "gromacs/utility/classhelpers.h"
 #include "gromacs/utility/gmxmpi.h"
 
@@ -70,7 +71,7 @@ public:
      * send coordinates buffer address to PP rank
      * \param[in] d_x   coordinates buffer in GPU memory
      */
-    void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> d_x);
+    void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x);
 
 
     /*! \brief
index c026d41b312234c3514d0916ca2370faf9a5a7cf..0cb848e6c06ff8b557c6a2cd05b285e641d30159 100644 (file)
@@ -75,7 +75,7 @@ PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const void* /* pmeStream */,
 PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default;
 
 /*!\brief init PME-PP GPU communication stub */
-void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> /* d_x */)
+void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> /* d_x */)
 {
     GMX_ASSERT(false,
                "A CPU stub for PME-PP GPU communication initialization was called instead of the "
index 1584a9e844a2b27fc0150347bed5e52555e66847..b2e7fa009d70b73536e702f916b3dfb6e5890f10 100644 (file)
@@ -69,7 +69,7 @@ PmeCoordinateReceiverGpu::Impl::Impl(const void* pmeStream, MPI_Comm comm, gmx::
 
 PmeCoordinateReceiverGpu::Impl::~Impl() = default;
 
-void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> d_x)
+void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x)
 {
 
     int ind_start = 0;
@@ -80,7 +80,7 @@ void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(Device
         ind_end   = ind_start + receiver.numAtoms;
 
         // Data will be transferred directly from GPU.
-        void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start * DIM]);
+        void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start]);
 
 #if GMX_MPI
         MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_);
@@ -131,7 +131,7 @@ PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const void*            pmeStr
 
 PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default;
 
-void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> d_x)
+void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x)
 {
     impl_->sendCoordinateBufferAddressToPpRanks(d_x);
 }
index 874fb2747ea528de8d6b6291cb12f48c7fea3ee5..4f3bbe2e4e4478f29f27e9c3597c422a36f172f8 100644 (file)
@@ -69,7 +69,7 @@ public:
      * send coordinates buffer address to PP rank
      * \param[in] d_x   coordinates buffer in GPU memory
      */
-    void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> d_x);
+    void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x);
 
     /*! \brief
      * launch receive of coordinate data from PP rank
index 8b2ff5f80e7e0a3e1623bc6c10501eb50cfe48a5..616516df2335214cb16b118cb84d796d1e7a5b33 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -44,6 +44,7 @@
 #include <cassert>
 
 #include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+#include "gromacs/gpu_utils/cudautils.cuh"
 
 #include "pme.cuh"
 #include "pme_calculate_splines.cuh"
@@ -321,32 +322,27 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
     }
     else
     {
+        const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates);
         /* Recalculate splines */
         if (c_useAtomDataPrefetch)
         {
             // charges
             __shared__ float sm_coefficients[atomsPerBlock];
             // Coordinates
-            __shared__ float sm_coordinates[DIM * atomsPerBlock];
+            __shared__ float3 sm_coordinates[atomsPerBlock];
             /* Staging coefficients/charges */
-            pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients,
-                                                             kernelParams.atoms.d_coefficients);
+            pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, gm_coefficients);
 
             /* Staging coordinates */
-            pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates,
-                                                               kernelParams.atoms.d_coordinates);
+            pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(kernelParams, sm_coordinates, gm_coordinates);
             __syncthreads();
-            atomX.x    = sm_coordinates[atomIndexLocal * DIM + XX];
-            atomX.y    = sm_coordinates[atomIndexLocal * DIM + YY];
-            atomX.z    = sm_coordinates[atomIndexLocal * DIM + ZZ];
+            atomX      = sm_coordinates[atomIndexLocal];
             atomCharge = sm_coefficients[atomIndexLocal];
         }
         else
         {
+            atomX      = gm_coordinates[atomIndexGlobal];
             atomCharge = gm_coefficients[atomIndexGlobal];
-            atomX.x    = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + XX];
-            atomX.y    = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + YY];
-            atomX.z    = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + ZZ];
         }
         calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false>(
                 kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices);
index aafd5c1a4d5a4fe888e99f940eb8000f7c396cfd..44fafc205cd37934be755c51ea34c38d415aaac5 100644 (file)
@@ -440,7 +440,7 @@ void* pme_gpu_get_device_f(const gmx_pme_t* pme)
     return pme_gpu_get_kernelparam_forces(pme->gpu);
 }
 
-void pme_gpu_set_device_x(const gmx_pme_t* pme, DeviceBuffer<float> d_x)
+void pme_gpu_set_device_x(const gmx_pme_t* pme, DeviceBuffer<gmx::RVec> d_x)
 {
     GMX_ASSERT(pme != nullptr, "Null pointer is passed as a PME to the set coordinates function.");
     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
index 9c984025256fcfc96c78e9d8357c63d7d0ddeef1..1aa2052907a0d643ad5a01c03fdbf3b98248def4 100644 (file)
@@ -1523,7 +1523,7 @@ void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
     }
 }
 
-void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<float> d_x)
+void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<gmx::RVec> d_x)
 {
     GMX_ASSERT(pmeGpu && pmeGpu->kernelParams,
                "PME GPU device buffer can not be set in non-GPU builds or before the GPU PME was "
index cc7e9d1f342556add51c16ae31f880f1c82fd1e7..2816a22addc5bb885a7fadff1a30f5680f8f1a60 100644 (file)
@@ -395,7 +395,7 @@ GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu*                GPU_FUNC_ARGUMENT(
  * \param[in] d_x            Pointer to coordinate data
  */
 GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
-                                                            DeviceBuffer<float> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
+                                                            DeviceBuffer<gmx::RVec> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
 
 /*! \brief Return pointer to device copy of force data.
  * \param[in] pmeGpu         The PME GPU structure.
index be501f2cfa83f953e69d63047e63762c5dcbffaa..5ddd79fb5a39618557d717fbd48c7bc3ae33dc94 100644 (file)
@@ -157,7 +157,7 @@ struct PmeGpuAtomParams
      * The coordinates themselves change and need to be copied to the GPU for every PME computation,
      * but reallocation happens only at DD.
      */
-    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_coordinates;
+    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<gmx::RVec>) d_coordinates;
     /*! \brief Global GPU memory array handle with input atom charges.
      * The charges only need to be reallocated and copied to the GPU at DD step.
      */
index 3d02e43d0d1dc5b3c63452f7821a71b0cd9782bd..99f7828c86707276d6e976601086d4cd93abf1ae 100644 (file)
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013-2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2013-2016,2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -47,6 +47,7 @@
 #include <cassert>
 
 #include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+#include "gromacs/gpu_utils/cudautils.cuh"
 
 #include "pme.cuh"
 #include "pme_calculate_splines.cuh"
@@ -228,24 +229,20 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU
 
     if (computeSplines)
     {
+        const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates);
         if (c_useAtomDataPrefetch)
         {
             // Coordinates
-            __shared__ float sm_coordinates[DIM * atomsPerBlock];
+            __shared__ float3 sm_coordinates[atomsPerBlock];
 
             /* Staging coordinates */
-            pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates,
-                                                               kernelParams.atoms.d_coordinates);
+            pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(kernelParams, sm_coordinates, gm_coordinates);
             __syncthreads();
-            atomX.x = sm_coordinates[atomIndexLocal * DIM + XX];
-            atomX.y = sm_coordinates[atomIndexLocal * DIM + YY];
-            atomX.z = sm_coordinates[atomIndexLocal * DIM + ZZ];
+            atomX = sm_coordinates[atomIndexLocal];
         }
         else
         {
-            atomX.x = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + XX];
-            atomX.y = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + YY];
-            atomX.z = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + ZZ];
+            atomX = gm_coordinates[atomIndexGlobal];
         }
         calculate_splines<order, atomsPerBlock, atomsPerWarp, false, writeGlobal>(
                 kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, &dtheta, sm_gridlineIndices);
index 71d9b7dac45dac38ac76b96357cdec6b91e0acde..6f41e8589cef48b528bc3e1b48d2967764ae4c0d 100644 (file)
@@ -215,6 +215,19 @@ static inline void rvec_inc(rvec a, const float3 b)
     rvec tmp = { b.x, b.y, b.z };
     rvec_inc(a, tmp);
 }
+/*! \brief Cast RVec buffer to float3 buffer.
+ *
+ * \param[in] in The RVec buffer to cast.
+ *
+ * \returns Buffer cast to float3*.
+ */
+static inline __host__ __device__ float3* asFloat3(gmx::RVec* in)
+{
+    static_assert(sizeof(in[0]) == sizeof(float3),
+                  "Size of the host-side data-type is different from the size of the device-side "
+                  "counterpart.");
+    return reinterpret_cast<float3*>(in);
+}
 
 /*! \brief Wait for all tasks in stream \p s to complete.
  *
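
A minimal usage sketch for the asFloat3 helper above (hypothetical
names, not code from this change):

    gmx::RVec* d_x = /* device allocation, one RVec per atom */;
    float3*    x3  = asFloat3(d_x); // same memory, viewed as CUDA float3
    // x3[i].x aliases d_x[i][XX], so per-atom indexing needs no DIM factor

The static_assert guards the reinterpret_cast: the cast only compiles
while the host-side RVec and the device-side float3 have the same size.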
index 2c92c81c1be361d2cbdee1f282bbe108d7d792e0..40f1e12941c5f40d010459abb4394d8ccdf22458 100644 (file)
@@ -228,7 +228,7 @@ void clearDeviceBufferAsync(DeviceBuffer<ValueType>* buffer, size_t startingOffs
     GMX_ASSERT(buffer, "needs a buffer pointer");
     const size_t    offset        = startingOffset * sizeof(ValueType);
     const size_t    bytes         = numValues * sizeof(ValueType);
-    const ValueType pattern       = 0;
+    const int       pattern       = 0;
     const cl_uint   numWaitEvents = 0;
     const cl_event* waitEvents    = nullptr;
     cl_event        commandEvent;
index c2fc3426bb09a97d7727b1b60ae05fde192c038d..e9a62b792b2df59736590cb43298492a031ca766 100644 (file)
@@ -1,7 +1,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
+# Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -48,6 +48,7 @@ if(GMX_USE_CUDA)
     # CUDA-only test
     list(APPEND SOURCES_FROM_CXX
         pinnedmemorychecker.cpp
+        gpu_utils.cpp
         )
     # TODO Making a separate library is heavy handed, but nothing else
     # seems to work. Also don't use a hyphen in its name, because nvcc
diff --git a/src/gromacs/gpu_utils/tests/gpu_utils.cpp b/src/gromacs/gpu_utils/tests/gpu_utils.cpp
new file mode 100644 (file)
index 0000000..5adf0a7
--- /dev/null
@@ -0,0 +1,89 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Tests for CUDA float3 type layout.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include <vector>
+
+#ifndef __CUDA_ARCH__
+/*! \brief Dummy definition to avoid compiler error
+ *
+ * \todo Find a better solution. Probably move the asFloat3(...) function to a different header.
+ */
+#    define __CUDA_ARCH__ -1
+#    include <cuda_runtime.h>
+#    undef __CUDA_ARCH__
+#else
+#    include <cuda_runtime.h>
+#endif
+#include <gtest/gtest.h>
+
+#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/math/vectypes.h"
+#include "gromacs/utility/real.h"
+
+#if GMX_GPU == GMX_GPU_CUDA
+
+namespace gmx
+{
+
+namespace test
+{
+
+TEST(GpuDataTypesCompatibilityTest, RVecAndFloat3)
+{
+    std::vector<RVec> dataRVec;
+    dataRVec.emplace_back(1.0, 2.0, 3.0);
+    dataRVec.emplace_back(4.0, 5.0, 6.0);
+    float3* dataFloat3 = asFloat3(dataRVec.data());
+    EXPECT_EQ(dataFloat3[0].x, dataRVec[0][XX]);
+    EXPECT_EQ(dataFloat3[0].y, dataRVec[0][YY]);
+    EXPECT_EQ(dataFloat3[0].z, dataRVec[0][ZZ]);
+    EXPECT_EQ(dataFloat3[1].x, dataRVec[1][XX]);
+    EXPECT_EQ(dataFloat3[1].y, dataRVec[1][YY]);
+    EXPECT_EQ(dataFloat3[1].z, dataRVec[1][ZZ]);
+}
+
+} // namespace test
+} // namespace gmx
+
+#endif // GMX_GPU == GMX_GPU_CUDA
\ No newline at end of file
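
The test above checks the layout assumption behind asFloat3: in single
precision, gmx::RVec (a BasicVector<real>) and CUDA's float3 must both
be three consecutive floats. Schematically (an illustration, not
GROMACS code):

    struct Float3Like { float x, y, z; };  // CUDA float3 layout
    struct RVecLike   { float xyz[3]; };   // BasicVector<float>, in effect
    static_assert(sizeof(Float3Like) == sizeof(RVecLike), "sizes must match");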
index e0190c367f586d75680820dd3315871461405443..359ecea5c16ce741c45cfd720b923cdfa5b9c912 100644 (file)
@@ -133,12 +133,12 @@ public:
      * \param[in]      md                  Atoms data.
      * \param[in]      numTempScaleValues  Number of temperature scaling groups. Zero for no temperature scaling.
      */
-    void set(DeviceBuffer<float> d_x,
-             DeviceBuffer<float> d_v,
-             DeviceBuffer<float> d_f,
-             const t_idef&       idef,
-             const t_mdatoms&    md,
-             int                 numTempScaleValues);
+    void set(DeviceBuffer<RVec> d_x,
+             DeviceBuffer<RVec> d_v,
+             DeviceBuffer<RVec> d_f,
+             const t_idef&      idef,
+             const t_mdatoms&   md,
+             int                numTempScaleValues);
 
     /*! \brief
      * Update PBC data.
index 919ba9b36b97a81b5f86e0a8200752be1956321f..47671ef7de95cc85e5d5b13fb71722b1723afbc2 100644 (file)
@@ -88,9 +88,9 @@ void UpdateConstrainGpu::scaleCoordinates(const matrix /* scalingMatrix */)
                "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
 }
 
-void UpdateConstrainGpu::set(DeviceBuffer<float> /* d_x */,
-                             DeviceBuffer<float> /* d_v */,
-                             const DeviceBuffer<float> /* d_f */,
+void UpdateConstrainGpu::set(DeviceBuffer<RVec> /* d_x */,
+                             DeviceBuffer<RVec> /* d_v */,
+                             const DeviceBuffer<RVec> /* d_f */,
                              const t_idef& /* idef */,
                              const t_mdatoms& /* md */,
                              const int /* numTempScaleValues */)
index ebbe15a02ff5059be0305a748f05651ad9fe4a33..6991ef0dc351ca2732eb0e3e46c3c68eadd959fc 100644 (file)
@@ -188,12 +188,12 @@ UpdateConstrainGpu::Impl::Impl(const t_inputrec&     ir,
 
 UpdateConstrainGpu::Impl::~Impl() {}
 
-void UpdateConstrainGpu::Impl::set(DeviceBuffer<float>       d_x,
-                                   DeviceBuffer<float>       d_v,
-                                   const DeviceBuffer<float> d_f,
-                                   const t_idef&             idef,
-                                   const t_mdatoms&          md,
-                                   const int                 numTempScaleValues)
+void UpdateConstrainGpu::Impl::set(DeviceBuffer<RVec>       d_x,
+                                   DeviceBuffer<RVec>       d_v,
+                                   const DeviceBuffer<RVec> d_f,
+                                   const t_idef&            idef,
+                                   const t_mdatoms&         md,
+                                   const int                numTempScaleValues)
 {
     GMX_ASSERT(d_x != nullptr, "Coordinates device buffer should not be null.");
     GMX_ASSERT(d_v != nullptr, "Velocities device buffer should not be null.");
@@ -259,12 +259,12 @@ void UpdateConstrainGpu::scaleCoordinates(const matrix scalingMatrix)
     impl_->scaleCoordinates(scalingMatrix);
 }
 
-void UpdateConstrainGpu::set(DeviceBuffer<float>       d_x,
-                             DeviceBuffer<float>       d_v,
-                             const DeviceBuffer<float> d_f,
-                             const t_idef&             idef,
-                             const t_mdatoms&          md,
-                             const int                 numTempScaleValues)
+void UpdateConstrainGpu::set(DeviceBuffer<RVec>       d_x,
+                             DeviceBuffer<RVec>       d_v,
+                             const DeviceBuffer<RVec> d_f,
+                             const t_idef&            idef,
+                             const t_mdatoms&         md,
+                             const int                numTempScaleValues)
 {
     impl_->set(d_x, d_v, d_f, idef, md, numTempScaleValues);
 }
index 5a28045afd48e82b21ec62cb91864e06233ed81e..0009112dc624772552987111b4ac958be14b327d 100644 (file)
@@ -133,12 +133,12 @@ public:
      * \param[in] md                  Atoms data.
      * \param[in] numTempScaleValues  Number of temperature scaling groups. Set zero for no temperature coupling.
      */
-    void set(DeviceBuffer<float>       d_x,
-             DeviceBuffer<float>       d_v,
-             const DeviceBuffer<float> d_f,
-             const t_idef&             idef,
-             const t_mdatoms&          md,
-             const int                 numTempScaleValues);
+    void set(DeviceBuffer<RVec>       d_x,
+             DeviceBuffer<RVec>       d_v,
+             const DeviceBuffer<RVec> d_f,
+             const t_idef&            idef,
+             const t_mdatoms&         md,
+             const int                numTempScaleValues);
 
     /*! \brief
      * Update PBC data.
index 44cbc8d7e53887f0aa75abc272dbbe15bcb2f589..d75cd78ea1e2b742d559580b7d92ccfe5144a019 100644 (file)
@@ -184,7 +184,7 @@ public:
      *
      *  \returns GPU positions buffer.
      */
-    DeviceBuffer<float> getCoordinates();
+    DeviceBuffer<RVec> getCoordinates();
 
     /*! \brief Copy positions to the GPU memory.
      *
@@ -245,7 +245,7 @@ public:
      *
      *  \returns GPU velocities buffer.
      */
-    DeviceBuffer<float> getVelocities();
+    DeviceBuffer<RVec> getVelocities();
 
     /*! \brief Copy velocities to the GPU memory.
      *
@@ -280,7 +280,7 @@ public:
      *
      *  \returns GPU force buffer.
      */
-    DeviceBuffer<float> getForces();
+    DeviceBuffer<RVec> getForces();
 
     /*! \brief Copy forces to the GPU memory.
      *
index ae6bcd6c71eef3d217c1c46e46c79604d7078caf..15f054eafaa59354782743d04aab68e2b9d9d84a 100644 (file)
@@ -95,12 +95,12 @@ std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomL
     return std::make_tuple(0, 0);
 }
 
-DeviceBuffer<float> StatePropagatorDataGpu::getCoordinates()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getCoordinates()
 {
     GMX_ASSERT(false,
                "A CPU stub method from GPU state propagator data was called instead of one from "
                "GPU implementation.");
-    return DeviceBuffer<float>{};
+    return {};
 }
 
 GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(
@@ -153,12 +153,12 @@ void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> /*
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::getVelocities()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getVelocities()
 {
     GMX_ASSERT(false,
                "A CPU stub method from GPU state propagator data was called instead of one from "
                "GPU implementation.");
-    return DeviceBuffer<float>{};
+    return {};
 }
 
 void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> /* h_v */,
@@ -193,12 +193,12 @@ void StatePropagatorDataGpu::waitVelocitiesReadyOnHost(AtomLocality /* atomLocal
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::getForces()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getForces()
 {
     GMX_ASSERT(false,
                "A CPU stub method from GPU state propagator data was called instead of one from "
                "GPU implementation.");
-    return DeviceBuffer<float>{};
+    return {};
 }
 
 void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> /* h_f          */,
index 142327772274c4c29db5b922a43b18569868fecc..a138bcbb624f6d48f0dba52f1223a9427b4113ad 100644 (file)
@@ -181,7 +181,7 @@ public:
      *
      *  \returns GPU positions buffer.
      */
-    DeviceBuffer<float> getCoordinates();
+    DeviceBuffer<RVec> getCoordinates();
 
     /*! \brief Copy positions to the GPU memory.
      *
@@ -242,7 +242,7 @@ public:
      *
      *  \returns GPU velocities buffer.
      */
-    DeviceBuffer<float> getVelocities();
+    DeviceBuffer<RVec> getVelocities();
 
     /*! \brief Copy velocities to the GPU memory.
      *
@@ -277,7 +277,7 @@ public:
      *
      *  \returns GPU force buffer.
      */
-    DeviceBuffer<float> getForces();
+    DeviceBuffer<RVec> getForces();
 
     /*! \brief Copy forces to the GPU memory.
      *
@@ -395,21 +395,21 @@ private:
     int numAtomsAll_ = -1;
 
     //! Device positions buffer
-    DeviceBuffer<float> d_x_;
+    DeviceBuffer<RVec> d_x_;
     //! Number of particles saved in the positions buffer
     int d_xSize_ = -1;
     //! Allocation size for the positions buffer
     int d_xCapacity_ = -1;
 
     //! Device velocities buffer
-    DeviceBuffer<float> d_v_;
+    DeviceBuffer<RVec> d_v_;
     //! Number of particles saved in the velocities buffer
     int d_vSize_ = -1;
     //! Allocation size for the velocities buffer
     int d_vCapacity_ = -1;
 
     //! Device force buffer
-    DeviceBuffer<float> d_f_;
+    DeviceBuffer<RVec> d_f_;
     //! Number of particles saved in the force buffer
     int d_fSize_ = -1;
     //! Allocation size for the force buffer
@@ -428,7 +428,7 @@ private:
      *  \param[in]  atomLocality   If all, local or non-local ranges should be copied.
      *  \param[in]  commandStream  GPU stream to execute copy in.
      */
-    void copyToDevice(DeviceBuffer<float>            d_data,
+    void copyToDevice(DeviceBuffer<RVec>             d_data,
                       gmx::ArrayRef<const gmx::RVec> h_data,
                       int                            dataSize,
                       AtomLocality                   atomLocality,
@@ -443,7 +443,7 @@ private:
      *  \param[in]  commandStream  GPU stream to execute copy in.
      */
     void copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
-                        DeviceBuffer<float>      d_data,
+                        DeviceBuffer<RVec>       d_data,
                         int                      dataSize,
                         AtomLocality             atomLocality,
                         CommandStream            commandStream);
index 031327e80d844b1917b4c65c5317e95aac721b56..4b385a5a7b56ca5b82a22c57bb648a127b6b731d 100644 (file)
@@ -193,18 +193,18 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
         numAtomsPadded = numAtomsAll_;
     }
 
-    reallocateDeviceBuffer(&d_x_, DIM * numAtomsPadded, &d_xSize_, &d_xCapacity_, deviceContext_);
+    reallocateDeviceBuffer(&d_x_, numAtomsPadded, &d_xSize_, &d_xCapacity_, deviceContext_);
 
     const size_t paddingAllocationSize = numAtomsPadded - numAtomsAll_;
     if (paddingAllocationSize > 0)
     {
         // The PME stream is used here because the padding region of d_x_ is only in the PME task.
-        clearDeviceBufferAsync(&d_x_, DIM * numAtomsAll_, DIM * paddingAllocationSize, pmeStream_);
+        clearDeviceBufferAsync(&d_x_, numAtomsAll_, paddingAllocationSize, pmeStream_);
     }
 
-    reallocateDeviceBuffer(&d_v_, DIM * numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
+    reallocateDeviceBuffer(&d_v_, numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
     const int d_fOldCapacity = d_fCapacity_;
-    reallocateDeviceBuffer(&d_f_, DIM * numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
+    reallocateDeviceBuffer(&d_f_, numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
     // Clearing of the forces can be done in local stream since the nonlocal stream cannot reach
     // the force accumulation stage before syncing with the local stream. Only done in CUDA,
     // since the force buffer ops are not implemented in OpenCL.
@@ -249,7 +249,7 @@ std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality
     return std::make_tuple(atomsStartAt, numAtomsToCopy);
 }
 
-void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<float>                  d_data,
+void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<RVec>                   d_data,
                                                 const gmx::ArrayRef<const gmx::RVec> h_data,
                                                 int                                  dataSize,
                                                 AtomLocality                         atomLocality,
@@ -269,18 +269,15 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<float>
     int atomsStartAt, numAtomsToCopy;
     std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
 
-    int elementsStartAt   = atomsStartAt * DIM;
-    int numElementsToCopy = numAtomsToCopy * DIM;
-
     if (numAtomsToCopy != 0)
     {
-        GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize,
+        GMX_ASSERT(atomsStartAt + numAtomsToCopy <= dataSize,
                    "The device allocation is smaller than requested copy range.");
         GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(),
                    "The host buffer is smaller than the requested copy range.");
 
-        copyToDeviceBuffer(&d_data, reinterpret_cast<const float*>(&h_data.data()[atomsStartAt]),
-                           elementsStartAt, numElementsToCopy, commandStream, transferKind_, nullptr);
+        copyToDeviceBuffer(&d_data, reinterpret_cast<const RVec*>(&h_data.data()[atomsStartAt]),
+                           atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr);
     }
 
     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
@@ -288,7 +285,7 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<float>
 }
 
 void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
-                                                  DeviceBuffer<float>      d_data,
+                                                  DeviceBuffer<RVec>       d_data,
                                                   int                      dataSize,
                                                   AtomLocality             atomLocality,
                                                   CommandStream            commandStream)
@@ -307,25 +304,22 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_dat
     int atomsStartAt, numAtomsToCopy;
     std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
 
-    int elementsStartAt   = atomsStartAt * DIM;
-    int numElementsToCopy = numAtomsToCopy * DIM;
-
     if (numAtomsToCopy != 0)
     {
-        GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize,
+        GMX_ASSERT(atomsStartAt + numAtomsToCopy <= dataSize,
                    "The device allocation is smaller than requested copy range.");
         GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(),
                    "The host buffer is smaller than the requested copy range.");
 
-        copyFromDeviceBuffer(reinterpret_cast<float*>(&h_data.data()[atomsStartAt]), &d_data,
-                             elementsStartAt, numElementsToCopy, commandStream, transferKind_, nullptr);
+        copyFromDeviceBuffer(reinterpret_cast<RVec*>(&h_data.data()[atomsStartAt]), &d_data,
+                             atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr);
     }
 
     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
 }
 
-DeviceBuffer<float> StatePropagatorDataGpu::Impl::getCoordinates()
+DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getCoordinates()
 {
     return d_x_;
 }
@@ -422,7 +416,7 @@ void StatePropagatorDataGpu::Impl::waitCoordinatesReadyOnHost(AtomLocality atomL
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::Impl::getVelocities()
+DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getVelocities()
 {
     return d_v_;
 }
@@ -476,7 +470,7 @@ void StatePropagatorDataGpu::Impl::waitVelocitiesReadyOnHost(AtomLocality atomLo
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::Impl::getForces()
+DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getForces()
 {
     return d_f_;
 }
@@ -595,7 +589,7 @@ std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomL
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::getCoordinates()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getCoordinates()
 {
     return impl_->getCoordinates();
 }
@@ -635,7 +629,7 @@ void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality atomLocalit
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::getVelocities()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getVelocities()
 {
     return impl_->getVelocities();
 }
@@ -662,7 +656,7 @@ void StatePropagatorDataGpu::waitVelocitiesReadyOnHost(AtomLocality atomLocality
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::getForces()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getForces()
 {
     return impl_->getForces();
 }
index 89d2e762b8f6dca8107daeb64c373333585aec7d..cd5e346303471796d065a316991b7bbfe1bb59a1 100644 (file)
@@ -1075,7 +1075,7 @@ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet&   gridSet,
                                     const gmx::AtomLocality locality,
                                     bool                    fillLocal,
                                     NbnxmGpu*               gpu_nbv,
-                                    DeviceBuffer<float>     d_x,
+                                    DeviceBuffer<RVec>      d_x,
                                     GpuEventSynchronizer*   xReadyOnDevice)
 {
 
@@ -1459,7 +1459,7 @@ void reduceForces(nbnxn_atomdata_t* nbat, const gmx::AtomLocality locality, cons
 
 /* Add the force array(s) from nbnxn_atomdata_t to f */
 void reduceForcesGpu(const gmx::AtomLocality                    locality,
-                     DeviceBuffer<float>                        totalForcesDevice,
+                     DeviceBuffer<RVec>                         totalForcesDevice,
                      const Nbnxm::GridSet&                      gridSet,
                      void*                                      pmeForcesDevice,
                      gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
index ceb87f71f093f5a99e4d86005f3c770a952cb60c..22d115706e0a394f3f4f55e95676ff07675548bc 100644 (file)
@@ -376,12 +376,12 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet& gridSet,
  * \param[in]     d_x        Coordinates to be copied (in plain rvec format).
  * \param[in]     xReadyOnDevice   Event synchronizer indicating that the coordinates are ready in the device memory.
  */
-void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet& gridSet,
-                                    gmx::AtomLocality     locality,
-                                    bool                  fillLocal,
-                                    NbnxmGpu*             gpu_nbv,
-                                    DeviceBuffer<float>   d_x,
-                                    GpuEventSynchronizer* xReadyOnDevice);
+void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet&   gridSet,
+                                    gmx::AtomLocality       locality,
+                                    bool                    fillLocal,
+                                    NbnxmGpu*               gpu_nbv,
+                                    DeviceBuffer<gmx::RVec> d_x,
+                                    GpuEventSynchronizer*   xReadyOnDevice);
 
 /*! \brief Add the computed forces to \p f, an internal reduction might be performed as well
  *
@@ -404,7 +404,7 @@ void reduceForces(nbnxn_atomdata_t* nbat, gmx::AtomLocality locality, const Nbnx
  * \param[in]  accumulateForce      Whether there is useful data already in the total force buffer.
  */
 void reduceForcesGpu(gmx::AtomLocality                          locality,
-                     DeviceBuffer<float>                        totalForcesDevice,
+                     DeviceBuffer<gmx::RVec>                    totalForcesDevice,
                      const Nbnxm::GridSet&                      gridSet,
                      void*                                      pmeForcesDevice,
                      gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
index 94e99879db0e3c54976e19a24cf030885143ca7e..2defa174d647f2a34f4fad32acba32523df1615e 100644 (file)
@@ -818,7 +818,7 @@ void cuda_set_cacheconfig()
 void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
                            bool                      setFillerCoords,
                            NbnxmGpu*                 nb,
-                           DeviceBuffer<float>       d_x,
+                           DeviceBuffer<gmx::RVec>   d_x,
                            GpuEventSynchronizer*     xReadyOnDevice,
                            const Nbnxm::AtomLocality locality,
                            int                       gridId,
@@ -862,12 +862,13 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
         auto kernelFn = setFillerCoords ? nbnxn_gpu_x_to_nbat_x_kernel<true>
                                         : nbnxn_gpu_x_to_nbat_x_kernel<false>;
         float4*    d_xq          = adat->xq;
+        float3*    d_xFloat3     = asFloat3(d_x);
         const int* d_atomIndices = nb->atomIndices;
         const int* d_cxy_na      = &nb->cxy_na[numColumnsMax * gridId];
         const int* d_cxy_ind     = &nb->cxy_ind[numColumnsMax * gridId];
-        const auto kernelArgs =
-                prepareGpuKernelArguments(kernelFn, config, &numColumns, &d_xq, &d_x, &d_atomIndices,
-                                          &d_cxy_na, &d_cxy_ind, &cellOffset, &numAtomsPerCell);
+        const auto kernelArgs    = prepareGpuKernelArguments(kernelFn, config, &numColumns, &d_xq,
+                                                          &d_xFloat3, &d_atomIndices, &d_cxy_na,
+                                                          &d_cxy_ind, &cellOffset, &numAtomsPerCell);
         launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs);
     }
 
@@ -884,7 +885,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
  *       forces only after Local stream already done so.
  */
 void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLocality,
-                               DeviceBuffer<float>                        totalForcesDevice,
+                               DeviceBuffer<gmx::RVec>                    totalForcesDevice,
                                NbnxmGpu*                                  nb,
                                void*                                      pmeForcesDevice,
                                gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
@@ -935,8 +936,8 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLo
     }
 
     const float3* d_fNB    = adat->f;
-    const float3* d_fPme   = (float3*)pmeForcesDevice;
-    float3*       d_fTotal = (float3*)totalForcesDevice;
+    const float3* d_fPme   = static_cast<float3*>(pmeForcesDevice);
+    float3*       d_fTotal = asFloat3(totalForcesDevice);
     const int*    d_cell   = nb->cell;
 
     const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &d_fNB, &d_fPme, &d_fTotal,
index 07b24fc923e0a94de18b885412f52957879677e0..ab79a8cf5b8482d453348d108e48e59fa07a77e5 100644 (file)
@@ -140,7 +140,7 @@ void nonbonded_verlet_t::convertCoordinates(const gmx::AtomLocality        local
 
 void nonbonded_verlet_t::convertCoordinatesGpu(const gmx::AtomLocality locality,
                                                const bool              fillLocal,
-                                               DeviceBuffer<float>     d_x,
+                                               DeviceBuffer<gmx::RVec> d_x,
                                                GpuEventSynchronizer*   xReadyOnDevice)
 {
     wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
@@ -178,7 +178,7 @@ void nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const gmx::AtomLocality  local
 }
 
 void nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const gmx::AtomLocality locality,
-                                                      DeviceBuffer<float>     totalForcesDevice,
+                                                      DeviceBuffer<gmx::RVec> totalForcesDevice,
                                                       void*                   forcesPmeDevice,
                                                       gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
                                                       bool useGpuFPmeReduction,
index 50fed0fe5bac947273c3369653604aba990c25b9..2fa353a8486a5aaa8f8081dd9969f373eb7e2abf 100644 (file)
@@ -286,10 +286,10 @@ public:
      * \param[in] d_x             GPU coordinates buffer in plain rvec format to be transformed.
      * \param[in] xReadyOnDevice  Event synchronizer indicating that the coordinates are ready in the device memory.
      */
-    void convertCoordinatesGpu(gmx::AtomLocality     locality,
-                               bool                  fillLocal,
-                               DeviceBuffer<float>   d_x,
-                               GpuEventSynchronizer* xReadyOnDevice);
+    void convertCoordinatesGpu(gmx::AtomLocality       locality,
+                               bool                    fillLocal,
+                               DeviceBuffer<gmx::RVec> d_x,
+                               GpuEventSynchronizer*   xReadyOnDevice);
 
     //! Init for GPU version of setup coordinates in Nbnxm
     void atomdata_init_copy_x_to_nbat_x_gpu();
@@ -349,7 +349,7 @@ public:
      * \param [in]     accumulateForce      If the total force buffer already contains data
      */
     void atomdata_add_nbat_f_to_f_gpu(gmx::AtomLocality                          locality,
-                                      DeviceBuffer<float>                        totalForcesDevice,
+                                      DeviceBuffer<gmx::RVec>                    totalForcesDevice,
                                       void*                                      forcesPmeDevice,
                                       gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
                                       bool useGpuFPmeReduction,
index 7b9e4b80f9f1df63ebab8cc59a0fdb7ba983244f..2370fc836bf352f5f31e984556ae6848e6ae7f7b 100644 (file)
@@ -242,8 +242,8 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused& gridSet,
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid,
                            bool gmx_unused setFillerCoords,
-                           NbnxmGpu gmx_unused* gpu_nbv,
-                           DeviceBuffer<float> gmx_unused d_x,
+                           NbnxmGpu gmx_unused*    gpu_nbv,
+                           DeviceBuffer<gmx::RVec> gmx_unused d_x,
                            GpuEventSynchronizer gmx_unused* xReadyOnDevice,
                            gmx::AtomLocality gmx_unused locality,
                            int gmx_unused gridId,
@@ -312,7 +312,7 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused* cell,
  */
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_add_nbat_f_to_f(gmx::AtomLocality gmx_unused atomLocality,
-                               DeviceBuffer<float> gmx_unused totalForcesDevice,
+                               DeviceBuffer<gmx::RVec> gmx_unused totalForcesDevice,
                                NbnxmGpu gmx_unused* gpu_nbv,
                                void gmx_unused*                           pmeForcesDevice,
                                gmx::ArrayRef<GpuEventSynchronizer* const> gmx_unused dependencyList,