Use RVec instead of float for x, v and f device buffers
authorArtem Zhmurov <zhmurov@gmail.com>
Mon, 20 Jan 2020 17:32:46 +0000 (18:32 +0100)
committerPaul Bauer <paul.bauer.q@gmail.com>
Thu, 6 Feb 2020 07:49:39 +0000 (08:49 +0100)
Using RVec instead of float as the data type for the coordinate,
velocity and force device buffers makes it possible to remove
multiplications by DIM when addresses, offsets and sizes are
computed. Since the native device types are not used in the CPU
part of the code, the type casting remains.
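
For example, in pme_coordinate_receiver_gpu_impl.cu the address of
the buffer section to be sent to a PP rank changes from

    void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start * DIM]);

to

    void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start]);

because d_x is now indexed per atom rather than per float component.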

Refs #3312 and #2936

Change-Id: Iaea914a474195f214ca860f7345f6878b9a04813

33 files changed:
src/gromacs/domdec/gpuhaloexchange.h
src/gromacs/domdec/gpuhaloexchange_impl.cpp
src/gromacs/domdec/gpuhaloexchange_impl.cu
src/gromacs/ewald/pme.h
src/gromacs/ewald/pme_calculate_splines.cuh
src/gromacs/ewald/pme_coordinate_receiver_gpu.h
src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp
src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu
src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h
src/gromacs/ewald/pme_gather.cu
src/gromacs/ewald/pme_gpu.cpp
src/gromacs/ewald/pme_gpu_internal.cpp
src/gromacs/ewald/pme_gpu_internal.h
src/gromacs/ewald/pme_gpu_types.h
src/gromacs/ewald/pme_spread.cu
src/gromacs/gpu_utils/cudautils.cuh
src/gromacs/gpu_utils/devicebuffer_ocl.h
src/gromacs/gpu_utils/tests/CMakeLists.txt
src/gromacs/gpu_utils/tests/gpu_utils.cpp [new file with mode: 0644]
src/gromacs/mdlib/update_constrain_gpu.h
src/gromacs/mdlib/update_constrain_gpu_impl.cpp
src/gromacs/mdlib/update_constrain_gpu_impl.cu
src/gromacs/mdlib/update_constrain_gpu_impl.h
src/gromacs/mdtypes/state_propagator_data_gpu.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
src/gromacs/nbnxm/atomdata.cpp
src/gromacs/nbnxm/atomdata.h
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/nbnxm.cpp
src/gromacs/nbnxm/nbnxm.h
src/gromacs/nbnxm/nbnxm_gpu.h

index b7e6ff54fa70c0aa273aa3300ac177e137384a26..d32b1800c4568bc32c3bcae91c8e244066e0b1ed 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -92,7 +92,7 @@ public:
      * \param [in] d_coordinateBuffer   pointer to coordinates buffer in GPU memory
     * \param [in] d_forcesBuffer   pointer to forces buffer in GPU memory
      */
-    void reinitHalo(DeviceBuffer<float> d_coordinateBuffer, DeviceBuffer<float> d_forcesBuffer);
+    void reinitHalo(DeviceBuffer<RVec> d_coordinateBuffer, DeviceBuffer<RVec> d_forcesBuffer);
 
 
     /*! \brief GPU halo exchange of coordinates buffer.
index 2511673218b3eac56b681e808c80da36989b17c6..a17c550c6c9044e77bad079c615a1e12e5ac7f39 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -73,8 +73,8 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */,
 GpuHaloExchange::~GpuHaloExchange() = default;
 
 /*!\brief init halo exchange stub. */
-void GpuHaloExchange::reinitHalo(DeviceBuffer<float> /* d_coordinatesBuffer */,
-                                 DeviceBuffer<float> /* d_forcesBuffer */)
+void GpuHaloExchange::reinitHalo(DeviceBuffer<RVec> /* d_coordinatesBuffer */,
+                                 DeviceBuffer<RVec> /* d_forcesBuffer */)
 {
     GMX_ASSERT(false,
                "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
index 660566a9dd9117343497c5d6308c8c6e255f840e..4313ffacb086e2109f68a735788bfa75b4ebe429 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -57,6 +57,7 @@
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"
+#include "gromacs/math/vectypes.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/utility/gmxmpi.h"
 
@@ -435,10 +436,9 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void
 
 GpuHaloExchange::~GpuHaloExchange() = default;
 
-void GpuHaloExchange::reinitHalo(DeviceBuffer<float> d_coordinatesBuffer, DeviceBuffer<float> d_forcesBuffer)
+void GpuHaloExchange::reinitHalo(DeviceBuffer<RVec> d_coordinatesBuffer, DeviceBuffer<RVec> d_forcesBuffer)
 {
-    impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer),
-                      reinterpret_cast<float3*>(d_forcesBuffer));
+    impl_->reinitHalo(asFloat3(d_coordinatesBuffer), asFloat3(d_forcesBuffer));
 }
 
 void GpuHaloExchange::communicateHaloCoordinates(const matrix          box,
index edbe2835230fb635368bfa04af33cce0a5215b48..f1f829e445690537f14b3811c74877ed51121a95 100644 (file)
@@ -451,8 +451,8 @@ GPU_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t* GPU_FUNC_ARG
  * \param[in] pme            The PME data structure.
  * \param[in] d_x            The pointer to the positions buffer to be set
  */
-GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t*    GPU_FUNC_ARGUMENT(pme),
-                                             DeviceBuffer<float> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t*        GPU_FUNC_ARGUMENT(pme),
+                                             DeviceBuffer<gmx::RVec> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
 
 /*! \brief Get pointer to device copy of force data.
  * \param[in] pme            The PME data structure.
index f52c81c59925aad2147f5c48a14fd5f21e6101c3..4e5dc6e8882356a9ba4ad81b83a6068a4b91312b 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 //! Controls whether the atom and charge data is prefetched into shared memory or loaded per thread from global
 static const bool c_useAtomDataPrefetch = true;
 
+/*! \brief Asserts that the argument is finite.
+ *
+ *  The function works for any data type that can be cast to float. Note that there is also
+ *  a specialized implementation for the float3 data type.
+ *
+ * \param[in] arg  Argument to check.
+ */
+template<typename T>
+__device__ inline void assertIsFinite(T arg);
+
+template<>
+__device__ inline void assertIsFinite(float3 arg)
+{
+    assert(isfinite(float(arg.x)));
+    assert(isfinite(float(arg.y)));
+    assert(isfinite(float(arg.z)));
+}
+
+template<typename T>
+__device__ inline void assertIsFinite(T arg)
+{
+    assert(isfinite(float(arg)));
+}
+
 /*! \brief
  * General purpose function for loading atom-related data from global to shared memory.
  *
@@ -79,7 +103,7 @@ __device__ __forceinline__ void pme_gpu_stage_atom_data(const PmeGpuCudaKernelPa
             pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
     if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
     {
-        assert(isfinite(float(gm_source[globalIndex])));
+        assertIsFinite(gm_source[globalIndex]);
         sm_destination[localIndex] = gm_source[globalIndex];
     }
 }
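
Note on the templated assert above: with the new types, the two staging
calls in the spread and gather kernels are instantiated as

    pme_gpu_stage_atom_data<float,  atomsPerBlock, 1>(kernelParams, sm_coefficients, gm_coefficients);
    pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(kernelParams, sm_coordinates, gm_coordinates);

so assertIsFinite(gm_source[globalIndex]) resolves to the generic
template (scalar check) for the charges and to the float3
specialization (per-component check) for the coordinates.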
index 5e4c4967ba40cd0496c22dc3c6196f7f6487749e..b5d02a719e2f813ae3f228ff3dc659d256a89667 100644 (file)
@@ -43,6 +43,7 @@
 #define GMX_PMECOORDINATERECEIVERGPU_H
 
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/math/vectypes.h"
 #include "gromacs/utility/classhelpers.h"
 #include "gromacs/utility/gmxmpi.h"
 
@@ -70,7 +71,7 @@ public:
      * send coordinates buffer address to PP rank
      * \param[in] d_x   coordinates buffer in GPU memory
      */
-    void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> d_x);
+    void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x);
 
 
     /*! \brief
index c026d41b312234c3514d0916ca2370faf9a5a7cf..0cb848e6c06ff8b557c6a2cd05b285e641d30159 100644 (file)
@@ -75,7 +75,7 @@ PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const void* /* pmeStream */,
 PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default;
 
 /*!\brief init PME-PP GPU communication stub */
-void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> /* d_x */)
+void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> /* d_x */)
 {
     GMX_ASSERT(false,
                "A CPU stub for PME-PP GPU communication initialization was called instead of the "
index 1584a9e844a2b27fc0150347bed5e52555e66847..b2e7fa009d70b73536e702f916b3dfb6e5890f10 100644 (file)
@@ -69,7 +69,7 @@ PmeCoordinateReceiverGpu::Impl::Impl(const void* pmeStream, MPI_Comm comm, gmx::
 
 PmeCoordinateReceiverGpu::Impl::~Impl() = default;
 
-void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> d_x)
+void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x)
 {
 
     int ind_start = 0;
@@ -80,7 +80,7 @@ void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(Device
         ind_end   = ind_start + receiver.numAtoms;
 
         // Data will be transferred directly from GPU.
-        void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start * DIM]);
+        void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start]);
 
 #if GMX_MPI
         MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_);
@@ -131,7 +131,7 @@ PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const void*            pmeStr
 
 PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default;
 
-void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> d_x)
+void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x)
 {
     impl_->sendCoordinateBufferAddressToPpRanks(d_x);
 }
index 874fb2747ea528de8d6b6291cb12f48c7fea3ee5..4f3bbe2e4e4478f29f27e9c3597c422a36f172f8 100644 (file)
@@ -69,7 +69,7 @@ public:
      * send coordinates buffer address to PP rank
      * \param[in] d_x   coordinates buffer in GPU memory
      */
-    void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> d_x);
+    void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x);
 
     /*! \brief
      * launch receive of coordinate data from PP rank
index 8b2ff5f80e7e0a3e1623bc6c10501eb50cfe48a5..616516df2335214cb16b118cb84d796d1e7a5b33 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -44,6 +44,7 @@
 #include <cassert>
 
 #include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+#include "gromacs/gpu_utils/cudautils.cuh"
 
 #include "pme.cuh"
 #include "pme_calculate_splines.cuh"
@@ -321,32 +322,27 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
     }
     else
     {
+        const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates);
         /* Recalculate splines */
         if (c_useAtomDataPrefetch)
         {
             // charges
             __shared__ float sm_coefficients[atomsPerBlock];
             // Coordinates
-            __shared__ float sm_coordinates[DIM * atomsPerBlock];
+            __shared__ float3 sm_coordinates[atomsPerBlock];
             /* Staging coefficients/charges */
-            pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients,
-                                                             kernelParams.atoms.d_coefficients);
+            pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, gm_coefficients);
 
             /* Staging coordinates */
-            pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates,
-                                                               kernelParams.atoms.d_coordinates);
+            pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(kernelParams, sm_coordinates, gm_coordinates);
             __syncthreads();
-            atomX.x    = sm_coordinates[atomIndexLocal * DIM + XX];
-            atomX.y    = sm_coordinates[atomIndexLocal * DIM + YY];
-            atomX.z    = sm_coordinates[atomIndexLocal * DIM + ZZ];
+            atomX      = sm_coordinates[atomIndexLocal];
             atomCharge = sm_coefficients[atomIndexLocal];
         }
         else
         {
+            atomX      = gm_coordinates[atomIndexGlobal];
             atomCharge = gm_coefficients[atomIndexGlobal];
-            atomX.x    = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + XX];
-            atomX.y    = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + YY];
-            atomX.z    = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + ZZ];
         }
         calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false>(
                 kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices);
index aafd5c1a4d5a4fe888e99f940eb8000f7c396cfd..44fafc205cd37934be755c51ea34c38d415aaac5 100644 (file)
@@ -440,7 +440,7 @@ void* pme_gpu_get_device_f(const gmx_pme_t* pme)
     return pme_gpu_get_kernelparam_forces(pme->gpu);
 }
 
-void pme_gpu_set_device_x(const gmx_pme_t* pme, DeviceBuffer<float> d_x)
+void pme_gpu_set_device_x(const gmx_pme_t* pme, DeviceBuffer<gmx::RVec> d_x)
 {
     GMX_ASSERT(pme != nullptr, "Null pointer is passed as a PME to the set coordinates function.");
     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
index 9c984025256fcfc96c78e9d8357c63d7d0ddeef1..1aa2052907a0d643ad5a01c03fdbf3b98248def4 100644 (file)
@@ -1523,7 +1523,7 @@ void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
     }
 }
 
-void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<float> d_x)
+void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<gmx::RVec> d_x)
 {
     GMX_ASSERT(pmeGpu && pmeGpu->kernelParams,
                "PME GPU device buffer can not be set in non-GPU builds or before the GPU PME was "
index cc7e9d1f342556add51c16ae31f880f1c82fd1e7..2816a22addc5bb885a7fadff1a30f5680f8f1a60 100644 (file)
@@ -395,7 +395,7 @@ GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu*                GPU_FUNC_ARGUMENT(
  * \param[in] d_x            Pointer to coordinate data
  */
 GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
-                                                            DeviceBuffer<float> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
+                                                            DeviceBuffer<gmx::RVec> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
 
 /*! \brief Return pointer to device copy of force data.
  * \param[in] pmeGpu         The PME GPU structure.
index be501f2cfa83f953e69d63047e63762c5dcbffaa..5ddd79fb5a39618557d717fbd48c7bc3ae33dc94 100644 (file)
@@ -157,7 +157,7 @@ struct PmeGpuAtomParams
      * The coordinates themselves change and need to be copied to the GPU for every PME computation,
      * but reallocation happens only at DD.
      */
-    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_coordinates;
+    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<gmx::RVec>) d_coordinates;
     /*! \brief Global GPU memory array handle with input atom charges.
      * The charges only need to be reallocated and copied to the GPU at DD step.
      */
index 3d02e43d0d1dc5b3c63452f7821a71b0cd9782bd..99f7828c86707276d6e976601086d4cd93abf1ae 100644 (file)
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013-2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2013-2016,2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -47,6 +47,7 @@
 #include <cassert>
 
 #include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+#include "gromacs/gpu_utils/cudautils.cuh"
 
 #include "pme.cuh"
 #include "pme_calculate_splines.cuh"
@@ -228,24 +229,20 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU
 
     if (computeSplines)
     {
+        const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates);
         if (c_useAtomDataPrefetch)
         {
             // Coordinates
-            __shared__ float sm_coordinates[DIM * atomsPerBlock];
+            __shared__ float3 sm_coordinates[atomsPerBlock];
 
             /* Staging coordinates */
-            pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates,
-                                                               kernelParams.atoms.d_coordinates);
+            pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(kernelParams, sm_coordinates, gm_coordinates);
             __syncthreads();
-            atomX.x = sm_coordinates[atomIndexLocal * DIM + XX];
-            atomX.y = sm_coordinates[atomIndexLocal * DIM + YY];
-            atomX.z = sm_coordinates[atomIndexLocal * DIM + ZZ];
+            atomX = sm_coordinates[atomIndexLocal];
         }
         else
         {
-            atomX.x = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + XX];
-            atomX.y = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + YY];
-            atomX.z = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + ZZ];
+            atomX = gm_coordinates[atomIndexGlobal];
         }
         calculate_splines<order, atomsPerBlock, atomsPerWarp, false, writeGlobal>(
                 kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, &dtheta, sm_gridlineIndices);
index 71d9b7dac45dac38ac76b96357cdec6b91e0acde..6f41e8589cef48b528bc3e1b48d2967764ae4c0d 100644 (file)
@@ -215,6 +215,19 @@ static inline void rvec_inc(rvec a, const float3 b)
     rvec tmp = { b.x, b.y, b.z };
     rvec_inc(a, tmp);
 }
+/*! \brief Cast RVec buffer to float3 buffer.
+ *
+ * \param[in] in The RVec buffer to cast.
+ *
+ * \returns Buffer cast to float3*.
+ */
+static inline __host__ __device__ float3* asFloat3(gmx::RVec* in)
+{
+    static_assert(sizeof(in[0]) == sizeof(float3),
+                  "Size of the host-side data-type is different from the size of the device-side "
+                  "counterpart.");
+    return reinterpret_cast<float3*>(in);
+}
 
 /*! \brief Wait for all tasks in stream \p s to complete.
  *
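
A minimal usage sketch for the asFloat3 helper above (hypothetical
names, not code from this change):

    gmx::RVec* d_x = /* device allocation, one RVec per atom */;
    float3*    x3  = asFloat3(d_x); // same memory, viewed as CUDA float3
    // x3[i].x aliases d_x[i][XX], so per-atom indexing needs no DIM factor

The static_assert guards the reinterpret_cast: the cast only compiles
while the host-side RVec and the device-side float3 have the same size.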
index 2c92c81c1be361d2cbdee1f282bbe108d7d792e0..40f1e12941c5f40d010459abb4394d8ccdf22458 100644 (file)
@@ -228,7 +228,7 @@ void clearDeviceBufferAsync(DeviceBuffer<ValueType>* buffer, size_t startingOffs
     GMX_ASSERT(buffer, "needs a buffer pointer");
     const size_t    offset        = startingOffset * sizeof(ValueType);
     const size_t    bytes         = numValues * sizeof(ValueType);
-    const ValueType pattern       = 0;
+    const int       pattern       = 0;
     const cl_uint   numWaitEvents = 0;
     const cl_event* waitEvents    = nullptr;
     cl_event        commandEvent;
index c2fc3426bb09a97d7727b1b60ae05fde192c038d..e9a62b792b2df59736590cb43298492a031ca766 100644 (file)
@@ -1,7 +1,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
+# Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -48,6 +48,7 @@ if(GMX_USE_CUDA)
     # CUDA-only test
     list(APPEND SOURCES_FROM_CXX
         pinnedmemorychecker.cpp
+        gpu_utils.cpp
         )
     # TODO Making a separate library is heavy handed, but nothing else
     # seems to work. Also don't use a hyphen in its name, because nvcc
diff --git a/src/gromacs/gpu_utils/tests/gpu_utils.cpp b/src/gromacs/gpu_utils/tests/gpu_utils.cpp
new file mode 100644 (file)
index 0000000..5adf0a7
--- /dev/null
@@ -0,0 +1,89 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Tests for CUDA float3 type layout.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include <vector>
+
+#ifndef __CUDA_ARCH__
+/*! \brief Dummy definition to avoid compiler error
+ *
+ * \todo Find a better solution. Probably move the asFloat3(...) function to a different header.
+ */
+#    define __CUDA_ARCH__ -1
+#    include <cuda_runtime.h>
+#    undef __CUDA_ARCH__
+#else
+#    include <cuda_runtime.h>
+#endif
+#include <gtest/gtest.h>
+
+#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/math/vectypes.h"
+#include "gromacs/utility/real.h"
+
+#if GMX_GPU == GMX_GPU_CUDA
+
+namespace gmx
+{
+
+namespace test
+{
+
+TEST(GpuDataTypesCompatibilityTest, RVecAndFloat3)
+{
+    std::vector<RVec> dataRVec;
+    dataRVec.emplace_back(1.0, 2.0, 3.0);
+    dataRVec.emplace_back(4.0, 5.0, 6.0);
+    float3* dataFloat3 = asFloat3(dataRVec.data());
+    EXPECT_EQ(dataFloat3[0].x, dataRVec[0][XX]);
+    EXPECT_EQ(dataFloat3[0].y, dataRVec[0][YY]);
+    EXPECT_EQ(dataFloat3[0].z, dataRVec[0][ZZ]);
+    EXPECT_EQ(dataFloat3[1].x, dataRVec[1][XX]);
+    EXPECT_EQ(dataFloat3[1].y, dataRVec[1][YY]);
+    EXPECT_EQ(dataFloat3[1].z, dataRVec[1][ZZ]);
+}
+
+} // namespace test
+} // namespace gmx
+
+#endif // GMX_GPU == GMX_GPU_CUDA
\ No newline at end of file
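
The test above checks the layout assumption behind asFloat3: in single
precision, gmx::RVec (a BasicVector<real>) and CUDA's float3 must both
be three consecutive floats. Schematically (an illustration, not
GROMACS code):

    struct Float3Like { float x, y, z; };  // CUDA float3 layout
    struct RVecLike   { float xyz[3]; };   // BasicVector<float>, in effect
    static_assert(sizeof(Float3Like) == sizeof(RVecLike), "sizes must match");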
index e0190c367f586d75680820dd3315871461405443..359ecea5c16ce741c45cfd720b923cdfa5b9c912 100644 (file)
@@ -133,12 +133,12 @@ public:
      * \param[in]      md                  Atoms data.
      * \param[in]      numTempScaleValues  Number of temperature scaling groups. Zero for no temperature scaling.
      */
-    void set(DeviceBuffer<float> d_x,
-             DeviceBuffer<float> d_v,
-             DeviceBuffer<float> d_f,
-             const t_idef&       idef,
-             const t_mdatoms&    md,
-             int                 numTempScaleValues);
+    void set(DeviceBuffer<RVec> d_x,
+             DeviceBuffer<RVec> d_v,
+             DeviceBuffer<RVec> d_f,
+             const t_idef&      idef,
+             const t_mdatoms&   md,
+             int                numTempScaleValues);
 
     /*! \brief
      * Update PBC data.
index 919ba9b36b97a81b5f86e0a8200752be1956321f..47671ef7de95cc85e5d5b13fb71722b1723afbc2 100644 (file)
@@ -88,9 +88,9 @@ void UpdateConstrainGpu::scaleCoordinates(const matrix /* scalingMatrix */)
                "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
 }
 
-void UpdateConstrainGpu::set(DeviceBuffer<float> /* d_x */,
-                             DeviceBuffer<float> /* d_v */,
-                             const DeviceBuffer<float> /* d_f */,
+void UpdateConstrainGpu::set(DeviceBuffer<RVec> /* d_x */,
+                             DeviceBuffer<RVec> /* d_v */,
+                             const DeviceBuffer<RVec> /* d_f */,
                              const t_idef& /* idef */,
                              const t_mdatoms& /* md */,
                              const int /* numTempScaleValues */)
index ebbe15a02ff5059be0305a748f05651ad9fe4a33..6991ef0dc351ca2732eb0e3e46c3c68eadd959fc 100644 (file)
@@ -188,12 +188,12 @@ UpdateConstrainGpu::Impl::Impl(const t_inputrec&     ir,
 
 UpdateConstrainGpu::Impl::~Impl() {}
 
-void UpdateConstrainGpu::Impl::set(DeviceBuffer<float>       d_x,
-                                   DeviceBuffer<float>       d_v,
-                                   const DeviceBuffer<float> d_f,
-                                   const t_idef&             idef,
-                                   const t_mdatoms&          md,
-                                   const int                 numTempScaleValues)
+void UpdateConstrainGpu::Impl::set(DeviceBuffer<RVec>       d_x,
+                                   DeviceBuffer<RVec>       d_v,
+                                   const DeviceBuffer<RVec> d_f,
+                                   const t_idef&            idef,
+                                   const t_mdatoms&         md,
+                                   const int                numTempScaleValues)
 {
     GMX_ASSERT(d_x != nullptr, "Coordinates device buffer should not be null.");
     GMX_ASSERT(d_v != nullptr, "Velocities device buffer should not be null.");
@@ -259,12 +259,12 @@ void UpdateConstrainGpu::scaleCoordinates(const matrix scalingMatrix)
     impl_->scaleCoordinates(scalingMatrix);
 }
 
-void UpdateConstrainGpu::set(DeviceBuffer<float>       d_x,
-                             DeviceBuffer<float>       d_v,
-                             const DeviceBuffer<float> d_f,
-                             const t_idef&             idef,
-                             const t_mdatoms&          md,
-                             const int                 numTempScaleValues)
+void UpdateConstrainGpu::set(DeviceBuffer<RVec>       d_x,
+                             DeviceBuffer<RVec>       d_v,
+                             const DeviceBuffer<RVec> d_f,
+                             const t_idef&            idef,
+                             const t_mdatoms&         md,
+                             const int                numTempScaleValues)
 {
     impl_->set(d_x, d_v, d_f, idef, md, numTempScaleValues);
 }
index 5a28045afd48e82b21ec62cb91864e06233ed81e..0009112dc624772552987111b4ac958be14b327d 100644 (file)
@@ -133,12 +133,12 @@ public:
      * \param[in] md                  Atoms data.
      * \param[in] numTempScaleValues  Number of temperature scaling groups. Set zero for no temperature coupling.
      */
-    void set(DeviceBuffer<float>       d_x,
-             DeviceBuffer<float>       d_v,
-             const DeviceBuffer<float> d_f,
-             const t_idef&             idef,
-             const t_mdatoms&          md,
-             const int                 numTempScaleValues);
+    void set(DeviceBuffer<RVec>       d_x,
+             DeviceBuffer<RVec>       d_v,
+             const DeviceBuffer<RVec> d_f,
+             const t_idef&            idef,
+             const t_mdatoms&         md,
+             const int                numTempScaleValues);
 
     /*! \brief
      * Update PBC data.
index 44cbc8d7e53887f0aa75abc272dbbe15bcb2f589..d75cd78ea1e2b742d559580b7d92ccfe5144a019 100644 (file)
@@ -184,7 +184,7 @@ public:
      *
      *  \returns GPU positions buffer.
      */
-    DeviceBuffer<float> getCoordinates();
+    DeviceBuffer<RVec> getCoordinates();
 
     /*! \brief Copy positions to the GPU memory.
      *
@@ -245,7 +245,7 @@ public:
      *
      *  \returns GPU velocities buffer.
      */
-    DeviceBuffer<float> getVelocities();
+    DeviceBuffer<RVec> getVelocities();
 
     /*! \brief Copy velocities to the GPU memory.
      *
@@ -280,7 +280,7 @@ public:
      *
      *  \returns GPU force buffer.
      */
-    DeviceBuffer<float> getForces();
+    DeviceBuffer<RVec> getForces();
 
     /*! \brief Copy forces to the GPU memory.
      *
index ae6bcd6c71eef3d217c1c46e46c79604d7078caf..15f054eafaa59354782743d04aab68e2b9d9d84a 100644 (file)
@@ -95,12 +95,12 @@ std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomL
     return std::make_tuple(0, 0);
 }
 
-DeviceBuffer<float> StatePropagatorDataGpu::getCoordinates()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getCoordinates()
 {
     GMX_ASSERT(false,
                "A CPU stub method from GPU state propagator data was called instead of one from "
                "GPU implementation.");
-    return DeviceBuffer<float>{};
+    return {};
 }
 
 GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(
@@ -153,12 +153,12 @@ void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> /*
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::getVelocities()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getVelocities()
 {
     GMX_ASSERT(false,
                "A CPU stub method from GPU state propagator data was called instead of one from "
                "GPU implementation.");
-    return DeviceBuffer<float>{};
+    return {};
 }
 
 void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> /* h_v */,
@@ -193,12 +193,12 @@ void StatePropagatorDataGpu::waitVelocitiesReadyOnHost(AtomLocality /* atomLocal
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::getForces()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getForces()
 {
     GMX_ASSERT(false,
                "A CPU stub method from GPU state propagator data was called instead of one from "
                "GPU implementation.");
-    return DeviceBuffer<float>{};
+    return {};
 }
 
 void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> /* h_f          */,
index 142327772274c4c29db5b922a43b18569868fecc..a138bcbb624f6d48f0dba52f1223a9427b4113ad 100644 (file)
@@ -181,7 +181,7 @@ public:
      *
      *  \returns GPU positions buffer.
      */
-    DeviceBuffer<float> getCoordinates();
+    DeviceBuffer<RVec> getCoordinates();
 
     /*! \brief Copy positions to the GPU memory.
      *
@@ -242,7 +242,7 @@ public:
      *
      *  \returns GPU velocities buffer.
      */
-    DeviceBuffer<float> getVelocities();
+    DeviceBuffer<RVec> getVelocities();
 
     /*! \brief Copy velocities to the GPU memory.
      *
@@ -277,7 +277,7 @@ public:
      *
      *  \returns GPU force buffer.
      */
-    DeviceBuffer<float> getForces();
+    DeviceBuffer<RVec> getForces();
 
     /*! \brief Copy forces to the GPU memory.
      *
@@ -395,21 +395,21 @@ private:
     int numAtomsAll_ = -1;
 
     //! Device positions buffer
-    DeviceBuffer<float> d_x_;
+    DeviceBuffer<RVec> d_x_;
     //! Number of particles saved in the positions buffer
     int d_xSize_ = -1;
     //! Allocation size for the positions buffer
     int d_xCapacity_ = -1;
 
     //! Device velocities buffer
-    DeviceBuffer<float> d_v_;
+    DeviceBuffer<RVec> d_v_;
     //! Number of particles saved in the velocities buffer
     int d_vSize_ = -1;
     //! Allocation size for the velocities buffer
     int d_vCapacity_ = -1;
 
     //! Device force buffer
-    DeviceBuffer<float> d_f_;
+    DeviceBuffer<RVec> d_f_;
     //! Number of particles saved in the force buffer
     int d_fSize_ = -1;
     //! Allocation size for the force buffer
@@ -428,7 +428,7 @@ private:
      *  \param[in]  atomLocality   If all, local or non-local ranges should be copied.
      *  \param[in]  commandStream  GPU stream to execute copy in.
      */
-    void copyToDevice(DeviceBuffer<float>            d_data,
+    void copyToDevice(DeviceBuffer<RVec>             d_data,
                       gmx::ArrayRef<const gmx::RVec> h_data,
                       int                            dataSize,
                       AtomLocality                   atomLocality,
@@ -443,7 +443,7 @@ private:
      *  \param[in]  commandStream  GPU stream to execute copy in.
      */
     void copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
-                        DeviceBuffer<float>      d_data,
+                        DeviceBuffer<RVec>       d_data,
                         int                      dataSize,
                         AtomLocality             atomLocality,
                         CommandStream            commandStream);
index 031327e80d844b1917b4c65c5317e95aac721b56..4b385a5a7b56ca5b82a22c57bb648a127b6b731d 100644 (file)
@@ -193,18 +193,18 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
         numAtomsPadded = numAtomsAll_;
     }
 
-    reallocateDeviceBuffer(&d_x_, DIM * numAtomsPadded, &d_xSize_, &d_xCapacity_, deviceContext_);
+    reallocateDeviceBuffer(&d_x_, numAtomsPadded, &d_xSize_, &d_xCapacity_, deviceContext_);
 
     const size_t paddingAllocationSize = numAtomsPadded - numAtomsAll_;
     if (paddingAllocationSize > 0)
     {
         // The PME stream is used here because the padding region of d_x_ is only in the PME task.
-        clearDeviceBufferAsync(&d_x_, DIM * numAtomsAll_, DIM * paddingAllocationSize, pmeStream_);
+        clearDeviceBufferAsync(&d_x_, numAtomsAll_, paddingAllocationSize, pmeStream_);
     }
 
-    reallocateDeviceBuffer(&d_v_, DIM * numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
+    reallocateDeviceBuffer(&d_v_, numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
     const int d_fOldCapacity = d_fCapacity_;
-    reallocateDeviceBuffer(&d_f_, DIM * numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
+    reallocateDeviceBuffer(&d_f_, numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
     // Clearing of the forces can be done in local stream since the nonlocal stream cannot reach
     // the force accumulation stage before syncing with the local stream. Only done in CUDA,
     // since the force buffer ops are not implemented in OpenCL.
@@ -249,7 +249,7 @@ std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality
     return std::make_tuple(atomsStartAt, numAtomsToCopy);
 }
 
-void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<float>                  d_data,
+void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<RVec>                   d_data,
                                                 const gmx::ArrayRef<const gmx::RVec> h_data,
                                                 int                                  dataSize,
                                                 AtomLocality                         atomLocality,
@@ -269,18 +269,15 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<float>
     int atomsStartAt, numAtomsToCopy;
     std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
 
-    int elementsStartAt   = atomsStartAt * DIM;
-    int numElementsToCopy = numAtomsToCopy * DIM;
-
     if (numAtomsToCopy != 0)
     {
-        GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize,
+        GMX_ASSERT(atomsStartAt + numAtomsToCopy <= dataSize,
                    "The device allocation is smaller than requested copy range.");
         GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(),
                    "The host buffer is smaller than the requested copy range.");
 
-        copyToDeviceBuffer(&d_data, reinterpret_cast<const float*>(&h_data.data()[atomsStartAt]),
-                           elementsStartAt, numElementsToCopy, commandStream, transferKind_, nullptr);
+        copyToDeviceBuffer(&d_data, reinterpret_cast<const RVec*>(&h_data.data()[atomsStartAt]),
+                           atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr);
     }
 
     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
@@ -288,7 +285,7 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<float>
 }
 
 void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
-                                                  DeviceBuffer<float>      d_data,
+                                                  DeviceBuffer<RVec>       d_data,
                                                   int                      dataSize,
                                                   AtomLocality             atomLocality,
                                                   CommandStream            commandStream)
@@ -307,25 +304,22 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_dat
     int atomsStartAt, numAtomsToCopy;
     std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
 
-    int elementsStartAt   = atomsStartAt * DIM;
-    int numElementsToCopy = numAtomsToCopy * DIM;
-
     if (numAtomsToCopy != 0)
     {
-        GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize,
+        GMX_ASSERT(atomsStartAt + numAtomsToCopy <= dataSize,
                    "The device allocation is smaller than requested copy range.");
         GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(),
                    "The host buffer is smaller than the requested copy range.");
 
-        copyFromDeviceBuffer(reinterpret_cast<float*>(&h_data.data()[atomsStartAt]), &d_data,
-                             elementsStartAt, numElementsToCopy, commandStream, transferKind_, nullptr);
+        copyFromDeviceBuffer(reinterpret_cast<RVec*>(&h_data.data()[atomsStartAt]), &d_data,
+                             atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr);
     }
 
     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
 }
 
-DeviceBuffer<float> StatePropagatorDataGpu::Impl::getCoordinates()
+DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getCoordinates()
 {
     return d_x_;
 }
@@ -422,7 +416,7 @@ void StatePropagatorDataGpu::Impl::waitCoordinatesReadyOnHost(AtomLocality atomL
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::Impl::getVelocities()
+DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getVelocities()
 {
     return d_v_;
 }
@@ -476,7 +470,7 @@ void StatePropagatorDataGpu::Impl::waitVelocitiesReadyOnHost(AtomLocality atomLo
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::Impl::getForces()
+DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getForces()
 {
     return d_f_;
 }
@@ -595,7 +589,7 @@ std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomL
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::getCoordinates()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getCoordinates()
 {
     return impl_->getCoordinates();
 }
@@ -635,7 +629,7 @@ void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality atomLocalit
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::getVelocities()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getVelocities()
 {
     return impl_->getVelocities();
 }
@@ -662,7 +656,7 @@ void StatePropagatorDataGpu::waitVelocitiesReadyOnHost(AtomLocality atomLocality
 }
 
 
-DeviceBuffer<float> StatePropagatorDataGpu::getForces()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getForces()
 {
     return impl_->getForces();
 }
index 89d2e762b8f6dca8107daeb64c373333585aec7d..cd5e346303471796d065a316991b7bbfe1bb59a1 100644 (file)
@@ -1075,7 +1075,7 @@ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet&   gridSet,
                                     const gmx::AtomLocality locality,
                                     bool                    fillLocal,
                                     NbnxmGpu*               gpu_nbv,
-                                    DeviceBuffer<float>     d_x,
+                                    DeviceBuffer<RVec>      d_x,
                                     GpuEventSynchronizer*   xReadyOnDevice)
 {
 
@@ -1459,7 +1459,7 @@ void reduceForces(nbnxn_atomdata_t* nbat, const gmx::AtomLocality locality, cons
 
 /* Add the force array(s) from nbnxn_atomdata_t to f */
 void reduceForcesGpu(const gmx::AtomLocality                    locality,
-                     DeviceBuffer<float>                        totalForcesDevice,
+                     DeviceBuffer<RVec>                         totalForcesDevice,
                      const Nbnxm::GridSet&                      gridSet,
                      void*                                      pmeForcesDevice,
                      gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
index ceb87f71f093f5a99e4d86005f3c770a952cb60c..22d115706e0a394f3f4f55e95676ff07675548bc 100644 (file)
@@ -376,12 +376,12 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet& gridSet,
  * \param[in]     d_x        Coordinates to be copied (in plain rvec format).
  * \param[in]     xReadyOnDevice   Event synchronizer indicating that the coordinates are ready in the device memory.
  */
-void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet& gridSet,
-                                    gmx::AtomLocality     locality,
-                                    bool                  fillLocal,
-                                    NbnxmGpu*             gpu_nbv,
-                                    DeviceBuffer<float>   d_x,
-                                    GpuEventSynchronizer* xReadyOnDevice);
+void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet&   gridSet,
+                                    gmx::AtomLocality       locality,
+                                    bool                    fillLocal,
+                                    NbnxmGpu*               gpu_nbv,
+                                    DeviceBuffer<gmx::RVec> d_x,
+                                    GpuEventSynchronizer*   xReadyOnDevice);
 
 /*! \brief Add the computed forces to \p f, an internal reduction might be performed as well
  *
@@ -404,7 +404,7 @@ void reduceForces(nbnxn_atomdata_t* nbat, gmx::AtomLocality locality, const Nbnx
  * \param[in]  accumulateForce      Whether there is useful data already in the total force buffer.
  */
 void reduceForcesGpu(gmx::AtomLocality                          locality,
-                     DeviceBuffer<float>                        totalForcesDevice,
+                     DeviceBuffer<gmx::RVec>                    totalForcesDevice,
                      const Nbnxm::GridSet&                      gridSet,
                      void*                                      pmeForcesDevice,
                      gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
index 94e99879db0e3c54976e19a24cf030885143ca7e..2defa174d647f2a34f4fad32acba32523df1615e 100644 (file)
@@ -818,7 +818,7 @@ void cuda_set_cacheconfig()
 void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
                            bool                      setFillerCoords,
                            NbnxmGpu*                 nb,
-                           DeviceBuffer<float>       d_x,
+                           DeviceBuffer<gmx::RVec>   d_x,
                            GpuEventSynchronizer*     xReadyOnDevice,
                            const Nbnxm::AtomLocality locality,
                            int                       gridId,
@@ -862,12 +862,13 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
         auto kernelFn = setFillerCoords ? nbnxn_gpu_x_to_nbat_x_kernel<true>
                                         : nbnxn_gpu_x_to_nbat_x_kernel<false>;
         float4*    d_xq          = adat->xq;
+        float3*    d_xFloat3     = asFloat3(d_x);
         const int* d_atomIndices = nb->atomIndices;
         const int* d_cxy_na      = &nb->cxy_na[numColumnsMax * gridId];
         const int* d_cxy_ind     = &nb->cxy_ind[numColumnsMax * gridId];
-        const auto kernelArgs =
-                prepareGpuKernelArguments(kernelFn, config, &numColumns, &d_xq, &d_x, &d_atomIndices,
-                                          &d_cxy_na, &d_cxy_ind, &cellOffset, &numAtomsPerCell);
+        const auto kernelArgs    = prepareGpuKernelArguments(kernelFn, config, &numColumns, &d_xq,
+                                                          &d_xFloat3, &d_atomIndices, &d_cxy_na,
+                                                          &d_cxy_ind, &cellOffset, &numAtomsPerCell);
         launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs);
     }
 
@@ -884,7 +885,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
  *       forces only after Local stream already done so.
  */
 void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLocality,
-                               DeviceBuffer<float>                        totalForcesDevice,
+                               DeviceBuffer<gmx::RVec>                    totalForcesDevice,
                                NbnxmGpu*                                  nb,
                                void*                                      pmeForcesDevice,
                                gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
@@ -935,8 +936,8 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLo
     }
 
     const float3* d_fNB    = adat->f;
-    const float3* d_fPme   = (float3*)pmeForcesDevice;
-    float3*       d_fTotal = (float3*)totalForcesDevice;
+    const float3* d_fPme   = static_cast<float3*>(pmeForcesDevice);
+    float3*       d_fTotal = asFloat3(totalForcesDevice);
     const int*    d_cell   = nb->cell;
 
     const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &d_fNB, &d_fPme, &d_fTotal,
index 07b24fc923e0a94de18b885412f52957879677e0..ab79a8cf5b8482d453348d108e48e59fa07a77e5 100644 (file)
@@ -140,7 +140,7 @@ void nonbonded_verlet_t::convertCoordinates(const gmx::AtomLocality        local
 
 void nonbonded_verlet_t::convertCoordinatesGpu(const gmx::AtomLocality locality,
                                                const bool              fillLocal,
-                                               DeviceBuffer<float>     d_x,
+                                               DeviceBuffer<gmx::RVec> d_x,
                                                GpuEventSynchronizer*   xReadyOnDevice)
 {
     wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
@@ -178,7 +178,7 @@ void nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const gmx::AtomLocality  local
 }
 
 void nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const gmx::AtomLocality locality,
-                                                      DeviceBuffer<float>     totalForcesDevice,
+                                                      DeviceBuffer<gmx::RVec> totalForcesDevice,
                                                       void*                   forcesPmeDevice,
                                                       gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
                                                       bool useGpuFPmeReduction,
index 50fed0fe5bac947273c3369653604aba990c25b9..2fa353a8486a5aaa8f8081dd9969f373eb7e2abf 100644 (file)
@@ -286,10 +286,10 @@ public:
      * \param[in] d_x             GPU coordinates buffer in plain rvec format to be transformed.
      * \param[in] xReadyOnDevice  Event synchronizer indicating that the coordinates are ready in the device memory.
      */
-    void convertCoordinatesGpu(gmx::AtomLocality     locality,
-                               bool                  fillLocal,
-                               DeviceBuffer<float>   d_x,
-                               GpuEventSynchronizer* xReadyOnDevice);
+    void convertCoordinatesGpu(gmx::AtomLocality       locality,
+                               bool                    fillLocal,
+                               DeviceBuffer<gmx::RVec> d_x,
+                               GpuEventSynchronizer*   xReadyOnDevice);
 
     //! Init for GPU version of setup coordinates in Nbnxm
     void atomdata_init_copy_x_to_nbat_x_gpu();
@@ -349,7 +349,7 @@ public:
      * \param [in]     accumulateForce      If the total force buffer already contains data
      */
     void atomdata_add_nbat_f_to_f_gpu(gmx::AtomLocality                          locality,
-                                      DeviceBuffer<float>                        totalForcesDevice,
+                                      DeviceBuffer<gmx::RVec>                    totalForcesDevice,
                                       void*                                      forcesPmeDevice,
                                       gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
                                       bool useGpuFPmeReduction,
index 7b9e4b80f9f1df63ebab8cc59a0fdb7ba983244f..2370fc836bf352f5f31e984556ae6848e6ae7f7b 100644 (file)
@@ -242,8 +242,8 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused& gridSet,
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid,
                            bool gmx_unused setFillerCoords,
-                           NbnxmGpu gmx_unused* gpu_nbv,
-                           DeviceBuffer<float> gmx_unused d_x,
+                           NbnxmGpu gmx_unused*    gpu_nbv,
+                           DeviceBuffer<gmx::RVec> gmx_unused d_x,
                            GpuEventSynchronizer gmx_unused* xReadyOnDevice,
                            gmx::AtomLocality gmx_unused locality,
                            int gmx_unused gridId,
@@ -312,7 +312,7 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused* cell,
  */
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_add_nbat_f_to_f(gmx::AtomLocality gmx_unused atomLocality,
-                               DeviceBuffer<float> gmx_unused totalForcesDevice,
+                               DeviceBuffer<gmx::RVec> gmx_unused totalForcesDevice,
                                NbnxmGpu gmx_unused* gpu_nbv,
                                void gmx_unused*                           pmeForcesDevice,
                                gmx::ArrayRef<GpuEventSynchronizer* const> gmx_unused dependencyList,