Unify CUDA and OpenCL lookup-table creation

author Artem Zhmurov <zhmurov@gmail.com>

Mon, 4 May 2020 17:44:01 +0000 (17:44 +0000)

committer Mark Abraham <mark.j.abraham@gmail.com>

Mon, 4 May 2020 17:44:01 +0000 (17:44 +0000)
author Artem Zhmurov <zhmurov@gmail.com>
Mon, 4 May 2020 17:44:01 +0000 (17:44 +0000)
committer Mark Abraham <mark.j.abraham@gmail.com>
Mon, 4 May 2020 17:44:01 +0000 (17:44 +0000)
diff --git a/src/gromacs/gpu_utils/CMakeLists.txt b/src/gromacs/gpu_utils/CMakeLists.txt

index e8c02ccf31e2f28e4937f6a170ef5a2017d108cd..2ad27010cfe79da12d86b9e4220f28bca2ec4889 100644 (file)
--- a/src/gromacs/gpu_utils/CMakeLists.txt
+++ b/src/gromacs/gpu_utils/CMakeLists.txt
@@ -54,7 +54,6 @@ if(GMX_USE_OPENCL)
          )
  elseif(GMX_USE_CUDA)
      gmx_add_libgromacs_sources(
-        cudautils.cu
          device_stream.cu
          gpu_utils.cu
          pinning.cu
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh

index 02eec85bbdd5a94ba799cf6b9467a1992d0a6364..2322dd87930a328e41b01950c22eebe129de244b 100644 (file)
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -142,30 +142,6 @@ enum class GpuApiCallBehavior;
  
  #endif /* CHECK_CUDA_ERRORS */
  
-/*! Launches synchronous or asynchronous device to host memory copy.
- *
- *  The copy is launched in stream s or if not specified, in stream 0.
- */
-int cu_copy_D2H(void* h_dest, void* d_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = nullptr*/);
-
-/*! Launches synchronous host to device memory copy in stream 0. */
-int cu_copy_D2H_sync(void* /*h_dest*/, void* /*d_src*/, size_t /*bytes*/);
-
-/*! Launches asynchronous host to device memory copy in stream s. */
-int cu_copy_D2H_async(void* /*h_dest*/, void* /*d_src*/, size_t /*bytes*/, cudaStream_t /*s = nullptr*/);
-
-/*! Launches synchronous or asynchronous host to device memory copy.
- *
- *  The copy is launched in stream s or if not specified, in stream 0.
- */
-int cu_copy_H2D(void* d_dest, const void* h_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = nullptr*/);
-
-/*! Launches synchronous host to device memory copy. */
-int cu_copy_H2D_sync(void* /*d_dest*/, const void* /*h_src*/, size_t /*bytes*/);
-
-/*! Launches asynchronous host to device memory copy in stream s. */
-int cu_copy_H2D_async(void* /*d_dest*/, const void* /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = nullptr*/);
-
  // TODO: the 2 functions below are pretty much a constructor/destructor of a simple
  // GPU table object. There is also almost self-contained fetchFromParamLookupTable()
  // in cuda_kernel_utils.cuh. They could all live in a separate class/struct file.
diff --git a/src/gromacs/gpu_utils/devicebuffer.cuh b/src/gromacs/gpu_utils/devicebuffer.cuh

index c9bce141fef3d338a9f19238b77db36c52aed7d2..3aabf4285b67c5180a7d51ede8173e5ae2825f9f 100644 (file)
--- a/src/gromacs/gpu_utils/devicebuffer.cuh
+++ b/src/gromacs/gpu_utils/devicebuffer.cuh
@@ -91,8 +91,6 @@ void freeDeviceBuffer(DeviceBuffer* buffer)
  /*! \brief
   * Performs the host-to-device data copy, synchronous or asynchronously on request.
   *
- * TODO: This is meant to gradually replace cu/ocl_copy_h2d.
- *
   * \tparam        ValueType            Raw value type of the \p buffer.
   * \param[in,out] buffer               Pointer to the device-side buffer
   * \param[in]     hostBuffer           Pointer to the raw host-side memory, also typed \p ValueType
@@ -114,7 +112,7 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
  {
      if (numValues == 0)
      {
-        return; // such calls are actually made with empty domains
+        return;
      }
      GMX_ASSERT(buffer, "needs a buffer pointer");
      GMX_ASSERT(hostBuffer, "needs a host buffer pointer");
@@ -144,8 +142,6 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
  /*! \brief
   * Performs the device-to-host data copy, synchronous or asynchronously on request.
   *
- * TODO: This is meant to gradually replace cu/ocl_copy_d2h.
- *
   * \tparam        ValueType            Raw value type of the \p buffer.
   * \param[in,out] hostBuffer           Pointer to the raw host-side memory, also typed \p ValueType
   * \param[in]     buffer               Pointer to the device-side buffer
@@ -165,6 +161,10 @@ void copyFromDeviceBuffer(ValueType*               hostBuffer,
                            GpuApiCallBehavior       transferKind,
                            CommandEvent* /*timingEvent*/)
  {
+    if (numValues == 0)
+    {
+        return;
+    }
      GMX_ASSERT(buffer, "needs a buffer pointer");
      GMX_ASSERT(hostBuffer, "needs a host buffer pointer");
  
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu

index 95aca5ba1d5dec5cf51faca94fa3ba4909791de4..8c76c4e95f35975a5425c9b2e943898b81da4ec3 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -54,7 +54,7 @@
  
  #include "nbnxm_cuda.h"
  
-#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
  #include "gromacs/gpu_utils/typecasts.cuh"
  #include "gromacs/gpu_utils/vectype_ops.cuh"
@@ -444,9 +444,10 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
          t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
      }
  
-    cu_copy_H2D_async(adat->xq + adat_begin,
-                      static_cast<const void*>(nbatom->x().data() + adat_begin * 4),
-                      adat_len * sizeof(*adat->xq), deviceStream.stream());
+    static_assert(sizeof(adat->xq[0]) == sizeof(float4),
+                  "The size of the xyzq buffer element should be equal to the size of float4.");
+    copyToDeviceBuffer(&adat->xq, reinterpret_cast<const float4*>(nbatom->x().data()) + adat_begin,
+                       adat_begin, adat_len, deviceStream, GpuApiCallBehavior::Async, nullptr);
  
      if (bDoTime)
      {
@@ -761,8 +762,11 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
       */
      if (!stepWork.useGpuFBufferOps)
      {
-        cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
-                          (adat_len) * sizeof(*adat->f), deviceStream.stream());
+        static_assert(
+                sizeof(adat->f[0]) == sizeof(float3),
+                "The size of the force buffer element should be equal to the size of float3.");
+        copyFromDeviceBuffer(reinterpret_cast<float3*>(nbatom->out[0].f.data()) + adat_begin, &adat->f,
+                             adat_begin, adat_len, deviceStream, GpuApiCallBehavior::Async, nullptr);
      }
  
      /* After the non-local D2H is launched the nonlocal_done event can be
@@ -781,15 +785,24 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
          /* DtoH fshift when virial is needed */
          if (stepWork.computeVirial)
          {
-            cu_copy_D2H_async(nb->nbst.fshift, adat->fshift, SHIFTS * sizeof(*nb->nbst.fshift),
-                              deviceStream.stream());
+            static_assert(sizeof(nb->nbst.fshift[0]) == sizeof(adat->fshift[0]),
+                          "Sizes of host- and device-side shift vectors should be the same.");
+            copyFromDeviceBuffer(nb->nbst.fshift, &adat->fshift, 0, SHIFTS, deviceStream,
+                                 GpuApiCallBehavior::Async, nullptr);
          }
  
          /* DtoH energies */
          if (stepWork.computeEnergy)
          {
-            cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, sizeof(*nb->nbst.e_lj), deviceStream.stream());
-            cu_copy_D2H_async(nb->nbst.e_el, adat->e_el, sizeof(*nb->nbst.e_el), deviceStream.stream());
+            static_assert(sizeof(nb->nbst.e_lj[0]) == sizeof(adat->e_lj[0]),
+                          "Sizes of host- and device-side LJ energy terms should be the same.");
+            copyFromDeviceBuffer(nb->nbst.e_lj, &adat->e_lj, 0, 1, deviceStream,
+                                 GpuApiCallBehavior::Async, nullptr);
+            static_assert(sizeof(nb->nbst.e_el[0]) == sizeof(adat->e_el[0]),
+                          "Sizes of host- and device-side electrostatic energy terms should be the "
+                          "same.");
+            copyFromDeviceBuffer(nb->nbst.e_el, &adat->e_el, 0, 1, deviceStream,
+                                 GpuApiCallBehavior::Async, nullptr);
          }
      }
  
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu

index d41ee7d2acb58129e1b1ba106557eb3889147829..c6c7850493e3b5c4e2d76220bfd5213b167ce9b3 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -572,13 +572,16 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
  
  void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
  {
-    cu_atomdata_t* adat = nb->atdat;
-    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
+    cu_atomdata_t*      adat        = nb->atdat;
+    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
  
      /* only if we have a dynamic box */
      if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
      {
-        cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec.data(), SHIFTS * sizeof(*adat->shift_vec), ls);
+        static_assert(sizeof(adat->shift_vec[0]) == sizeof(nbatom->shift_vec[0]),
+                      "Sizes of host- and device-side shift vectors should be the same.");
+        copyToDeviceBuffer(&adat->shift_vec, reinterpret_cast<const float3*>(nbatom->shift_vec.data()),
+                           0, SHIFTS, localStream, GpuApiCallBehavior::Async, nullptr);
          adat->bShiftVecUploaded = true;
      }
  }
@@ -625,10 +628,10 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
      cudaError_t         stat;
      int                 nalloc, natoms;
      bool                realloced;
-    bool                bDoTime      = nb->bDoTime;
-    cu_timers_t*        timers       = nb->timers;
-    cu_atomdata_t*      d_atdat      = nb->atdat;
-    const DeviceStream& deviceStream = *nb->deviceStreams[InteractionLocality::Local];
+    bool                bDoTime     = nb->bDoTime;
+    cu_timers_t*        timers      = nb->timers;
+    cu_atomdata_t*      d_atdat     = nb->atdat;
+    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
  
      natoms    = nbat->numAtoms();
      realloced = false;
@@ -636,7 +639,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
      if (bDoTime)
      {
          /* time async copy */
-        timers->atdat.openTimingRegion(deviceStream);
+        timers->atdat.openTimingRegion(localStream);
      }
  
      /* need to reallocate if we have to copy more atoms than the amount of space
@@ -684,18 +687,23 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
  
      if (useLjCombRule(nb->nbparam))
      {
-        cu_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(),
-                          natoms * sizeof(*d_atdat->lj_comb), deviceStream.stream());
+        static_assert(sizeof(d_atdat->lj_comb[0]) == sizeof(float2),
+                      "Size of the LJ parameters element should be equal to the size of float2.");
+        copyToDeviceBuffer(&d_atdat->lj_comb,
+                           reinterpret_cast<const float2*>(nbat->params().lj_comb.data()), 0,
+                           natoms, localStream, GpuApiCallBehavior::Async, nullptr);
      }
      else
      {
-        cu_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(),
-                          natoms * sizeof(*d_atdat->atom_types), deviceStream.stream());
+        static_assert(sizeof(d_atdat->atom_types[0]) == sizeof(nbat->params().type[0]),
+                      "Sizes of host- and device-side atom types should be the same.");
+        copyToDeviceBuffer(&d_atdat->atom_types, nbat->params().type.data(), 0, natoms, localStream,
+                           GpuApiCallBehavior::Async, nullptr);
      }
  
      if (bDoTime)
      {
-        timers->atdat.closeTimingRegion(deviceStream);
+        timers->atdat.closeTimingRegion(localStream);
      }
  }
author	Artem Zhmurov <zhmurov@gmail.com>
	Mon, 4 May 2020 17:44:01 +0000 (17:44 +0000)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Mon, 4 May 2020 17:44:01 +0000 (17:44 +0000)
src/gromacs/gpu_utils/CMakeLists.txt		patch \| blob \| history
src/gromacs/gpu_utils/cudautils.cuh		patch \| blob \| history
src/gromacs/gpu_utils/devicebuffer.cuh		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu		patch \| blob \| history