From 5f8899ba4bca7b8d158242e1b426fc7f9fcbe787 Mon Sep 17 00:00:00 2001
From: Artem Zhmurov
Date: Mon, 4 May 2020 17:44:01 +0000
Subject: [PATCH] Unify CUDA and OpenCL lookup-table creation

In CUDA code, textures are used for the lookup tables, whereas in
OpenCL they are created as read-only buffers. This commit hides these
differences behind a unified wrapper.

Refs #3318
Refs #3311

Change-Id: I003e0c982c2452a2753e331b46fc59f0b7e1b711
---
 src/gromacs/gpu_utils/CMakeLists.txt          |  1 -
 src/gromacs/gpu_utils/cudautils.cuh           | 24 -------------
 src/gromacs/gpu_utils/devicebuffer.cuh        | 10 +++---
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu          | 33 ++++++++++++------
 .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu        | 34 ++++++++++++-------
 5 files changed, 49 insertions(+), 53 deletions(-)

diff --git a/src/gromacs/gpu_utils/CMakeLists.txt b/src/gromacs/gpu_utils/CMakeLists.txt
index e8c02ccf31..2ad27010cf 100644
--- a/src/gromacs/gpu_utils/CMakeLists.txt
+++ b/src/gromacs/gpu_utils/CMakeLists.txt
@@ -54,7 +54,6 @@ if(GMX_USE_OPENCL)
         )
 elseif(GMX_USE_CUDA)
     gmx_add_libgromacs_sources(
-        cudautils.cu
         device_stream.cu
         gpu_utils.cu
         pinning.cu
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh
index 02eec85bbd..2322dd8793 100644
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -142,30 +142,6 @@ enum class GpuApiCallBehavior;
 
 #endif /* CHECK_CUDA_ERRORS */
 
-/*! Launches synchronous or asynchronous device to host memory copy.
- *
- * The copy is launched in stream s or if not specified, in stream 0.
- */
-int cu_copy_D2H(void* h_dest, void* d_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = nullptr*/);
-
-/*! Launches synchronous host to device memory copy in stream 0. */
-int cu_copy_D2H_sync(void* /*h_dest*/, void* /*d_src*/, size_t /*bytes*/);
-
-/*! Launches asynchronous host to device memory copy in stream s. */
-int cu_copy_D2H_async(void* /*h_dest*/, void* /*d_src*/, size_t /*bytes*/, cudaStream_t /*s = nullptr*/);
-
-/*! Launches synchronous or asynchronous host to device memory copy.
- *
- * The copy is launched in stream s or if not specified, in stream 0.
- */
-int cu_copy_H2D(void* d_dest, const void* h_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = nullptr*/);
-
-/*! Launches synchronous host to device memory copy. */
-int cu_copy_H2D_sync(void* /*d_dest*/, const void* /*h_src*/, size_t /*bytes*/);
-
-/*! Launches asynchronous host to device memory copy in stream s. */
-int cu_copy_H2D_async(void* /*d_dest*/, const void* /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = nullptr*/);
-
 // TODO: the 2 functions below are pretty much a constructor/destructor of a simple
 // GPU table object. There is also almost self-contained fetchFromParamLookupTable()
 // in cuda_kernel_utils.cuh. They could all live in a separate class/struct file.
diff --git a/src/gromacs/gpu_utils/devicebuffer.cuh b/src/gromacs/gpu_utils/devicebuffer.cuh
index c9bce141fe..3aabf4285b 100644
--- a/src/gromacs/gpu_utils/devicebuffer.cuh
+++ b/src/gromacs/gpu_utils/devicebuffer.cuh
@@ -91,8 +91,6 @@ void freeDeviceBuffer(DeviceBuffer* buffer)
 
 /*! \brief
  * Performs the host-to-device data copy, synchronous or asynchronously on request.
  *
- * TODO: This is meant to gradually replace cu/ocl_copy_h2d.
- *
  * \tparam ValueType Raw value type of the \p buffer.
  * \param[in,out] buffer Pointer to the device-side buffer
@@ -114,7 +112,7 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
 {
     if (numValues == 0)
     {
-        return; // such calls are actually made with empty domains
+        return;
     }
     GMX_ASSERT(buffer, "needs a buffer pointer");
     GMX_ASSERT(hostBuffer, "needs a host buffer pointer");
@@ -144,8 +142,6 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
 /*! \brief
  * Performs the device-to-host data copy, synchronous or asynchronously on request.
  *
- * TODO: This is meant to gradually replace cu/ocl_copy_d2h.
- *
  * \tparam ValueType Raw value type of the \p buffer.
  * \param[in,out] hostBuffer Pointer to the raw host-side memory, also typed \p ValueType
  * \param[in] buffer Pointer to the device-side buffer
@@ -165,6 +161,10 @@ void copyFromDeviceBuffer(ValueType* hostBuffer,
                          GpuApiCallBehavior transferKind,
                          CommandEvent* /*timingEvent*/)
 {
+    if (numValues == 0)
+    {
+        return;
+    }
     GMX_ASSERT(buffer, "needs a buffer pointer");
     GMX_ASSERT(hostBuffer, "needs a host buffer pointer");
 
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 95aca5ba1d..8c76c4e95f 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -54,7 +54,7 @@
 
 #include "nbnxm_cuda.h"
 
-#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/typecasts.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"
@@ -444,9 +444,10 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
         t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
     }
 
-    cu_copy_H2D_async(adat->xq + adat_begin,
-                      static_cast<const void*>(nbatom->x().data() + adat_begin * 4),
-                      adat_len * sizeof(*adat->xq), deviceStream.stream());
+    static_assert(sizeof(adat->xq[0]) == sizeof(float4),
+                  "The size of the xyzq buffer element should be equal to the size of float4.");
+    copyToDeviceBuffer(&adat->xq, reinterpret_cast<const float4*>(nbatom->x().data()) + adat_begin,
+                       adat_begin, adat_len, deviceStream, GpuApiCallBehavior::Async, nullptr);
 
     if (bDoTime)
     {
@@ -761,8 +762,11 @@ void gpu_launch_cpyback(NbnxmGpu* nb,
      */
    if (!stepWork.useGpuFBufferOps)
    {
-        cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
-                          (adat_len) * sizeof(*adat->f), deviceStream.stream());
+        static_assert(
+                sizeof(adat->f[0]) == sizeof(float3),
+                "The size of the force buffer element should be equal to the size of float3.");
+        copyFromDeviceBuffer(reinterpret_cast<float3*>(nbatom->out[0].f.data()) + adat_begin, &adat->f,
+                             adat_begin, adat_len, deviceStream, GpuApiCallBehavior::Async, nullptr);
    }
 
    /* After the non-local D2H is launched the nonlocal_done event can be
@@ -781,15 +785,24 @@ void gpu_launch_cpyback(NbnxmGpu* nb,
         /* DtoH fshift when virial is needed */
         if (stepWork.computeVirial)
         {
-            cu_copy_D2H_async(nb->nbst.fshift, adat->fshift, SHIFTS * sizeof(*nb->nbst.fshift),
-                              deviceStream.stream());
+            static_assert(sizeof(nb->nbst.fshift[0]) == sizeof(adat->fshift[0]),
+                          "Sizes of host- and device-side shift vectors should be the same.");
+            copyFromDeviceBuffer(nb->nbst.fshift, &adat->fshift, 0, SHIFTS, deviceStream,
+                                 GpuApiCallBehavior::Async, nullptr);
         }
 
         /* DtoH energies */
         if (stepWork.computeEnergy)
         {
-            cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, sizeof(*nb->nbst.e_lj), deviceStream.stream());
-            cu_copy_D2H_async(nb->nbst.e_el, adat->e_el, sizeof(*nb->nbst.e_el), deviceStream.stream());
+            static_assert(sizeof(nb->nbst.e_lj[0]) == sizeof(adat->e_lj[0]),
+                          "Sizes of host- and device-side LJ energy terms should be the same.");
+            copyFromDeviceBuffer(nb->nbst.e_lj, &adat->e_lj, 0, 1, deviceStream,
+                                 GpuApiCallBehavior::Async, nullptr);
+            static_assert(sizeof(nb->nbst.e_el[0]) == sizeof(adat->e_el[0]),
+                          "Sizes of host- and device-side electrostatic energy terms should be the "
+                          "same.");
+            copyFromDeviceBuffer(nb->nbst.e_el, &adat->e_el, 0, 1, deviceStream,
+                                 GpuApiCallBehavior::Async, nullptr);
         }
     }
 }
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index d41ee7d2ac..c6c7850493 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -572,13 +572,16 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
 
 void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
 {
-    cu_atomdata_t* adat = nb->atdat;
-    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
+    cu_atomdata_t*      adat        = nb->atdat;
+    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
 
     /* only if we have a dynamic box */
     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
     {
-        cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec.data(), SHIFTS * sizeof(*adat->shift_vec), ls);
+        static_assert(sizeof(adat->shift_vec[0]) == sizeof(nbatom->shift_vec[0]),
+                      "Sizes of host- and device-side shift vectors should be the same.");
+        copyToDeviceBuffer(&adat->shift_vec, reinterpret_cast<const float3*>(nbatom->shift_vec.data()),
+                           0, SHIFTS, localStream, GpuApiCallBehavior::Async, nullptr);
         adat->bShiftVecUploaded = true;
     }
 }
@@ -625,10 +628,10 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
     cudaError_t         stat;
     int                 nalloc, natoms;
     bool                realloced;
-    bool                bDoTime      = nb->bDoTime;
-    cu_timers_t*        timers       = nb->timers;
-    cu_atomdata_t*      d_atdat      = nb->atdat;
-    const DeviceStream& deviceStream = *nb->deviceStreams[InteractionLocality::Local];
+    bool                bDoTime     = nb->bDoTime;
+    cu_timers_t*        timers      = nb->timers;
+    cu_atomdata_t*      d_atdat     = nb->atdat;
+    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
 
     natoms    = nbat->numAtoms();
     realloced = false;
@@ -636,7 +639,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
     if (bDoTime)
     {
         /* time async copy */
-        timers->atdat.openTimingRegion(deviceStream);
+        timers->atdat.openTimingRegion(localStream);
     }
 
     /* need to reallocate if we have to copy more atoms than the amount of space
@@ -684,18 +687,23 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
 
     if (useLjCombRule(nb->nbparam))
     {
-        cu_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(),
-                          natoms * sizeof(*d_atdat->lj_comb), deviceStream.stream());
+        static_assert(sizeof(d_atdat->lj_comb[0]) == sizeof(float2),
+                      "Size of the LJ parameters element should be equal to the size of float2.");
+        copyToDeviceBuffer(&d_atdat->lj_comb,
+                           reinterpret_cast<const float2*>(nbat->params().lj_comb.data()), 0,
+                           natoms, localStream, GpuApiCallBehavior::Async, nullptr);
    }
    else
    {
-        cu_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(),
-                          natoms * sizeof(*d_atdat->atom_types), deviceStream.stream());
+        static_assert(sizeof(d_atdat->atom_types[0]) == sizeof(nbat->params().type[0]),
+                      "Sizes of host- and device-side atom types should be the same.");
+        copyToDeviceBuffer(&d_atdat->atom_types, nbat->params().type.data(), 0, natoms, localStream,
+                           GpuApiCallBehavior::Async, nullptr);
    }
 
    if (bDoTime)
    {
-        timers->atdat.closeTimingRegion(deviceStream);
+        timers->atdat.closeTimingRegion(localStream);
    }
 }
 
-- 
2.22.0
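
Reviewer note (not part of the patch): the sketch below is a minimal, hypothetical
illustration of the wrapper call signatures that the hunks above switch to, i.e.
copyToDeviceBuffer(&deviceBuffer, hostPtr, offset, count, stream, behavior, timingEvent)
and the mirrored copyFromDeviceBuffer(hostPtr, &deviceBuffer, ...). The helper name
roundTripFloat3, its parameters, and the include set are assumptions made for the
example only; the nullptr timing event matches how the calls are made in nbnxm_cuda.cu.

#include "gromacs/gpu_utils/devicebuffer.cuh" // templated copy wrappers
#include "gromacs/gpu_utils/gpu_utils.h"      // same include the patch adds in nbnxm_cuda.cu

/* Hypothetical example only: round-trip numValues float3 elements between an
 * already-allocated device buffer and host memory, asynchronously in
 * deviceStream, without event-based timing (nullptr timing event). */
static void roundTripFloat3(DeviceBuffer<float3>* d_buffer,
                            float3*               h_buffer,
                            int                   numValues,
                            const DeviceStream&   deviceStream)
{
    // Host to device: (&device buffer, host pointer, starting offset, element count, ...)
    copyToDeviceBuffer(d_buffer, h_buffer, 0, numValues, deviceStream,
                       GpuApiCallBehavior::Async, nullptr);
    // Device to host: (host pointer, &device buffer, starting offset, element count, ...)
    copyFromDeviceBuffer(h_buffer, d_buffer, 0, numValues, deviceStream,
                         GpuApiCallBehavior::Async, nullptr);
}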