From: Artem Zhmurov <zhmurov@gmail.com>
Date: Mon, 11 May 2020 08:31:34 +0000 (+0000)
Subject: Unify CUDA and OpenCL lookup-table creation
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=986b2bb150cd9e2d673e12ebfec2af4afb678069;p=alexxy%2Fgromacs.git

Unify CUDA and OpenCL lookup-table creation

In CUDA code, textures are used for the lookup-tables,
whereas in OpenCL they are created as read-only
buffers. This commit hides these differences behind a
unified wrapper.

Refs #3318
Refs #3311

Change-Id: I003e0c982c2452a2753e331b46fc59f0b7e1b711
---

diff --git a/src/gromacs/gpu_utils/devicebuffer_ocl.h b/src/gromacs/gpu_utils/devicebuffer_ocl.h
index f1eac5a9d1..40d72761ba 100644
--- a/src/gromacs/gpu_utils/devicebuffer_ocl.h
+++ b/src/gromacs/gpu_utils/devicebuffer_ocl.h
@@ -101,7 +101,8 @@ void freeDeviceBuffer(DeviceBuffer* buffer)
 /*! \brief
  * Performs the host-to-device data copy, synchronous or asynchronously on request.
  *
- * TODO: This is meant to gradually replace cu/ocl_copy_h2d.
+ * Note that synchronous copy will not synchronize the stream in case of zero \p numValues
+ * because of the early return.
  *
  * \tparam        ValueType            Raw value type of the \p buffer.
  * \param[in,out] buffer               Pointer to the device-side buffer
@@ -161,7 +162,8 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
 /*! \brief
  * Performs the device-to-host data copy, synchronous or asynchronously on request.
  *
- * TODO: This is meant to gradually replace cu/ocl_copy_d2h.
+ * Note that synchronous copy will not synchronize the stream in case of zero \p numValues
+ * because of the early return.
  *
  * \tparam        ValueType            Raw value type of the \p buffer.
  * \param[in,out] hostBuffer           Pointer to the raw host-side memory, also typed \p ValueType
@@ -183,6 +185,10 @@ void copyFromDeviceBuffer(ValueType*               hostBuffer,
                           GpuApiCallBehavior       transferKind,
                           CommandEvent*            timingEvent)
 {
+    if (numValues == 0)
+    {
+        return; // such calls are actually made with empty domains
+    }
     GMX_ASSERT(buffer, "needs a buffer pointer");
     GMX_ASSERT(hostBuffer, "needs a host buffer pointer");
     cl_int       clError;
diff --git a/src/gromacs/gpu_utils/oclutils.cpp b/src/gromacs/gpu_utils/oclutils.cpp
index f987ae00e7..726e4f2cff 100644
--- a/src/gromacs/gpu_utils/oclutils.cpp
+++ b/src/gromacs/gpu_utils/oclutils.cpp
@@ -53,116 +53,6 @@
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/smalloc.h"
 
-int ocl_copy_H2D(cl_mem             d_dest,
-                 const void*        h_src,
-                 size_t             offset,
-                 size_t             bytes,
-                 GpuApiCallBehavior transferKind,
-                 cl_command_queue   command_queue,
-                 cl_event*          copy_event)
-{
-    cl_int gmx_unused cl_error;
-
-    if (d_dest == nullptr || h_src == nullptr || bytes == 0)
-    {
-        return -1;
-    }
-
-    switch (transferKind)
-    {
-        case GpuApiCallBehavior::Async:
-            cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_FALSE, offset, bytes, h_src,
-                                            0, nullptr, copy_event);
-            break;
-
-        case GpuApiCallBehavior::Sync:
-            cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_TRUE, offset, bytes, h_src, 0,
-                                            nullptr, copy_event);
-            break;
-
-        default: throw;
-    }
-    GMX_ASSERT(cl_error == CL_SUCCESS,
-               ("clEnqueueWriteBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
-
-    return 0;
-}
-
-/*! \brief Launches asynchronous host to device memory copy.
- *
- *  If copy_event is not nullptr, on return it will contain an event object
- *  identifying this particular host to device operation. The event can further
- *  be used to queue a wait for this operation or to query profiling information.
- */
-int ocl_copy_H2D_async(cl_mem           d_dest,
-                       const void*      h_src,
-                       size_t           offset,
-                       size_t           bytes,
-                       cl_command_queue command_queue,
-                       cl_event*        copy_event)
-{
-    return ocl_copy_H2D(d_dest, h_src, offset, bytes, GpuApiCallBehavior::Async, command_queue, copy_event);
-}
-
-/*! \brief Launches synchronous host to device memory copy.
- */
-int ocl_copy_H2D_sync(cl_mem d_dest, const void* h_src, size_t offset, size_t bytes, cl_command_queue command_queue)
-{
-    return ocl_copy_H2D(d_dest, h_src, offset, bytes, GpuApiCallBehavior::Sync, command_queue, nullptr);
-}
-
-int ocl_copy_D2H(void*              h_dest,
-                 cl_mem             d_src,
-                 size_t             offset,
-                 size_t             bytes,
-                 GpuApiCallBehavior transferKind,
-                 cl_command_queue   command_queue,
-                 cl_event*          copy_event)
-{
-    cl_int gmx_unused cl_error;
-
-    if (h_dest == nullptr || d_src == nullptr || bytes == 0)
-    {
-        return -1;
-    }
-
-    switch (transferKind)
-    {
-        case GpuApiCallBehavior::Async:
-            cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_FALSE, offset, bytes, h_dest, 0,
-                                           nullptr, copy_event);
-            break;
-
-        case GpuApiCallBehavior::Sync:
-            cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_TRUE, offset, bytes, h_dest, 0,
-                                           nullptr, copy_event);
-            break;
-
-        default: throw;
-    }
-    GMX_ASSERT(cl_error == CL_SUCCESS,
-               ("clEnqueueWriteBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
-
-
-    return 0;
-}
-
-/*! \brief Launches asynchronous device to host memory copy.
- *
- *  If copy_event is not nullptr, on return it will contain an event object
- *  identifying this particular host to device operation. The event can further
- *  be used to queue a wait for this operation or to query profiling information.
- */
-int ocl_copy_D2H_async(void*            h_dest,
-                       cl_mem           d_src,
-                       size_t           offset,
-                       size_t           bytes,
-                       cl_command_queue command_queue,
-                       cl_event*        copy_event)
-{
-    return ocl_copy_D2H(h_dest, d_src, offset, bytes, GpuApiCallBehavior::Async, command_queue, copy_event);
-}
-
 /*! \brief \brief Allocates nbytes of host memory. Use ocl_free to free memory allocated with this function.
  *
  *  \todo
diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h
index ee445047fa..333147d78f 100644
--- a/src/gromacs/gpu_utils/oclutils.h
+++ b/src/gromacs/gpu_utils/oclutils.h
@@ -69,54 +69,6 @@ struct gmx_device_runtime_data_t
     cl_program program;
 };
 
-/*! \brief Launches synchronous or asynchronous device to host memory copy.
- *
- *  If copy_event is not NULL, on return it will contain an event object
- *  identifying this particular device to host operation. The event can further
- *  be used to queue a wait for this operation or to query profiling information.
- */
-int ocl_copy_D2H(void*              h_dest,
-                 cl_mem             d_src,
-                 size_t             offset,
-                 size_t             bytes,
-                 GpuApiCallBehavior transferKind,
-                 cl_command_queue   command_queue,
-                 cl_event*          copy_event);
-
-
-/*! \brief Launches asynchronous device to host memory copy. */
-int ocl_copy_D2H_async(void*            h_dest,
-                       cl_mem           d_src,
-                       size_t           offset,
-                       size_t           bytes,
-                       cl_command_queue command_queue,
-                       cl_event*        copy_event);
-
-/*! \brief Launches synchronous or asynchronous host to device memory copy.
- *
- *  If copy_event is not NULL, on return it will contain an event object
- *  identifying this particular host to device operation. The event can further
- *  be used to queue a wait for this operation or to query profiling information.
- */
-int ocl_copy_H2D(cl_mem             d_dest,
-                 const void*        h_src,
-                 size_t             offset,
-                 size_t             bytes,
-                 GpuApiCallBehavior transferKind,
-                 cl_command_queue   command_queue,
-                 cl_event*          copy_event);
-
-/*! \brief Launches asynchronous host to device memory copy. */
-int ocl_copy_H2D_async(cl_mem           d_dest,
-                       const void*      h_src,
-                       size_t           offset,
-                       size_t           bytes,
-                       cl_command_queue command_queue,
-                       cl_event*        copy_event);
-
-/*! \brief Launches synchronous host to device memory copy. */
-int ocl_copy_H2D_sync(cl_mem d_dest, const void* h_src, size_t offset, size_t bytes, cl_command_queue command_queue);
-
 /*! \brief Allocate host memory in malloc style */
 void pmalloc(void** h_ptr, size_t nbytes);
 
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
index a35a188400..b5018808f0 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -526,8 +526,10 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
     }
 
     /* HtoD x, q */
-    ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin * sizeof(float) * 4,
-                       adat_len * sizeof(float) * 4, deviceStream.stream(),
+    GMX_ASSERT(sizeof(float) == sizeof(*nbatom->x().data()),
+               "The size of the xyzq buffer element should be equal to the size of float4.");
+    copyToDeviceBuffer(&adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin * 4, adat_len * 4,
+                       deviceStream, GpuApiCallBehavior::Async,
                        bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
 
     if (bDoTime)
@@ -895,10 +897,11 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     }
 
     /* DtoH f */
-    ocl_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * DIM, adat->f,
-                       adat_begin * DIM * sizeof(nbatom->out[0].f[0]),
-                       adat_len * DIM * sizeof(nbatom->out[0].f[0]), deviceStream.stream(),
-                       bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+    GMX_ASSERT(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
+               "The size of the force buffer element should be equal to the size of float3.");
+    copyFromDeviceBuffer(&nbatom->out[0].f.data()[adat_begin * DIM], &adat->f, adat_begin * DIM,
+                         adat_len * DIM, deviceStream, GpuApiCallBehavior::Async,
+                         bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
 
     /* kick off work */
     cl_error = clFlush(deviceStream.stream());
@@ -922,19 +925,25 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
         /* DtoH fshift when virial is needed */
         if (stepWork.computeVirial)
         {
-            ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
-                               SHIFTS * sizeof(nb->nbst.fshift[0]), deviceStream.stream(),
-                               bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+            GMX_ASSERT(sizeof(*nb->nbst.fshift) == DIM * sizeof(float),
+                       "Sizes of host- and device-side shift vectors should be the same.");
+            copyFromDeviceBuffer(reinterpret_cast<float*>(nb->nbst.fshift), &adat->fshift, 0,
+                                 SHIFTS * DIM, deviceStream, GpuApiCallBehavior::Async,
+                                 bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
         }
 
         /* DtoH energies */
         if (stepWork.computeEnergy)
         {
-            ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, sizeof(float), deviceStream.stream(),
-                               bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
-
-            ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0, sizeof(float), deviceStream.stream(),
-                               bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+            GMX_ASSERT(sizeof(*nb->nbst.e_lj) == sizeof(float),
+                       "Sizes of host- and device-side LJ energy terms should be the same.");
+            copyFromDeviceBuffer(nb->nbst.e_lj, &adat->e_lj, 0, 1, deviceStream, GpuApiCallBehavior::Async,
+                                 bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+            GMX_ASSERT(sizeof(*nb->nbst.e_el) == sizeof(float),
+                       "Sizes of host- and device-side electrostatic energy terms should be the "
+                       "same.");
+            copyFromDeviceBuffer(nb->nbst.e_el, &adat->e_el, 0, 1, deviceStream, GpuApiCallBehavior::Async,
+                                 bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
         }
     }
 
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
index 4f1e9fc65c..e0c25b2d73 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
@@ -677,14 +677,16 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
 //! This function is documented in the header file
 void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
 {
-    cl_atomdata_t*   adat = nb->atdat;
-    cl_command_queue ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
+    cl_atomdata_t*      adat         = nb->atdat;
+    const DeviceStream& deviceStream = *nb->deviceStreams[InteractionLocality::Local];
 
     /* only if we have a dynamic box */
     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
     {
-        ocl_copy_H2D_async(adat->shift_vec, nbatom->shift_vec.data(), 0,
-                           SHIFTS * sizeof(nbatom->shift_vec[0]), ls, nullptr);
+        GMX_ASSERT(sizeof(float) * DIM == sizeof(*nbatom->shift_vec.data()),
+                   "Sizes of host- and device-side shift vectors should be the same.");
+        copyToDeviceBuffer(&adat->shift_vec, reinterpret_cast<const float*>(nbatom->shift_vec.data()),
+                           0, SHIFTS * DIM, deviceStream, GpuApiCallBehavior::Async, nullptr);
         adat->bShiftVecUploaded = CL_TRUE;
     }
 }
@@ -766,13 +768,18 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
 
     if (useLjCombRule(nb->nbparam->vdwtype))
     {
-        ocl_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(), 0, natoms * sizeof(cl_float2),
-                           deviceStream.stream(), bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
+        GMX_ASSERT(sizeof(float) == sizeof(*nbat->params().lj_comb.data()),
+                   "Size of the LJ parameters element should be equal to the size of float2.");
+        copyToDeviceBuffer(&d_atdat->lj_comb, nbat->params().lj_comb.data(), 0, 2 * natoms,
+                           deviceStream, GpuApiCallBehavior::Async,
+                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
     }
     else
     {
-        ocl_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(), 0, natoms * sizeof(int),
-                           deviceStream.stream(), bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
+        GMX_ASSERT(sizeof(int) == sizeof(*nbat->params().type.data()),
+                   "Sizes of host- and device-side atom types should be the same.");
+        copyToDeviceBuffer(&d_atdat->atom_types, nbat->params().type.data(), 0, natoms, deviceStream,
+                           GpuApiCallBehavior::Async, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
     }
 
     if (bDoTime)
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
index cda2294783..f447ce9d48 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
@@ -164,28 +164,28 @@ typedef struct cl_atomdata
     int nalloc;
 
     //! float4 buffer with atom coordinates + charges, size natoms
-    cl_mem xq;
+    DeviceBuffer<float> xq;
 
     //! float3 buffer with force output array, size natoms
-    cl_mem f;
+    DeviceBuffer<float> f;
 
     //! LJ energy output, size 1
-    cl_mem e_lj;
+    DeviceBuffer<float> e_lj;
     //! Electrostatics energy input, size 1
-    cl_mem e_el;
+    DeviceBuffer<float> e_el;
 
     //! float3 buffer with shift forces
-    cl_mem fshift;
+    DeviceBuffer<float> fshift;
 
     //! number of atom types
     int ntypes;
     //! int buffer with atom type indices, size natoms
-    cl_mem atom_types;
+    DeviceBuffer<int> atom_types;
     //! float2 buffer with sqrt(c6),sqrt(c12), size natoms
-    cl_mem lj_comb;
+    DeviceBuffer<float> lj_comb;
 
     //! float3 buffer with shifts values
-    cl_mem shift_vec;
+    DeviceBuffer<float> shift_vec;
 
     //! true if the shift vector has been uploaded
     bool bShiftVecUploaded;