Unify CUDA and OpenCL lookup-table creation
authorArtem Zhmurov <zhmurov@gmail.com>
Thu, 16 Apr 2020 11:38:32 +0000 (11:38 +0000)
committerMark Abraham <mark.j.abraham@gmail.com>
Thu, 16 Apr 2020 11:38:32 +0000 (11:38 +0000)
In CUDA code, textures are used for the lookup-tables,
whereas in OpenCL they are created as a read-only
buffers. This commit hides these differences behind a
unified wrapper.

Refs #3318
Refs #3311

Change-Id: I003e0c982c2452a2753e331b46fc59f0b7e1b711

src/gromacs/ewald/pme.cuh
src/gromacs/ewald/pme_gpu_internal.cpp
src/gromacs/ewald/pme_gpu_types.h
src/gromacs/gpu_utils/cudautils.cu
src/gromacs/gpu_utils/cudautils.cuh
src/gromacs/gpu_utils/devicebuffer.cuh
src/gromacs/gpu_utils/devicebuffer_ocl.h
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp

index f30a975a229a9964b399a2178087ea9fe1b6af53..af0e258ae98fff341a239bfa03446f9a44f67ff5 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 #include "pme_gpu_types.h"
 
 /*! \brief \internal
- * A single structure encompassing all the PME data used in CUDA kernels.
- * This inherits from PmeGpuKernelParamsBase and adds a couple cudaTextureObject_t handles,
- * which we would like to avoid in plain C++.
+ * An alias for PME parameters in CUDA.
+ * \todo Remove if we decide to unify CUDA and OpenCL
  */
 struct PmeGpuCudaKernelParams : PmeGpuKernelParamsBase
 {
-    /* These are CUDA texture objects, related to the grid size. */
-    /*! \brief CUDA texture object for accessing grid.d_fractShiftsTable */
-    cudaTextureObject_t fractShiftsTableTexture;
-    /*! \brief CUDA texture object for accessing grid.d_gridlineIndicesTable */
-    cudaTextureObject_t gridlineIndicesTableTexture;
+    // Place CUDA-specific stuff here
 };
 
 #endif
index 62f7cc5c7e26a1d7fa29fe99f5b164ca712b1d30..63b77aa86f81326aee15640c164f02ee64722ab1 100644 (file)
@@ -367,35 +367,22 @@ void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu)
 
     const int newFractShiftsSize = cellCount * (nx + ny + nz);
 
-#if GMX_GPU == GMX_GPU_CUDA
-    initParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable, kernelParamsPtr->fractShiftsTableTexture,
-                         pmeGpu->common->fsh.data(), newFractShiftsSize);
+    initParamLookupTable(&kernelParamsPtr->grid.d_fractShiftsTable,
+                         &kernelParamsPtr->fractShiftsTableTexture, pmeGpu->common->fsh.data(),
+                         newFractShiftsSize, pmeGpu->archSpecific->deviceContext_);
 
-    initParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
-                         kernelParamsPtr->gridlineIndicesTableTexture, pmeGpu->common->nn.data(),
-                         newFractShiftsSize);
-#elif GMX_GPU == GMX_GPU_OPENCL
-    // No dedicated texture routines....
-    allocateDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, newFractShiftsSize,
-                         pmeGpu->archSpecific->deviceContext_);
-    allocateDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, newFractShiftsSize,
-                         pmeGpu->archSpecific->deviceContext_);
-    copyToDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, pmeGpu->common->fsh.data(), 0,
-                       newFractShiftsSize, pmeGpu->archSpecific->pmeStream_,
-                       GpuApiCallBehavior::Async, nullptr);
-    copyToDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, pmeGpu->common->nn.data(), 0,
-                       newFractShiftsSize, pmeGpu->archSpecific->pmeStream_,
-                       GpuApiCallBehavior::Async, nullptr);
-#endif
+    initParamLookupTable(&kernelParamsPtr->grid.d_gridlineIndicesTable,
+                         &kernelParamsPtr->gridlineIndicesTableTexture, pmeGpu->common->nn.data(),
+                         newFractShiftsSize, pmeGpu->archSpecific->deviceContext_);
 }
 
 void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu)
 {
     auto* kernelParamsPtr = pmeGpu->kernelParams.get();
 #if GMX_GPU == GMX_GPU_CUDA
-    destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
+    destroyParamLookupTable(&kernelParamsPtr->grid.d_fractShiftsTable,
                             kernelParamsPtr->fractShiftsTableTexture);
-    destroyParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
+    destroyParamLookupTable(&kernelParamsPtr->grid.d_gridlineIndicesTable,
                             kernelParamsPtr->gridlineIndicesTableTexture);
 #elif GMX_GPU == GMX_GPU_OPENCL
     freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable);
index 5ddd79fb5a39618557d717fbd48c7bc3ae33dc94..97f84164996cd8c5ea685e26f790250194ef5129 100644 (file)
@@ -199,11 +199,9 @@ struct PmeGpuDynamicParams
 };
 
 /*! \internal \brief
- * A single structure encompassing almost all the PME data used in GPU kernels on device.
- * This is inherited by the GPU framework-specific structure
- * (PmeGpuCudaKernelParams in pme.cuh).
- * This way, most code preparing the kernel parameters can be GPU-agnostic by casting
- * the kernel parameter data pointer to PmeGpuKernelParamsBase.
+ * A single structure encompassing all the PME data used in GPU kernels on device.
+ * To extend the list with platform-specific parameters, this can be inherited by the
+ * GPU framework-specific structure.
  */
 struct PmeGpuKernelParamsBase
 {
@@ -218,6 +216,11 @@ struct PmeGpuKernelParamsBase
      * before launching spreading.
      */
     struct PmeGpuDynamicParams current;
+    /* These texture objects are only used in CUDA and are related to the grid size. */
+    /*! \brief Texture object for accessing grid.d_fractShiftsTable */
+    HIDE_FROM_OPENCL_COMPILER(DeviceTexture) fractShiftsTableTexture;
+    /*! \brief Texture object for accessing grid.d_gridlineIndicesTable */
+    HIDE_FROM_OPENCL_COMPILER(DeviceTexture) gridlineIndicesTableTexture;
 };
 
 #endif
index 1442bf47c2298462be116a3ec7ab2ef0d4de138a..83ee02e3b5d71583b5b0853f94e55ab06330d024 100644 (file)
@@ -130,67 +130,3 @@ int cu_copy_H2D_async(void* d_dest, const void* h_src, size_t bytes, cudaStream_
 {
     return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Async, s);
 }
-
-/*! \brief Set up texture object for an array of type T.
- *
- * Set up texture object for an array of type T and bind it to the device memory
- * \p d_ptr points to.
- *
- * \tparam[in] T        Raw data type
- * \param[out] texObj   texture object to initialize
- * \param[in]  d_ptr    pointer to device global memory to bind \p texObj to
- * \param[in]  sizeInBytes  size of memory area to bind \p texObj to
- */
-template<typename T>
-static void setup1DTexture(cudaTextureObject_t& texObj, void* d_ptr, size_t sizeInBytes)
-{
-    assert(!c_disableCudaTextures);
-
-    cudaError_t      stat;
-    cudaResourceDesc rd;
-    cudaTextureDesc  td;
-
-    memset(&rd, 0, sizeof(rd));
-    rd.resType                = cudaResourceTypeLinear;
-    rd.res.linear.devPtr      = d_ptr;
-    rd.res.linear.desc        = cudaCreateChannelDesc<T>();
-    rd.res.linear.sizeInBytes = sizeInBytes;
-
-    memset(&td, 0, sizeof(td));
-    td.readMode = cudaReadModeElementType;
-    stat        = cudaCreateTextureObject(&texObj, &rd, &td, nullptr);
-    CU_RET_ERR(stat, "cudaCreateTextureObject failed");
-}
-
-template<typename T>
-void initParamLookupTable(T*& d_ptr, cudaTextureObject_t& texObj, const T* h_ptr, int numElem)
-{
-    const size_t sizeInBytes = numElem * sizeof(*d_ptr);
-    cudaError_t  stat        = cudaMalloc((void**)&d_ptr, sizeInBytes);
-    CU_RET_ERR(stat, "cudaMalloc failed in initParamLookupTable");
-    cu_copy_H2D_sync(d_ptr, (void*)h_ptr, sizeInBytes);
-
-    if (!c_disableCudaTextures)
-    {
-        setup1DTexture<T>(texObj, d_ptr, sizeInBytes);
-    }
-}
-
-template<typename T>
-void destroyParamLookupTable(T* d_ptr, cudaTextureObject_t texObj)
-{
-    if (!c_disableCudaTextures && texObj)
-    {
-        CU_RET_ERR(cudaDestroyTextureObject(texObj), "cudaDestroyTextureObject on texObj failed");
-    }
-    CU_RET_ERR(cudaFree(d_ptr), "cudaFree failed");
-}
-
-/*! \brief Add explicit instantiations of init/destroyParamLookupTable() here as needed.
- * One should also verify that the result of cudaCreateChannelDesc<T>() during texture setup
- * looks reasonable, when instantiating the templates for new types - just in case.
- */
-template void initParamLookupTable<float>(float*&, cudaTextureObject_t&, const float*, int);
-template void destroyParamLookupTable<float>(float*, cudaTextureObject_t);
-template void initParamLookupTable<int>(int*&, cudaTextureObject_t&, const int*, int);
-template void destroyParamLookupTable<int>(int*, cudaTextureObject_t);
index ff07d174c220323140f07c09152d02051bbd79cb..02eec85bbdd5a94ba799cf6b9467a1992d0a6364 100644 (file)
@@ -170,41 +170,6 @@ int cu_copy_H2D_async(void* /*d_dest*/, const void* /*h_src*/, size_t /*bytes*/,
 // GPU table object. There is also almost self-contained fetchFromParamLookupTable()
 // in cuda_kernel_utils.cuh. They could all live in a separate class/struct file.
 
-/*! \brief Initialize parameter lookup table.
- *
- * Initializes device memory, copies data from host and binds
- * a texture to allocated device memory to be used for parameter lookup.
- *
- * \tparam[in] T         Raw data type
- * \param[out] d_ptr     device pointer to the memory to be allocated
- * \param[out] texObj    texture object to be initialized
- * \param[in]  h_ptr     pointer to the host memory to be uploaded to the device
- * \param[in]  numElem   number of elements in the h_ptr
- */
-template<typename T>
-void initParamLookupTable(T*& d_ptr, cudaTextureObject_t& texObj, const T* h_ptr, int numElem);
-
-// Add extern declarations so each translation unit understands that
-// there will be a definition provided.
-extern template void initParamLookupTable<int>(int*&, cudaTextureObject_t&, const int*, int);
-extern template void initParamLookupTable<float>(float*&, cudaTextureObject_t&, const float*, int);
-
-/*! \brief Destroy parameter lookup table.
- *
- * Unbinds texture object, deallocates device memory.
- *
- * \tparam[in] T         Raw data type
- * \param[in]  d_ptr     Device pointer to the memory to be deallocated
- * \param[in]  texObj    Texture object to be deinitialized
- */
-template<typename T>
-void destroyParamLookupTable(T* d_ptr, cudaTextureObject_t texObj);
-
-// Add extern declarations so each translation unit understands that
-// there will be a definition provided.
-extern template void destroyParamLookupTable<int>(int*, cudaTextureObject_t);
-extern template void destroyParamLookupTable<float>(float*, cudaTextureObject_t);
-
 /*! \brief Add a triplets stored in a float3 to an rvec variable.
  *
  * \param[out]  a Rvec to increment
index d4bfe8c35b5b47fc6fd7c3f6190489b99fcc6abf..c9bce141fef3d338a9f19238b77db36c52aed7d2 100644 (file)
  *  \inlibraryapi
  */
 
+#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
 #include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gpu_utils.h" //only for GpuApiCallBehavior
 #include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/stringutil.h"
 
 /*! \brief
  * Allocates a device-side buffer.
@@ -139,7 +141,6 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
     }
 }
 
-
 /*! \brief
  * Performs the device-to-host data copy, synchronous or asynchronously on request.
  *
@@ -231,4 +232,90 @@ static bool checkDeviceBuffer(DeviceBuffer<T> buffer, gmx_unused int requiredSiz
     return buffer != nullptr;
 }
 
+//! Device texture wrapper.
+using DeviceTexture = cudaTextureObject_t;
+
+/*! \brief Create a texture object for an array of type ValueType.
+ *
+ * Creates the device buffer, copies data and binds texture object for an array of type ValueType.
+ *
+ * \todo Test if using textures is still relevant on modern hardware.
+ *
+ * \tparam      ValueType      Raw data type.
+ *
+ * \param[out]  deviceBuffer   Device buffer to store data in.
+ * \param[out]  deviceTexture  Device texture object to initialize.
+ * \param[in]   hostBuffer     Host buffer to get date from
+ * \param[in]   numValues      Number of elements in the buffer.
+ * \param[in]   deviceContext  GPU device context.
+ */
+template<typename ValueType>
+void initParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer,
+                          DeviceTexture*           deviceTexture,
+                          const ValueType*         hostBuffer,
+                          int                      numValues,
+                          const DeviceContext&     deviceContext)
+{
+    if (numValues == 0)
+    {
+        return;
+    }
+    GMX_ASSERT(hostBuffer, "Host buffer should be specified.");
+
+    allocateDeviceBuffer(deviceBuffer, numValues, deviceContext);
+
+    const size_t sizeInBytes = numValues * sizeof(ValueType);
+
+    cudaError_t stat =
+            cudaMemcpy(*((ValueType**)deviceBuffer), hostBuffer, sizeInBytes, cudaMemcpyHostToDevice);
+
+    GMX_RELEASE_ASSERT(
+            stat == cudaSuccess,
+            gmx::formatString("Synchronous H2D copy failed (CUDA error: %s).", cudaGetErrorName(stat))
+                    .c_str());
+
+    if (!c_disableCudaTextures)
+    {
+        cudaResourceDesc rd;
+        cudaTextureDesc  td;
+
+        memset(&rd, 0, sizeof(rd));
+        rd.resType                = cudaResourceTypeLinear;
+        rd.res.linear.devPtr      = *deviceBuffer;
+        rd.res.linear.desc        = cudaCreateChannelDesc<ValueType>();
+        rd.res.linear.sizeInBytes = sizeInBytes;
+
+        memset(&td, 0, sizeof(td));
+        td.readMode = cudaReadModeElementType;
+        stat        = cudaCreateTextureObject(deviceTexture, &rd, &td, nullptr);
+        GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                           gmx::formatString("cudaCreateTextureObject failed (CUDA error: %s).",
+                                             cudaGetErrorName(stat))
+                                   .c_str());
+    }
+}
+
+/*! \brief Unbind the texture and release the CUDA texture object.
+ *
+ * \tparam         ValueType      Raw data type
+ *
+ * \param[in,out]  deviceBuffer   Device buffer to store data in.
+ * \param[in,out]  deviceTexture  Device texture object to unbind.
+ */
+template<typename ValueType>
+void destroyParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer, DeviceTexture& deviceTexture)
+{
+    if (!c_disableCudaTextures && deviceTexture && deviceBuffer)
+    {
+        cudaError_t stat = cudaDestroyTextureObject(deviceTexture);
+        GMX_RELEASE_ASSERT(
+                stat == cudaSuccess,
+                gmx::formatString(
+                        "cudaDestroyTextureObject on texture object failed (CUDA error: %s).",
+                        cudaGetErrorName(stat))
+                        .c_str());
+    }
+    freeDeviceBuffer(deviceBuffer);
+}
+
 #endif
index a8aa77b04624f33e29ffa64a73008e18e67215d3..f1eac5a9d126e4d9cba5d540694985d64fdb1881 100644 (file)
@@ -270,6 +270,53 @@ static bool checkDeviceBuffer(DeviceBuffer<T> buffer, int requiredSize)
     return retval == CL_SUCCESS && static_cast<int>(size) >= requiredSize;
 }
 
+//! Device texture wrapper.
+using DeviceTexture = void*;
+
+/*! \brief Create a texture object for an array of type ValueType.
+ *
+ * Creates the device buffer and copies read-only data for an array of type ValueType.
+ *
+ * \todo Decide if using image2d is most efficient.
+ *
+ * \tparam      ValueType      Raw data type.
+ *
+ * \param[out]  deviceBuffer   Device buffer to store data in.
+ * \param[in]   hostBuffer     Host buffer to get date from.
+ * \param[in]   numValues      Number of elements in the buffer.
+ * \param[in]   deviceContext  GPU device context.
+ */
+template<typename ValueType>
+void initParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer,
+                          DeviceTexture* /* deviceTexture */,
+                          const ValueType*     hostBuffer,
+                          int                  numValues,
+                          const DeviceContext& deviceContext)
+{
+    GMX_ASSERT(hostBuffer, "Host buffer pointer can not be null");
+    const size_t bytes = numValues * sizeof(ValueType);
+    cl_int       clError;
+    *deviceBuffer = clCreateBuffer(deviceContext.context(),
+                                   CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+                                   bytes, const_cast<ValueType*>(hostBuffer), &clError);
+
+    GMX_RELEASE_ASSERT(clError == CL_SUCCESS,
+                       gmx::formatString("Constant memory allocation failed (OpenCL error %d: %s)",
+                                         clError, ocl_get_error_string(clError).c_str())
+                               .c_str());
+}
+
+/*! \brief Release the OpenCL device buffer.
+ *
+ * \tparam        ValueType     Raw data type.
+ *
+ * \param[in,out] deviceBuffer  Device buffer to store data in.
+ */
+template<typename ValueType>
+void destroyParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer, DeviceTexture& /* deviceTexture*/)
+{
+    freeDeviceBuffer(deviceBuffer);
+}
 #if defined(__clang__)
 #    pragma clang diagnostic pop
 #endif
index 6579d4100497e1784d483c4e942920e10f506a0d..d41ee7d2acb58129e1b1ba106557eb3889147829 100644 (file)
@@ -111,7 +111,9 @@ static inline bool useLjCombRule(const cu_nbparam_t* nbparam)
     and the table GPU array. If called with an already allocated table,
     it just re-uploads the table.
  */
-static void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables, cu_nbparam_t* nbp)
+static void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
+                                           cu_nbparam_t*                nbp,
+                                           const DeviceContext&         deviceContext)
 {
     if (nbp->coulomb_tab != nullptr)
     {
@@ -119,8 +121,8 @@ static void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
     }
 
     nbp->coulomb_tab_scale = tables.scale;
-    initParamLookupTable(nbp->coulomb_tab, nbp->coulomb_tab_texobj, tables.tableF.data(),
-                         tables.tableF.size());
+    initParamLookupTable(&nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(),
+                         tables.tableF.size(), deviceContext);
 }
 
 
@@ -233,7 +235,8 @@ static void set_cutoff_parameters(cu_nbparam_t* nbp, const interaction_const_t*
 static void init_nbparam(cu_nbparam_t*                   nbp,
                          const interaction_const_t*      ic,
                          const PairlistParams&           listParams,
-                         const nbnxn_atomdata_t::Params& nbatParams)
+                         const nbnxn_atomdata_t::Params& nbatParams,
+                         const DeviceContext&            deviceContext)
 {
     int ntypes;
 
@@ -319,19 +322,21 @@ static void init_nbparam(cu_nbparam_t*                   nbp,
     if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
     {
         GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
-        init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp);
+        init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, deviceContext);
     }
 
     /* set up LJ parameter lookup table */
     if (!useLjCombRule(nbp))
     {
-        initParamLookupTable(nbp->nbfp, nbp->nbfp_texobj, nbatParams.nbfp.data(), 2 * ntypes * ntypes);
+        initParamLookupTable(&nbp->nbfp, &nbp->nbfp_texobj, nbatParams.nbfp.data(),
+                             2 * ntypes * ntypes, deviceContext);
     }
 
     /* set up LJ-PME parameter lookup table */
     if (ic->vdwtype == evdwPME)
     {
-        initParamLookupTable(nbp->nbfp_comb, nbp->nbfp_comb_texobj, nbatParams.nbfp_comb.data(), 2 * ntypes);
+        initParamLookupTable(&nbp->nbfp_comb, &nbp->nbfp_comb_texobj, nbatParams.nbfp_comb.data(),
+                             2 * ntypes, deviceContext);
     }
 }
 
@@ -343,6 +348,7 @@ void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interacti
     {
         return;
     }
+    NbnxmGpu*     nb  = nbv->gpu_nbv;
     cu_nbparam_t* nbp = nbv->gpu_nbv->nbparam;
 
     set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
@@ -350,7 +356,7 @@ void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interacti
     nbp->eeltype = pick_ewald_kernel_type(*ic);
 
     GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
-    init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp);
+    init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
 }
 
 /*! Initializes the pair list data structure. */
@@ -407,7 +413,7 @@ static void cuda_init_const(NbnxmGpu*                       nb,
                             const nbnxn_atomdata_t::Params& nbatParams)
 {
     init_atomdata_first(nb->atdat, nbatParams.numTypes);
-    init_nbparam(nb->nbparam, ic, listParams, nbatParams);
+    init_nbparam(nb->nbparam, ic, listParams, nbatParams, *nb->deviceContext_);
 
     /* clear energy and shift force outputs */
     nbnxn_cuda_clear_e_fshift(nb);
@@ -697,7 +703,7 @@ static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t* nbparam)
 {
     if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
     {
-        destroyParamLookupTable(nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
+        destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
     }
 }
 
@@ -726,12 +732,12 @@ void gpu_free(NbnxmGpu* nb)
 
     if (!useLjCombRule(nb->nbparam))
     {
-        destroyParamLookupTable(nbparam->nbfp, nbparam->nbfp_texobj);
+        destroyParamLookupTable(&nbparam->nbfp, nbparam->nbfp_texobj);
     }
 
     if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB)
     {
-        destroyParamLookupTable(nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
+        destroyParamLookupTable(&nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
     }
 
     stat = cudaFree(atdat->shift_vec);
index bc913e0e24d6f2e45bf3e8b89c791f4aae1d7642..4f1e9fc65c75e737684dc293bdf7e3bc261a761c 100644 (file)
@@ -118,35 +118,16 @@ static void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                            cl_nbparam_t*                nbp,
                                            const DeviceContext&         deviceContext)
 {
-    cl_mem coul_tab;
-
-    cl_int cl_error;
-
     if (nbp->coulomb_tab_climg2d != nullptr)
     {
         freeDeviceBuffer(&(nbp->coulomb_tab_climg2d));
     }
 
-    /* Switched from using textures to using buffers */
-    // TODO: decide which alternative is most efficient - textures or buffers.
-    /*
-       cl_image_format array_format;
-
-       array_format.image_channel_data_type = CL_FLOAT;
-       array_format.image_channel_order     = CL_R;
-
-       coul_tab = clCreateImage2D(deviceContext.context(), CL_MEM_READ_WRITE |
-       CL_MEM_COPY_HOST_PTR, &array_format, tabsize, 1, 0, ftmp, &cl_error);
-     */
+    DeviceBuffer<real> coulomb_tab;
 
-    coul_tab = clCreateBuffer(deviceContext.context(),
-                              CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
-                              tables.tableF.size() * sizeof(cl_float),
-                              const_cast<real*>(tables.tableF.data()), &cl_error);
-    GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
-                       ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+    initParamLookupTable(&coulomb_tab, nullptr, tables.tableF.data(), tables.tableF.size(), deviceContext);
 
-    nbp->coulomb_tab_climg2d = coul_tab;
+    nbp->coulomb_tab_climg2d = coulomb_tab;
     nbp->coulomb_tab_scale   = tables.scale;
 }
 
@@ -320,22 +301,7 @@ static void init_nbparam(cl_nbparam_t*                   nbp,
         init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, deviceContext);
     }
     else
-    // TODO: improvement needed.
-    // The image2d is created here even if eeltype is not eelCuEWALD_TAB or eelCuEWALD_TAB_TWIN
-    // because the OpenCL kernels don't accept nullptr values for image2D parameters.
     {
-        /* Switched from using textures to using buffers */
-        // TODO: decide which alternative is most efficient - textures or buffers.
-        /*
-           cl_image_format array_format;
-
-           array_format.image_channel_data_type = CL_FLOAT;
-           array_format.image_channel_order     = CL_R;
-
-           nbp->coulomb_tab_climg2d = clCreateImage2D(deviceContext.context(),
-           CL_MEM_READ_WRITE, &array_format, 1, 1, 0, nullptr, &cl_error);
-         */
-
         nbp->coulomb_tab_climg2d = clCreateBuffer(deviceContext.context(), CL_MEM_READ_ONLY,
                                                   sizeof(cl_float), nullptr, &cl_error);
         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
@@ -346,51 +312,16 @@ static void init_nbparam(cl_nbparam_t*                   nbp,
     const int nnbfp_comb = 2 * nbatParams.numTypes;
 
     {
-        /* Switched from using textures to using buffers */
-        // TODO: decide which alternative is most efficient - textures or buffers.
-        /*
-           cl_image_format array_format;
-
-           array_format.image_channel_data_type = CL_FLOAT;
-           array_format.image_channel_order     = CL_R;
-
-           nbp->nbfp_climg2d = clCreateImage2D(deviceContext.context(), CL_MEM_READ_ONLY |
-           CL_MEM_COPY_HOST_PTR, &array_format, nnbfp, 1, 0, nbat->nbfp, &cl_error);
-         */
-
-        nbp->nbfp_climg2d = clCreateBuffer(
-                deviceContext.context(), CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
-                nnbfp * sizeof(cl_float), const_cast<float*>(nbatParams.nbfp.data()), &cl_error);
-        GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
-                           ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+        /* set up LJ parameter lookup table */
+        DeviceBuffer<real> nbfp;
+        initParamLookupTable(&nbfp, nullptr, nbatParams.nbfp.data(), nnbfp, deviceContext);
+        nbp->nbfp_climg2d = nbfp;
 
         if (ic->vdwtype == evdwPME)
         {
-            /* Switched from using textures to using buffers */
-            // TODO: decide which alternative is most efficient - textures or buffers.
-            /*  nbp->nbfp_comb_climg2d = clCreateImage2D(deviceContext.context(), CL_MEM_READ_WRITE |
-               CL_MEM_COPY_HOST_PTR, &array_format, nnbfp_comb, 1, 0, nbat->nbfp_comb, &cl_error);*/
-            nbp->nbfp_comb_climg2d =
-                    clCreateBuffer(deviceContext.context(),
-                                   CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
-                                   nnbfp_comb * sizeof(cl_float),
-                                   const_cast<float*>(nbatParams.nbfp_comb.data()), &cl_error);
-            GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
-                               ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
-        }
-        else
-        {
-            // TODO: improvement needed.
-            // The image2d is created here even if vdwtype is not evdwPME because the OpenCL kernels
-            // don't accept nullptr values for image2D parameters.
-            /* Switched from using textures to using buffers */
-            // TODO: decide which alternative is most efficient - textures or buffers.
-            /* nbp->nbfp_comb_climg2d = clCreateImage2D(deviceContext.context(),
-               CL_MEM_READ_WRITE, &array_format, 1, 1, 0, nullptr, &cl_error);*/
-            nbp->nbfp_comb_climg2d = clCreateBuffer(deviceContext.context(), CL_MEM_READ_ONLY,
-                                                    sizeof(cl_float), nullptr, &cl_error);
-            GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
-                               ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+            DeviceBuffer<float> nbfp_comb;
+            initParamLookupTable(&nbfp_comb, nullptr, nbatParams.nbfp_comb.data(), nnbfp_comb, deviceContext);
+            nbp->nbfp_comb_climg2d = nbfp_comb;
         }
     }
 }