Replace *_free_buffered calls by freeDeviceBuffer
authorAleksei Iupinov <a.yupinov@gmail.com>
Tue, 13 Feb 2018 16:49:40 +0000 (17:49 +0100)
committerSzilárd Páll <pall.szilard@gmail.com>
Sun, 18 Feb 2018 23:02:23 +0000 (00:02 +0100)
Change-Id: Iaabd4b6655615294943254a29a65507af7567ade

src/gromacs/ewald/pme.cu
src/gromacs/ewald/pme.cuh
src/gromacs/gpu_utils/cudautils.cu
src/gromacs/gpu_utils/cudautils.cuh
src/gromacs/gpu_utils/oclutils.h
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp

index bbc2bbaacc2e76b9abf2b4556377fa81bdfed4c7..1abc07aa0ecc25d38869b31cd59741d0be296e42 100644 (file)
@@ -133,8 +133,7 @@ void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *pmeGpu)
 void pme_gpu_free_bspline_values(const PmeGpu *pmeGpu)
 {
     pfree(pmeGpu->staging.h_splineModuli);
-    cu_free_buffered(pmeGpu->kernelParams->grid.d_splineModuli, &pmeGpu->archSpecific->splineValuesSize,
-                     &pmeGpu->archSpecific->splineValuesSizeAlloc);
+    freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli);
 }
 
 void pme_gpu_realloc_forces(PmeGpu *pmeGpu)
@@ -149,7 +148,7 @@ void pme_gpu_realloc_forces(PmeGpu *pmeGpu)
 
 void pme_gpu_free_forces(const PmeGpu *pmeGpu)
 {
-    cu_free_buffered(pmeGpu->kernelParams->atoms.d_forces, &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc);
+    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces);
 }
 
 void pme_gpu_copy_input_forces(PmeGpu *pmeGpu)
@@ -198,7 +197,7 @@ void pme_gpu_copy_input_coordinates(const PmeGpu *pmeGpu, const rvec *h_coordina
 
 void pme_gpu_free_coordinates(const PmeGpu *pmeGpu)
 {
-    cu_free_buffered(pmeGpu->kernelParams->atoms.d_coordinates, &pmeGpu->archSpecific->coordinatesSize, &pmeGpu->archSpecific->coordinatesSizeAlloc);
+    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates);
 }
 
 void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu *pmeGpu, const float *h_coefficients)
@@ -225,7 +224,7 @@ void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu *pmeGpu, const flo
 
 void pme_gpu_free_coefficients(const PmeGpu *pmeGpu)
 {
-    cu_free_buffered(pmeGpu->kernelParams->atoms.d_coefficients, &pmeGpu->archSpecific->coefficientsSize, &pmeGpu->archSpecific->coefficientsSizeAlloc);
+    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients);
 }
 
 void pme_gpu_realloc_spline_data(const PmeGpu *pmeGpu)
@@ -256,8 +255,8 @@ void pme_gpu_realloc_spline_data(const PmeGpu *pmeGpu)
 void pme_gpu_free_spline_data(const PmeGpu *pmeGpu)
 {
     /* Two arrays of the same size */
-    cu_free_buffered(pmeGpu->kernelParams->atoms.d_theta);
-    cu_free_buffered(pmeGpu->kernelParams->atoms.d_dtheta, &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc);
+    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta);
+    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta);
     pfree(pmeGpu->staging.h_theta);
     pfree(pmeGpu->staging.h_dtheta);
 }
@@ -274,7 +273,7 @@ void pme_gpu_realloc_grid_indices(const PmeGpu *pmeGpu)
 
 void pme_gpu_free_grid_indices(const PmeGpu *pmeGpu)
 {
-    cu_free_buffered(pmeGpu->kernelParams->atoms.d_gridlineIndices, &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc);
+    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices);
     pfree(pmeGpu->staging.h_gridlineIndices);
 }
 
@@ -315,10 +314,9 @@ void pme_gpu_free_grids(const PmeGpu *pmeGpu)
 {
     if (pmeGpu->archSpecific->performOutOfPlaceFFT)
     {
-        cu_free_buffered(pmeGpu->kernelParams->grid.d_fourierGrid);
+        freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_fourierGrid);
     }
-    cu_free_buffered(pmeGpu->kernelParams->grid.d_realGrid,
-                     &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc);
+    freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid);
 }
 
 void pme_gpu_clear_grids(const PmeGpu *pmeGpu)
index e58c3de4957dd5416027746c46875ce31ac9facf..0f37af38d6c4678e0128f4d2a464a7f29b8ad2cf 100644 (file)
@@ -186,8 +186,8 @@ struct PmeGpuCuda
     /* GPU arrays element counts (not the arrays sizes in bytes!).
      * They might be larger than the actual meaningful data sizes.
      * These are paired: the actual element count + the maximum element count that can fit in the current allocated memory.
-     * These integer pairs are mostly meaningful for the cu_realloc/free_buffered calls.
-     * As such, if cu_realloc/free_buffered is refactored, they can be freely changed, too.
+     * These integer pairs are mostly meaningful for the cu_realloc_buffered calls.
+     * As such, if cu_realloc_buffered is refactored, they can be freely changed, too.
      * The only exceptions are realGridSize and complexGridSize which are also used for grid clearing/copying.
      * TODO: these should live in a clean buffered container type, and be refactored in the NB/cudautils as well.
      */
index 53e204a2ab111cc8313d4572b5e79f9837e35d04..e466ef97290d6062a1ace68ac0e50abad77f0a22 100644 (file)
@@ -137,30 +137,6 @@ int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s
 
 /**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
 
-/*!
- * If the pointers to the size variables are NULL no resetting happens.
- */
-void cu_free_buffered(void *d_ptr, int *n, int *nalloc)
-{
-    cudaError_t stat;
-
-    if (d_ptr)
-    {
-        stat = cudaFree(d_ptr);
-        CU_RET_ERR(stat, "cudaFree failed");
-    }
-
-    if (n)
-    {
-        *n = -1;
-    }
-
-    if (nalloc)
-    {
-        *nalloc = -1;
-    }
-}
-
 /*!
  *  Reallocation of the memory pointed by d_ptr and copying of the data from
  *  the location pointed by h_src host-side pointer is done. Allocation is
@@ -190,13 +166,13 @@ void cu_realloc_buffered(void **d_dest, void *h_src,
         /* only free if the array has already been initialized */
         if (*curr_alloc_size >= 0)
         {
-            cu_free_buffered(*d_dest, curr_size, curr_alloc_size);
+            freeDeviceBuffer(d_dest);
         }
 
         *curr_alloc_size = over_alloc_large(req_size);
 
         stat = cudaMalloc(d_dest, *curr_alloc_size * type_size);
-        CU_RET_ERR(stat, "cudaMalloc failed in cu_free_buffered");
+        CU_RET_ERR(stat, "cudaMalloc failed in cu_realloc_buffered");
     }
 
     /* size could have changed without actual reallocation */
index d74ecd4d8dea253fe7c4663428294a5890a396bc..a91df33e2d7bdf310cb0885a3c7acd38afafc50d 100644 (file)
@@ -202,9 +202,6 @@ int cu_copy_H2D_sync(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/);
 /*! Launches asynchronous host to device memory copy in stream s. */
 int cu_copy_H2D_async(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/);
 
-/*! Frees device memory and resets the size and allocation size to -1. */
-void cu_free_buffered(void *d_ptr, int *n = NULL, int *nalloc = NULL);
-
 /*! Reallocates the device memory and copies data from the host. */
 void cu_realloc_buffered(void **d_dest, void *h_src,
                          size_t type_size,
@@ -298,7 +295,9 @@ static inline bool haveStreamTasksCompleted(cudaStream_t s)
 }
 
 /*! \brief Free a device-side buffer.
- * TODO: fully replace cu_free_buffered with this.
+ * This does not reset separately stored size/capacity integers,
+ * as this is planned to be a destructor of DeviceBuffer as a proper class,
+ * and no calls on \p buffer should be made afterwards.
  *
  * \param[in] buffer  Pointer to the buffer to free.
  */
index e4bc91b320430a01089f9eb854320126d8a92ee4..093fe31e19f61fc708bf595278742209955c969b 100644 (file)
@@ -178,7 +178,9 @@ static inline bool haveStreamTasksCompleted(cl_command_queue gmx_unused s)
 }
 
 /*! \brief Free a device-side buffer.
- * TODO: fully replace free_ocl_buffer and ocl_free_buffered with this.
+ * This does not reset separately stored size/capacity integers,
+ * as this is planned to be a destructor of DeviceBuffer as a proper class,
+ * and no calls on \p buffer should be made afterwards.
  *
  * \param[in] buffer  Pointer to the buffer to free.
  */
index ea5375667d0c3783b65f48804d36937538572fcb..ea9b3b8fe4e616059cfe05c793f7c6a8167d9f84 100644 (file)
@@ -668,10 +668,10 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t              *nb,
         /* free up first if the arrays have already been initialized */
         if (d_atdat->nalloc != -1)
         {
-            cu_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
-            cu_free_buffered(d_atdat->xq);
-            cu_free_buffered(d_atdat->atom_types);
-            cu_free_buffered(d_atdat->lj_comb);
+            freeDeviceBuffer(&d_atdat->f);
+            freeDeviceBuffer(&d_atdat->xq);
+            freeDeviceBuffer(&d_atdat->atom_types);
+            freeDeviceBuffer(&d_atdat->lj_comb);
         }
 
         stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f));
@@ -784,10 +784,10 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
     stat = cudaFree(atdat->e_el);
     CU_RET_ERR(stat, "cudaFree failed on atdat->e_el");
 
-    cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc);
-    cu_free_buffered(atdat->xq);
-    cu_free_buffered(atdat->atom_types, &atdat->ntypes);
-    cu_free_buffered(atdat->lj_comb);
+    freeDeviceBuffer(&atdat->f);
+    freeDeviceBuffer(&atdat->xq);
+    freeDeviceBuffer(&atdat->atom_types);
+    freeDeviceBuffer(&atdat->lj_comb);
 
     /* Free plist */
     auto *plist = nb->plist[eintLocal];
index 189c06944f064df9659a349f20d12e2088594198..2489efa85b9260aef45989e25b95f65da58c8759 100644 (file)
@@ -94,32 +94,6 @@ bool useLjCombRule(int vdwType)
             vdwType == evdwOclCUTCOMBLB);
 }
 
-/*! \brief Free device buffers
- *
- * If the pointers to the size variables are NULL no resetting happens.
- */
-static void ocl_free_buffered(cl_mem d_ptr, int *n, int *nalloc)
-{
-    cl_int gmx_unused cl_error;
-
-    if (d_ptr)
-    {
-        cl_error = clReleaseMemObject(d_ptr);
-        assert(cl_error == CL_SUCCESS);
-        // TODO: handle errors
-    }
-
-    if (n)
-    {
-        *n = -1;
-    }
-
-    if (nalloc)
-    {
-        *nalloc = -1;
-    }
-}
-
 /*! \brief Reallocation device buffers
  *
  *  Reallocation of the memory pointed by d_ptr and copying of the data from
@@ -156,7 +130,7 @@ static void ocl_realloc_buffered(cl_mem *d_dest, void *h_src,
         /* only free if the array has already been initialized */
         if (*curr_alloc_size >= 0)
         {
-            ocl_free_buffered(*d_dest, curr_size, curr_alloc_size);
+            freeDeviceBuffer(d_dest);
         }
 
         *curr_alloc_size = over_alloc_large(req_size);
@@ -183,21 +157,6 @@ static void ocl_realloc_buffered(cl_mem *d_dest, void *h_src,
     }
 }
 
-/*! \brief Releases the input OpenCL buffer */
-static void free_ocl_buffer(cl_mem *buffer)
-{
-    cl_int gmx_unused cl_error;
-
-    assert(NULL != buffer);
-
-    if (*buffer)
-    {
-        cl_error = clReleaseMemObject(*buffer);
-        assert(CL_SUCCESS == cl_error);
-        *buffer = NULL;
-    }
-}
-
 /*! \brief Tabulates the Ewald Coulomb force and initializes the size/scale
  * and the table GPU array.
  *
@@ -214,7 +173,7 @@ static void init_ewald_coulomb_force_table(const interaction_const_t       *ic,
 
     if (nbp->coulomb_tab_climg2d != NULL)
     {
-        free_ocl_buffer(&(nbp->coulomb_tab_climg2d));
+        freeDeviceBuffer(&(nbp->coulomb_tab_climg2d));
     }
 
     /* Switched from using textures to using buffers */
@@ -1010,10 +969,10 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t               *nb,
         /* free up first if the arrays have already been initialized */
         if (d_atdat->nalloc != -1)
         {
-            ocl_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
-            ocl_free_buffered(d_atdat->xq, NULL, NULL);
-            ocl_free_buffered(d_atdat->lj_comb, NULL, NULL);
-            ocl_free_buffered(d_atdat->atom_types, NULL, NULL);
+            freeDeviceBuffer(&d_atdat->f);
+            freeDeviceBuffer(&d_atdat->xq);
+            freeDeviceBuffer(&d_atdat->lj_comb);
+            freeDeviceBuffer(&d_atdat->atom_types);
         }
 
         d_atdat->f_elem_size = sizeof(rvec);
@@ -1163,20 +1122,20 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
     free_kernel(&(nb->kernel_zero_e_fshift));
 
     /* Free atdat */
-    free_ocl_buffer(&(nb->atdat->xq));
-    free_ocl_buffer(&(nb->atdat->f));
-    free_ocl_buffer(&(nb->atdat->e_lj));
-    free_ocl_buffer(&(nb->atdat->e_el));
-    free_ocl_buffer(&(nb->atdat->fshift));
-    free_ocl_buffer(&(nb->atdat->lj_comb));
-    free_ocl_buffer(&(nb->atdat->atom_types));
-    free_ocl_buffer(&(nb->atdat->shift_vec));
+    freeDeviceBuffer(&(nb->atdat->xq));
+    freeDeviceBuffer(&(nb->atdat->f));
+    freeDeviceBuffer(&(nb->atdat->e_lj));
+    freeDeviceBuffer(&(nb->atdat->e_el));
+    freeDeviceBuffer(&(nb->atdat->fshift));
+    freeDeviceBuffer(&(nb->atdat->lj_comb));
+    freeDeviceBuffer(&(nb->atdat->atom_types));
+    freeDeviceBuffer(&(nb->atdat->shift_vec));
     sfree(nb->atdat);
 
     /* Free nbparam */
-    free_ocl_buffer(&(nb->nbparam->nbfp_climg2d));
-    free_ocl_buffer(&(nb->nbparam->nbfp_comb_climg2d));
-    free_ocl_buffer(&(nb->nbparam->coulomb_tab_climg2d));
+    freeDeviceBuffer(&(nb->nbparam->nbfp_climg2d));
+    freeDeviceBuffer(&(nb->nbparam->nbfp_comb_climg2d));
+    freeDeviceBuffer(&(nb->nbparam->coulomb_tab_climg2d));
     sfree(nb->nbparam);
 
     /* Free plist */
@@ -1207,7 +1166,7 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
     nb->nbst.fshift = NULL;
 
     /* Free debug buffer */
-    free_ocl_buffer(&nb->debug_buffer);
+    freeDeviceBuffer(&nb->debug_buffer);
 
     /* Free command queues */
     clReleaseCommandQueue(nb->stream[eintLocal]);