From ec8d539762b3c32d9ee6b8d2fc64010fb2f3141e Mon Sep 17 00:00:00 2001 From: Aleksei Iupinov Date: Tue, 13 Feb 2018 17:49:40 +0100 Subject: [PATCH] Replace *_free_buffered calls by freeDeviceBuffer Change-Id: Iaabd4b6655615294943254a29a65507af7567ade --- src/gromacs/ewald/pme.cu | 20 +++-- src/gromacs/ewald/pme.cuh | 4 +- src/gromacs/gpu_utils/cudautils.cu | 28 +------ src/gromacs/gpu_utils/cudautils.cuh | 7 +- src/gromacs/gpu_utils/oclutils.h | 4 +- .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu | 16 ++-- .../mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp | 77 +++++-------------- 7 files changed, 45 insertions(+), 111 deletions(-) diff --git a/src/gromacs/ewald/pme.cu b/src/gromacs/ewald/pme.cu index bbc2bbaacc..1abc07aa0e 100644 --- a/src/gromacs/ewald/pme.cu +++ b/src/gromacs/ewald/pme.cu @@ -133,8 +133,7 @@ void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *pmeGpu) void pme_gpu_free_bspline_values(const PmeGpu *pmeGpu) { pfree(pmeGpu->staging.h_splineModuli); - cu_free_buffered(pmeGpu->kernelParams->grid.d_splineModuli, &pmeGpu->archSpecific->splineValuesSize, - &pmeGpu->archSpecific->splineValuesSizeAlloc); + freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli); } void pme_gpu_realloc_forces(PmeGpu *pmeGpu) @@ -149,7 +148,7 @@ void pme_gpu_realloc_forces(PmeGpu *pmeGpu) void pme_gpu_free_forces(const PmeGpu *pmeGpu) { - cu_free_buffered(pmeGpu->kernelParams->atoms.d_forces, &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc); + freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces); } void pme_gpu_copy_input_forces(PmeGpu *pmeGpu) @@ -198,7 +197,7 @@ void pme_gpu_copy_input_coordinates(const PmeGpu *pmeGpu, const rvec *h_coordina void pme_gpu_free_coordinates(const PmeGpu *pmeGpu) { - cu_free_buffered(pmeGpu->kernelParams->atoms.d_coordinates, &pmeGpu->archSpecific->coordinatesSize, &pmeGpu->archSpecific->coordinatesSizeAlloc); + freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates); } void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu *pmeGpu, const float *h_coefficients) @@ -225,7 +224,7 @@ void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu *pmeGpu, const flo void pme_gpu_free_coefficients(const PmeGpu *pmeGpu) { - cu_free_buffered(pmeGpu->kernelParams->atoms.d_coefficients, &pmeGpu->archSpecific->coefficientsSize, &pmeGpu->archSpecific->coefficientsSizeAlloc); + freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients); } void pme_gpu_realloc_spline_data(const PmeGpu *pmeGpu) @@ -256,8 +255,8 @@ void pme_gpu_realloc_spline_data(const PmeGpu *pmeGpu) void pme_gpu_free_spline_data(const PmeGpu *pmeGpu) { /* Two arrays of the same size */ - cu_free_buffered(pmeGpu->kernelParams->atoms.d_theta); - cu_free_buffered(pmeGpu->kernelParams->atoms.d_dtheta, &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc); + freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta); + freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta); pfree(pmeGpu->staging.h_theta); pfree(pmeGpu->staging.h_dtheta); } @@ -274,7 +273,7 @@ void pme_gpu_realloc_grid_indices(const PmeGpu *pmeGpu) void pme_gpu_free_grid_indices(const PmeGpu *pmeGpu) { - cu_free_buffered(pmeGpu->kernelParams->atoms.d_gridlineIndices, &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc); + freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices); pfree(pmeGpu->staging.h_gridlineIndices); } @@ -315,10 +314,9 @@ void pme_gpu_free_grids(const PmeGpu *pmeGpu) { if (pmeGpu->archSpecific->performOutOfPlaceFFT) { - cu_free_buffered(pmeGpu->kernelParams->grid.d_fourierGrid); + freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_fourierGrid); } - cu_free_buffered(pmeGpu->kernelParams->grid.d_realGrid, - &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc); + freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid); } void pme_gpu_clear_grids(const PmeGpu *pmeGpu) diff --git a/src/gromacs/ewald/pme.cuh b/src/gromacs/ewald/pme.cuh index e58c3de495..0f37af38d6 100644 --- a/src/gromacs/ewald/pme.cuh +++ b/src/gromacs/ewald/pme.cuh @@ -186,8 +186,8 @@ struct PmeGpuCuda /* GPU arrays element counts (not the arrays sizes in bytes!). * They might be larger than the actual meaningful data sizes. * These are paired: the actual element count + the maximum element count that can fit in the current allocated memory. - * These integer pairs are mostly meaningful for the cu_realloc/free_buffered calls. - * As such, if cu_realloc/free_buffered is refactored, they can be freely changed, too. + * These integer pairs are mostly meaningful for the cu_realloc_buffered calls. + * As such, if cu_realloc_buffered is refactored, they can be freely changed, too. * The only exceptions are realGridSize and complexGridSize which are also used for grid clearing/copying. * TODO: these should live in a clean buffered container type, and be refactored in the NB/cudautils as well. */ diff --git a/src/gromacs/gpu_utils/cudautils.cu b/src/gromacs/gpu_utils/cudautils.cu index 53e204a2ab..e466ef9729 100644 --- a/src/gromacs/gpu_utils/cudautils.cu +++ b/src/gromacs/gpu_utils/cudautils.cu @@ -137,30 +137,6 @@ int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s /**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/ -/*! - * If the pointers to the size variables are NULL no resetting happens. - */ -void cu_free_buffered(void *d_ptr, int *n, int *nalloc) -{ - cudaError_t stat; - - if (d_ptr) - { - stat = cudaFree(d_ptr); - CU_RET_ERR(stat, "cudaFree failed"); - } - - if (n) - { - *n = -1; - } - - if (nalloc) - { - *nalloc = -1; - } -} - /*! * Reallocation of the memory pointed by d_ptr and copying of the data from * the location pointed by h_src host-side pointer is done. Allocation is @@ -190,13 +166,13 @@ void cu_realloc_buffered(void **d_dest, void *h_src, /* only free if the array has already been initialized */ if (*curr_alloc_size >= 0) { - cu_free_buffered(*d_dest, curr_size, curr_alloc_size); + freeDeviceBuffer(d_dest); } *curr_alloc_size = over_alloc_large(req_size); stat = cudaMalloc(d_dest, *curr_alloc_size * type_size); - CU_RET_ERR(stat, "cudaMalloc failed in cu_free_buffered"); + CU_RET_ERR(stat, "cudaMalloc failed in cu_realloc_buffered"); } /* size could have changed without actual reallocation */ diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh index d74ecd4d8d..a91df33e2d 100644 --- a/src/gromacs/gpu_utils/cudautils.cuh +++ b/src/gromacs/gpu_utils/cudautils.cuh @@ -202,9 +202,6 @@ int cu_copy_H2D_sync(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/); /*! Launches asynchronous host to device memory copy in stream s. */ int cu_copy_H2D_async(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/); -/*! Frees device memory and resets the size and allocation size to -1. */ -void cu_free_buffered(void *d_ptr, int *n = NULL, int *nalloc = NULL); - /*! Reallocates the device memory and copies data from the host. */ void cu_realloc_buffered(void **d_dest, void *h_src, size_t type_size, @@ -298,7 +295,9 @@ static inline bool haveStreamTasksCompleted(cudaStream_t s) } /*! \brief Free a device-side buffer. - * TODO: fully replace cu_free_buffered with this. + * This does not reset separately stored size/capacity integers, + * as this is planned to be a destructor of DeviceBuffer as a proper class, + * and no calls on \p buffer should be made afterwards. * * \param[in] buffer Pointer to the buffer to free. */ diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h index e4bc91b320..093fe31e19 100644 --- a/src/gromacs/gpu_utils/oclutils.h +++ b/src/gromacs/gpu_utils/oclutils.h @@ -178,7 +178,9 @@ static inline bool haveStreamTasksCompleted(cl_command_queue gmx_unused s) } /*! \brief Free a device-side buffer. - * TODO: fully replace free_ocl_buffer and ocl_free_buffered with this. + * This does not reset separately stored size/capacity integers, + * as this is planned to be a destructor of DeviceBuffer as a proper class, + * and no calls on \p buffer should be made afterwards. * * \param[in] buffer Pointer to the buffer to free. */ diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index ea5375667d..ea9b3b8fe4 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -668,10 +668,10 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb, /* free up first if the arrays have already been initialized */ if (d_atdat->nalloc != -1) { - cu_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc); - cu_free_buffered(d_atdat->xq); - cu_free_buffered(d_atdat->atom_types); - cu_free_buffered(d_atdat->lj_comb); + freeDeviceBuffer(&d_atdat->f); + freeDeviceBuffer(&d_atdat->xq); + freeDeviceBuffer(&d_atdat->atom_types); + freeDeviceBuffer(&d_atdat->lj_comb); } stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f)); @@ -784,10 +784,10 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) stat = cudaFree(atdat->e_el); CU_RET_ERR(stat, "cudaFree failed on atdat->e_el"); - cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc); - cu_free_buffered(atdat->xq); - cu_free_buffered(atdat->atom_types, &atdat->ntypes); - cu_free_buffered(atdat->lj_comb); + freeDeviceBuffer(&atdat->f); + freeDeviceBuffer(&atdat->xq); + freeDeviceBuffer(&atdat->atom_types); + freeDeviceBuffer(&atdat->lj_comb); /* Free plist */ auto *plist = nb->plist[eintLocal]; diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp index 189c06944f..2489efa85b 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp @@ -94,32 +94,6 @@ bool useLjCombRule(int vdwType) vdwType == evdwOclCUTCOMBLB); } -/*! \brief Free device buffers - * - * If the pointers to the size variables are NULL no resetting happens. - */ -static void ocl_free_buffered(cl_mem d_ptr, int *n, int *nalloc) -{ - cl_int gmx_unused cl_error; - - if (d_ptr) - { - cl_error = clReleaseMemObject(d_ptr); - assert(cl_error == CL_SUCCESS); - // TODO: handle errors - } - - if (n) - { - *n = -1; - } - - if (nalloc) - { - *nalloc = -1; - } -} - /*! \brief Reallocation device buffers * * Reallocation of the memory pointed by d_ptr and copying of the data from @@ -156,7 +130,7 @@ static void ocl_realloc_buffered(cl_mem *d_dest, void *h_src, /* only free if the array has already been initialized */ if (*curr_alloc_size >= 0) { - ocl_free_buffered(*d_dest, curr_size, curr_alloc_size); + freeDeviceBuffer(d_dest); } *curr_alloc_size = over_alloc_large(req_size); @@ -183,21 +157,6 @@ static void ocl_realloc_buffered(cl_mem *d_dest, void *h_src, } } -/*! \brief Releases the input OpenCL buffer */ -static void free_ocl_buffer(cl_mem *buffer) -{ - cl_int gmx_unused cl_error; - - assert(NULL != buffer); - - if (*buffer) - { - cl_error = clReleaseMemObject(*buffer); - assert(CL_SUCCESS == cl_error); - *buffer = NULL; - } -} - /*! \brief Tabulates the Ewald Coulomb force and initializes the size/scale * and the table GPU array. * @@ -214,7 +173,7 @@ static void init_ewald_coulomb_force_table(const interaction_const_t *ic, if (nbp->coulomb_tab_climg2d != NULL) { - free_ocl_buffer(&(nbp->coulomb_tab_climg2d)); + freeDeviceBuffer(&(nbp->coulomb_tab_climg2d)); } /* Switched from using textures to using buffers */ @@ -1010,10 +969,10 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t *nb, /* free up first if the arrays have already been initialized */ if (d_atdat->nalloc != -1) { - ocl_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc); - ocl_free_buffered(d_atdat->xq, NULL, NULL); - ocl_free_buffered(d_atdat->lj_comb, NULL, NULL); - ocl_free_buffered(d_atdat->atom_types, NULL, NULL); + freeDeviceBuffer(&d_atdat->f); + freeDeviceBuffer(&d_atdat->xq); + freeDeviceBuffer(&d_atdat->lj_comb); + freeDeviceBuffer(&d_atdat->atom_types); } d_atdat->f_elem_size = sizeof(rvec); @@ -1163,20 +1122,20 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb) free_kernel(&(nb->kernel_zero_e_fshift)); /* Free atdat */ - free_ocl_buffer(&(nb->atdat->xq)); - free_ocl_buffer(&(nb->atdat->f)); - free_ocl_buffer(&(nb->atdat->e_lj)); - free_ocl_buffer(&(nb->atdat->e_el)); - free_ocl_buffer(&(nb->atdat->fshift)); - free_ocl_buffer(&(nb->atdat->lj_comb)); - free_ocl_buffer(&(nb->atdat->atom_types)); - free_ocl_buffer(&(nb->atdat->shift_vec)); + freeDeviceBuffer(&(nb->atdat->xq)); + freeDeviceBuffer(&(nb->atdat->f)); + freeDeviceBuffer(&(nb->atdat->e_lj)); + freeDeviceBuffer(&(nb->atdat->e_el)); + freeDeviceBuffer(&(nb->atdat->fshift)); + freeDeviceBuffer(&(nb->atdat->lj_comb)); + freeDeviceBuffer(&(nb->atdat->atom_types)); + freeDeviceBuffer(&(nb->atdat->shift_vec)); sfree(nb->atdat); /* Free nbparam */ - free_ocl_buffer(&(nb->nbparam->nbfp_climg2d)); - free_ocl_buffer(&(nb->nbparam->nbfp_comb_climg2d)); - free_ocl_buffer(&(nb->nbparam->coulomb_tab_climg2d)); + freeDeviceBuffer(&(nb->nbparam->nbfp_climg2d)); + freeDeviceBuffer(&(nb->nbparam->nbfp_comb_climg2d)); + freeDeviceBuffer(&(nb->nbparam->coulomb_tab_climg2d)); sfree(nb->nbparam); /* Free plist */ @@ -1207,7 +1166,7 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb) nb->nbst.fshift = NULL; /* Free debug buffer */ - free_ocl_buffer(&nb->debug_buffer); + freeDeviceBuffer(&nb->debug_buffer); /* Free command queues */ clReleaseCommandQueue(nb->stream[eintLocal]); -- 2.22.0