From 049fdc5339e632f49a53e2264768ac93b1bdf48b Mon Sep 17 00:00:00 2001
From: Aleksei Iupinov
Date: Fri, 16 Feb 2018 16:12:51 +0100
Subject: [PATCH] Removed cu_realloc_buffered() in favor of
 reallocateDeviceBuffer()

Change-Id: I5f0d374f8e7e22505265f3cebc60fe29948ab6e9
---
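
The change is mechanical at every call site: reallocateDeviceBuffer() takes
over the paired size/capacity bookkeeping, and the host-to-device upload that
cu_realloc_buffered() used to fuse into the reallocation becomes a separate,
explicit cu_copy_H2D() call with the transfer kind spelled out. Below is a
minimal sketch of the resulting pattern, using only calls that appear in this
patch; the helper uploadFloats() and its parameter names are illustrative and
not part of the change:

    #include "gromacs/gpu_utils/cudautils.cuh"
    #include "gromacs/gpu_utils/devicebuffer.h"

    /* Grow a device buffer of floats to newSize elements and upload data.
     * size/capacity are the paired element counts described in pme.cuh. */
    static void uploadFloats(float **d_data, const float *h_data, int newSize,
                             int *size, int *capacity, cudaStream_t stream)
    {
        /* Reallocates (with over-allocation) only when newSize exceeds
         * *capacity; always sets *size to newSize. */
        reallocateDeviceBuffer(d_data, newSize, size, capacity, stream);
        /* The upload is no longer fused into the reallocation. */
        cu_copy_H2D(*d_data, const_cast<float *>(h_data), newSize * sizeof(float),
                    GpuApiCallBehavior::Async, stream);
    }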
 src/gromacs/ewald/pme.cu            | 44 ++++++++++-----------
 src/gromacs/ewald/pme.cuh           |  4 +-
 src/gromacs/gpu_utils/cudautils.cu  | 59 -----------------------------
 src/gromacs/gpu_utils/cudautils.cuh |  8 ----
 4 files changed, 22 insertions(+), 93 deletions(-)

diff --git a/src/gromacs/ewald/pme.cu b/src/gromacs/ewald/pme.cu
index e1c3ce14ff..ecbad009ce 100644
--- a/src/gromacs/ewald/pme.cu
+++ b/src/gromacs/ewald/pme.cu
@@ -114,8 +114,8 @@ void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *pmeGpu)
         pmeGpu->kernelParams->grid.realGridSize[YY] +
         pmeGpu->kernelParams->grid.realGridSize[ZZ];
     const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize);
-    cu_realloc_buffered((void **)&pmeGpu->kernelParams->grid.d_splineModuli, nullptr, sizeof(float),
-                        &pmeGpu->archSpecific->splineValuesSize, &pmeGpu->archSpecific->splineValuesSizeAlloc, newSplineValuesSize, pmeGpu->archSpecific->pmeStream, true);
+    reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize,
+                           &pmeGpu->archSpecific->splineValuesSize, &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->pmeStream);
     if (shouldRealloc)
     {
         /* Reallocate the host buffer */
@@ -141,8 +141,8 @@ void pme_gpu_realloc_forces(PmeGpu *pmeGpu)
 {
     const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
     GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
-    cu_realloc_buffered((void **)&pmeGpu->kernelParams->atoms.d_forces, nullptr, sizeof(float),
-                        &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc, newForcesSize, pmeGpu->archSpecific->pmeStream, true);
+    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize,
+                           &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->pmeStream);
     pmeGpu->staging.h_forces.reserve(pmeGpu->nAtomsAlloc);
     pmeGpu->staging.h_forces.resize(pmeGpu->kernelParams->atoms.nAtoms);
 }
@@ -170,8 +170,8 @@ void pme_gpu_realloc_coordinates(const PmeGpu *pmeGpu)
 {
     const size_t newCoordinatesSize = pmeGpu->nAtomsAlloc * DIM;
     GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU");
-    cu_realloc_buffered((void **)&pmeGpu->kernelParams->atoms.d_coordinates, nullptr, sizeof(float),
-                        &pmeGpu->archSpecific->coordinatesSize, &pmeGpu->archSpecific->coordinatesSizeAlloc, newCoordinatesSize, pmeGpu->archSpecific->pmeStream, true);
+    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates, newCoordinatesSize,
+                           &pmeGpu->archSpecific->coordinatesSize, &pmeGpu->archSpecific->coordinatesSizeAlloc, pmeGpu->archSpecific->pmeStream);
     if (c_usePadding)
     {
         const size_t paddingIndex = DIM * pmeGpu->kernelParams->atoms.nAtoms;
@@ -206,9 +206,8 @@ void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu *pmeGpu, const flo
     GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
     const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
     GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
-    cu_realloc_buffered((void **)&pmeGpu->kernelParams->atoms.d_coefficients, nullptr, sizeof(float),
-                        &pmeGpu->archSpecific->coefficientsSize, &pmeGpu->archSpecific->coefficientsSizeAlloc,
-                        newCoefficientsSize, pmeGpu->archSpecific->pmeStream, true);
+    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize,
+                           &pmeGpu->archSpecific->coefficientsSize, &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->pmeStream);
     cu_copy_H2D(pmeGpu->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
                 pmeGpu->kernelParams->atoms.nAtoms * sizeof(float), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
     if (c_usePadding)
@@ -239,10 +238,10 @@
     const bool shouldRealloc = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize);
     int currentSizeTemp = pmeGpu->archSpecific->splineDataSize;
     int currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc;
-    cu_realloc_buffered((void **)&pmeGpu->kernelParams->atoms.d_theta, nullptr, sizeof(float),
-                        &currentSizeTemp, &currentSizeTempAlloc, newSplineDataSize, pmeGpu->archSpecific->pmeStream, true);
-    cu_realloc_buffered((void **)&pmeGpu->kernelParams->atoms.d_dtheta, nullptr, sizeof(float),
-                        &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc, newSplineDataSize, pmeGpu->archSpecific->pmeStream, true);
+    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize,
+                           &currentSizeTemp, &currentSizeTempAlloc, pmeGpu->archSpecific->pmeStream);
+    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize,
+                           &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->pmeStream);
     // the host side reallocation
     if (shouldRealloc)
     {
@@ -266,8 +265,8 @@ void pme_gpu_realloc_grid_indices(const PmeGpu *pmeGpu)
 {
     const size_t newIndicesSize = DIM * pmeGpu->nAtomsAlloc;
     GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
-    cu_realloc_buffered((void **)&pmeGpu->kernelParams->atoms.d_gridlineIndices, nullptr, sizeof(int),
-                        &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, newIndicesSize, pmeGpu->archSpecific->pmeStream, true);
+    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
+                           &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->pmeStream);
     pfree(pmeGpu->staging.h_gridlineIndices);
     pmalloc((void **)&pmeGpu->staging.h_gridlineIndices, newIndicesSize * sizeof(int));
 }
@@ -291,20 +290,17 @@ void pme_gpu_realloc_grids(PmeGpu *pmeGpu)
     if (pmeGpu->archSpecific->performOutOfPlaceFFT)
     {
         /* 2 separate grids */
-        cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_fourierGrid, nullptr, sizeof(float),
-                            &pmeGpu->archSpecific->complexGridSize, &pmeGpu->archSpecific->complexGridSizeAlloc,
-                            newComplexGridSize, pmeGpu->archSpecific->pmeStream, true);
-        cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_realGrid, nullptr, sizeof(float),
-                            &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc,
-                            newRealGridSize, pmeGpu->archSpecific->pmeStream, true);
+        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize,
+                               &pmeGpu->archSpecific->complexGridSize, &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
+        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newRealGridSize,
+                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
     }
     else
     {
         /* A single buffer so that any grid will fit */
         const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
-        cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_realGrid, nullptr, sizeof(float),
-                            &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc,
-                            newGridsSize, pmeGpu->archSpecific->pmeStream, true);
+        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newGridsSize,
+                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
         kernelParamsPtr->grid.d_fourierGrid   = kernelParamsPtr->grid.d_realGrid;
         pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize;
         // the size might get used later for copying the grid
diff --git a/src/gromacs/ewald/pme.cuh b/src/gromacs/ewald/pme.cuh
index 4139d46911..a2fd7cbd09 100644
--- a/src/gromacs/ewald/pme.cuh
+++ b/src/gromacs/ewald/pme.cuh
@@ -232,8 +232,8 @@ struct PmeGpuCuda
     /* GPU arrays element counts (not the arrays sizes in bytes!).
      * They might be larger than the actual meaningful data sizes.
      * These are paired: the actual element count + the maximum element count that can fit in the current allocated memory.
-     * These integer pairs are mostly meaningful for the cu_realloc_buffered calls.
-     * As such, if cu_realloc_buffered is refactored, they can be freely changed, too.
+     * These integer pairs are mostly meaningful for the reallocateDeviceBuffer calls.
+     * As such, if DeviceBuffer is refactored into a class, they can be freely changed, too.
      * The only exceptions are realGridSize and complexGridSize which are also used for grid clearing/copying.
      * TODO: these should live in a clean buffered container type, and be refactored in the NB/cudautils as well.
      */
diff --git a/src/gromacs/gpu_utils/cudautils.cu b/src/gromacs/gpu_utils/cudautils.cu
index 347f853929..b66be40f09 100644
--- a/src/gromacs/gpu_utils/cudautils.cu
+++ b/src/gromacs/gpu_utils/cudautils.cu
@@ -41,10 +41,8 @@
 #include
 
 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
-#include "gromacs/gpu_utils/devicebuffer.h" //TODO remove when removing cu_realloc_buffered
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/utility/gmxassert.h"
-#include "gromacs/utility/smalloc.h"
 
 /*** Generic CUDA data operation wrappers ***/
@@ -136,63 +134,6 @@ int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s
     return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Async, s);
 }
 
-/**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
-
-/*!
- *  Reallocation of the memory pointed by d_ptr and copying of the data from
- *  the location pointed by h_src host-side pointer is done. Allocation is
- *  buffered and therefore freeing is only needed if the previously allocated
- *  space is not enough.
- *  The H2D copy is launched in stream s and can be done synchronously or
- *  asynchronously (the default is the latter).
- */
-void cu_realloc_buffered(void **d_dest, void *h_src,
-                         size_t type_size,
-                         int *curr_size, int *curr_alloc_size,
-                         int req_size,
-                         cudaStream_t s,
-                         bool bAsync = true)
-{
-    cudaError_t stat;
-
-    if (d_dest == NULL || req_size < 0)
-    {
-        return;
-    }
-
-    /* reallocate only if the data does not fit = allocation size is smaller
-       than the current requested size */
-    if (req_size > *curr_alloc_size)
-    {
-        /* only free if the array has already been initialized */
-        if (*curr_alloc_size >= 0)
-        {
-            freeDeviceBuffer(d_dest);
-        }
-
-        *curr_alloc_size = over_alloc_large(req_size);
-
-        stat = cudaMalloc(d_dest, *curr_alloc_size * type_size);
-        CU_RET_ERR(stat, "cudaMalloc failed in cu_realloc_buffered");
-    }
-
-    /* size could have changed without actual reallocation */
-    *curr_size = req_size;
-
-    /* upload to device */
-    if (h_src)
-    {
-        if (bAsync)
-        {
-            cu_copy_H2D_async(*d_dest, h_src, *curr_size * type_size, s);
-        }
-        else
-        {
-            cu_copy_H2D_sync(*d_dest, h_src, *curr_size * type_size);
-        }
-    }
-}
-
 /*! \brief Return whether texture objects are used on this device.
  *
  * \param[in] pointer to the GPU device info structure to inspect for texture objects support
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh
index 5a05e3b1fc..945d7bb1d7 100644
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -202,14 +202,6 @@ int cu_copy_H2D_sync(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/);
 
 /*! Launches asynchronous host to device memory copy in stream s. */
 int cu_copy_H2D_async(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/);
 
-/*! Reallocates the device memory and copies data from the host. */
-void cu_realloc_buffered(void **d_dest, void *h_src,
-                         size_t type_size,
-                         int *curr_size, int *curr_alloc_size,
-                         int req_size,
-                         cudaStream_t s,
-                         bool bAsync);
-
 // TODO: the 2 functions below are pretty much a constructor/destructor of a simple
 // GPU table object. There is also almost self-contained fetchFromParamLookupTable()
 // in cuda_kernel_utils.cuh. They could all live in a separate class/struct file.
-- 
2.22.0
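
For reference, the buffered-reallocation semantics that reallocateDeviceBuffer()
retains from the removed helper can be sketched as below. This is an illustrative
reimplementation only (the function name is invented, and the stream argument
that the real function accepts is ignored here), not the actual code in
src/gromacs/gpu_utils/devicebuffer.h:

    #include "gromacs/gpu_utils/cudautils.cuh"  /* CU_RET_ERR */
    #include "gromacs/gpu_utils/devicebuffer.h" /* freeDeviceBuffer */
    #include "gromacs/utility/smalloc.h"        /* over_alloc_large */

    /* Grow-only, over-allocated device storage with separately tracked
     * logical size and capacity (element counts, not bytes). */
    template <typename ValueType>
    static void reallocateDeviceBufferSketch(ValueType **buffer, int numValues,
                                             int *currentNumValues,
                                             int *currentMaxNumValues,
                                             cudaStream_t /*stream*/)
    {
        /* reallocate only if the requested size exceeds the capacity */
        if (numValues > *currentMaxNumValues)
        {
            /* only free if the buffer has already been initialized */
            if (*currentMaxNumValues >= 0)
            {
                freeDeviceBuffer(buffer);
            }
            /* over-allocate to amortize the cost of future growth */
            *currentMaxNumValues = over_alloc_large(numValues);
            cudaError_t stat = cudaMalloc((void **)buffer,
                                          *currentMaxNumValues * sizeof(ValueType));
            CU_RET_ERR(stat, "cudaMalloc failed in reallocateDeviceBufferSketch");
        }
        /* the logical size can change without an actual reallocation */
        *currentNumValues = numValues;
    }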