pmeGpu->kernelParams->grid.realGridSize[YY] +
pmeGpu->kernelParams->grid.realGridSize[ZZ];
const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize);
- cu_realloc_buffered((void **)&pmeGpu->kernelParams->grid.d_splineModuli, nullptr, sizeof(float),
- &pmeGpu->archSpecific->splineValuesSize, &pmeGpu->archSpecific->splineValuesSizeAlloc, newSplineValuesSize, pmeGpu->archSpecific->pmeStream, true);
+ reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize,
+ &pmeGpu->archSpecific->splineValuesSize, &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->pmeStream);
if (shouldRealloc)
{
/* Reallocate the host buffer */
{
const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
- cu_realloc_buffered((void **)&pmeGpu->kernelParams->atoms.d_forces, nullptr, sizeof(float),
- &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc, newForcesSize, pmeGpu->archSpecific->pmeStream, true);
+ reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize,
+ &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->pmeStream);
pmeGpu->staging.h_forces.reserve(pmeGpu->nAtomsAlloc);
pmeGpu->staging.h_forces.resize(pmeGpu->kernelParams->atoms.nAtoms);
}
{
const size_t newCoordinatesSize = pmeGpu->nAtomsAlloc * DIM;
GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU");
- cu_realloc_buffered((void **)&pmeGpu->kernelParams->atoms.d_coordinates, nullptr, sizeof(float),
- &pmeGpu->archSpecific->coordinatesSize, &pmeGpu->archSpecific->coordinatesSizeAlloc, newCoordinatesSize, pmeGpu->archSpecific->pmeStream, true);
+ reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates, newCoordinatesSize,
+ &pmeGpu->archSpecific->coordinatesSize, &pmeGpu->archSpecific->coordinatesSizeAlloc, pmeGpu->archSpecific->pmeStream);
if (c_usePadding)
{
const size_t paddingIndex = DIM * pmeGpu->kernelParams->atoms.nAtoms;
GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
- cu_realloc_buffered((void **)&pmeGpu->kernelParams->atoms.d_coefficients, nullptr, sizeof(float),
- &pmeGpu->archSpecific->coefficientsSize, &pmeGpu->archSpecific->coefficientsSizeAlloc,
- newCoefficientsSize, pmeGpu->archSpecific->pmeStream, true);
+ reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize,
+ &pmeGpu->archSpecific->coefficientsSize, &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->pmeStream);
cu_copy_H2D(pmeGpu->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
pmeGpu->kernelParams->atoms.nAtoms * sizeof(float), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
if (c_usePadding)
const bool shouldRealloc = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize);
int currentSizeTemp = pmeGpu->archSpecific->splineDataSize;
int currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc;
- cu_realloc_buffered((void **)&pmeGpu->kernelParams->atoms.d_theta, nullptr, sizeof(float),
- &currentSizeTemp, &currentSizeTempAlloc, newSplineDataSize, pmeGpu->archSpecific->pmeStream, true);
- cu_realloc_buffered((void **)&pmeGpu->kernelParams->atoms.d_dtheta, nullptr, sizeof(float),
- &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc, newSplineDataSize, pmeGpu->archSpecific->pmeStream, true);
+ reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize,
+ &currentSizeTemp, &currentSizeTempAlloc, pmeGpu->archSpecific->pmeStream);
+ reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize,
+ &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->pmeStream);
// the host side reallocation
if (shouldRealloc)
{
{
const size_t newIndicesSize = DIM * pmeGpu->nAtomsAlloc;
GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
- cu_realloc_buffered((void **)&pmeGpu->kernelParams->atoms.d_gridlineIndices, nullptr, sizeof(int),
- &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, newIndicesSize, pmeGpu->archSpecific->pmeStream, true);
+ reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
+ &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->pmeStream);
pfree(pmeGpu->staging.h_gridlineIndices);
pmalloc((void **)&pmeGpu->staging.h_gridlineIndices, newIndicesSize * sizeof(int));
}
if (pmeGpu->archSpecific->performOutOfPlaceFFT)
{
/* 2 separate grids */
- cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_fourierGrid, nullptr, sizeof(float),
- &pmeGpu->archSpecific->complexGridSize, &pmeGpu->archSpecific->complexGridSizeAlloc,
- newComplexGridSize, pmeGpu->archSpecific->pmeStream, true);
- cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_realGrid, nullptr, sizeof(float),
- &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc,
- newRealGridSize, pmeGpu->archSpecific->pmeStream, true);
+ reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize,
+ &pmeGpu->archSpecific->complexGridSize, &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
+ reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newRealGridSize,
+ &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
}
else
{
/* A single buffer so that any grid will fit */
const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
- cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_realGrid, nullptr, sizeof(float),
- &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc,
- newGridsSize, pmeGpu->archSpecific->pmeStream, true);
+ reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newGridsSize,
+ &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
kernelParamsPtr->grid.d_fourierGrid = kernelParamsPtr->grid.d_realGrid;
pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize;
// the size might get used later for copying the grid
#include <cstdlib>
#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
-#include "gromacs/gpu_utils/devicebuffer.h" //TODO remove when removing cu_realloc_buffered
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/utility/gmxassert.h"
-#include "gromacs/utility/smalloc.h"
/*** Generic CUDA data operation wrappers ***/
return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Async, s);
}
-/**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
-
-/*!
- * Reallocation of the memory pointed by d_ptr and copying of the data from
- * the location pointed by h_src host-side pointer is done. Allocation is
- * buffered and therefore freeing is only needed if the previously allocated
- * space is not enough.
- * The H2D copy is launched in stream s and can be done synchronously or
- * asynchronously (the default is the latter).
- */
-void cu_realloc_buffered(void **d_dest, void *h_src,
- size_t type_size,
- int *curr_size, int *curr_alloc_size,
- int req_size,
- cudaStream_t s,
- bool bAsync = true)
-{
- cudaError_t stat;
-
- if (d_dest == NULL || req_size < 0)
- {
- return;
- }
-
- /* reallocate only if the data does not fit = allocation size is smaller
- than the current requested size */
- if (req_size > *curr_alloc_size)
- {
- /* only free if the array has already been initialized */
- if (*curr_alloc_size >= 0)
- {
- freeDeviceBuffer(d_dest);
- }
-
- *curr_alloc_size = over_alloc_large(req_size);
-
- stat = cudaMalloc(d_dest, *curr_alloc_size * type_size);
- CU_RET_ERR(stat, "cudaMalloc failed in cu_realloc_buffered");
- }
-
- /* size could have changed without actual reallocation */
- *curr_size = req_size;
-
- /* upload to device */
- if (h_src)
- {
- if (bAsync)
- {
- cu_copy_H2D_async(*d_dest, h_src, *curr_size * type_size, s);
- }
- else
- {
- cu_copy_H2D_sync(*d_dest, h_src, *curr_size * type_size);
- }
- }
-}
-
/*! \brief Return whether texture objects are used on this device.
*
* \param[in] pointer to the GPU device info structure to inspect for texture objects support