From 049fdc5339e632f49a53e2264768ac93b1bdf48b Mon Sep 17 00:00:00 2001
From: Aleksei Iupinov
Date: Fri, 16 Feb 2018 16:12:51 +0100
Subject: [PATCH] Removed cu_realloc_buffered() in favor of
 reallocateDeviceBuffer()

Change-Id: I5f0d374f8e7e22505265f3cebc60fe29948ab6e9
---
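
The change is mechanical at every call site: reallocateDeviceBuffer() takes
over the paired size/capacity bookkeeping, and the host-to-device upload that
cu_realloc_buffered() used to fuse into the reallocation becomes a separate,
explicit cu_copy_H2D() call with the transfer kind spelled out. Below is a
minimal sketch of the resulting pattern, using only calls that appear in this
patch; the helper uploadFloats() and its parameter names are illustrative and
not part of the change:

    #include "gromacs/gpu_utils/cudautils.cuh"
    #include "gromacs/gpu_utils/devicebuffer.h"

    /* Grow a device buffer of floats to newSize elements and upload data.
     * size/capacity are the paired element counts described in pme.cuh. */
    static void uploadFloats(float **d_data, const float *h_data, int newSize,
                             int *size, int *capacity, cudaStream_t stream)
    {
        /* Reallocates (with over-allocation) only when newSize exceeds
         * *capacity; always sets *size to newSize. */
        reallocateDeviceBuffer(d_data, newSize, size, capacity, stream);
        /* The upload is no longer fused into the reallocation. */
        cu_copy_H2D(*d_data, const_cast<float *>(h_data), newSize * sizeof(float),
                    GpuApiCallBehavior::Async, stream);
    }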
 src/gromacs/ewald/pme.cu            | 44 ++++++++++-----------
 src/gromacs/ewald/pme.cuh           |  4 +-
 src/gromacs/gpu_utils/cudautils.cu  | 59 -----------------------------
 src/gromacs/gpu_utils/cudautils.cuh |  8 ----
 4 files changed, 22 insertions(+), 93 deletions(-)

diff --git a/src/gromacs/ewald/pme.cu b/src/gromacs/ewald/pme.cu
index e1c3ce14ff..ecbad009ce 100644
--- a/src/gromacs/ewald/pme.cu
+++ b/src/gromacs/ewald/pme.cu
@@ -114,8 +114,8 @@ void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *pmeGpu)
         pmeGpu->kernelParams->grid.realGridSize[YY] +
         pmeGpu->kernelParams->grid.realGridSize[ZZ];
     const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize);
-    cu_realloc_buffered((void **)&pmeGpu->kernelParams->grid.d_splineModuli, nullptr, sizeof(float),
-                        &pmeGpu->archSpecific->splineValuesSize, &pmeGpu->archSpecific->splineValuesSizeAlloc, newSplineValuesSize, pmeGpu->archSpecific->pmeStream, true);
+    reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize,
+                           &pmeGpu->archSpecific->splineValuesSize, &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->pmeStream);
     if (shouldRealloc)
     {
         /* Reallocate the host buffer */
@@ -141,8 +141,8 @@ void pme_gpu_realloc_forces(PmeGpu *pmeGpu)
 {
     const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
     GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
-    cu_realloc_buffered((void **)&pmeGpu->kernelParams->atoms.d_forces, nullptr, sizeof(float),
-                        &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc, newForcesSize, pmeGpu->archSpecific->pmeStream, true);
+    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize,
+                           &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->pmeStream);
     pmeGpu->staging.h_forces.reserve(pmeGpu->nAtomsAlloc);
     pmeGpu->staging.h_forces.resize(pmeGpu->kernelParams->atoms.nAtoms);
 }
@@ -170,8 +170,8 @@ void pme_gpu_realloc_coordinates(const PmeGpu *pmeGpu)
 {
     const size_t newCoordinatesSize = pmeGpu->nAtomsAlloc * DIM;
     GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU");
-    cu_realloc_buffered((void **)&pmeGpu->kernelParams->atoms.d_coordinates, nullptr, sizeof(float),
-                        &pmeGpu->archSpecific->coordinatesSize, &pmeGpu->archSpecific->coordinatesSizeAlloc, newCoordinatesSize, pmeGpu->archSpecific->pmeStream, true);
+    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates, newCoordinatesSize,
+                           &pmeGpu->archSpecific->coordinatesSize, &pmeGpu->archSpecific->coordinatesSizeAlloc, pmeGpu->archSpecific->pmeStream);
     if (c_usePadding)
     {
         const size_t paddingIndex = DIM * pmeGpu->kernelParams->atoms.nAtoms;
@@ -206,9 +206,8 @@ void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu *pmeGpu, const flo
     GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
     const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
     GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
-    cu_realloc_buffered((void **)&pmeGpu->kernelParams->atoms.d_coefficients, nullptr, sizeof(float),
-                        &pmeGpu->archSpecific->coefficientsSize, &pmeGpu->archSpecific->coefficientsSizeAlloc,
-                        newCoefficientsSize, pmeGpu->archSpecific->pmeStream, true);
+    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize,
+                           &pmeGpu->archSpecific->coefficientsSize, &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->pmeStream);
     cu_copy_H2D(pmeGpu->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
                 pmeGpu->kernelParams->atoms.nAtoms * sizeof(float), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
     if (c_usePadding)
@@ -239,10 +238,10 @@
     const bool shouldRealloc = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize);
     int currentSizeTemp = pmeGpu->archSpecific->splineDataSize;
     int currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc;
-    cu_realloc_buffered((void **)&pmeGpu->kernelParams->atoms.d_theta, nullptr, sizeof(float),
-                        &currentSizeTemp, &currentSizeTempAlloc, newSplineDataSize, pmeGpu->archSpecific->pmeStream, true);
-    cu_realloc_buffered((void **)&pmeGpu->kernelParams->atoms.d_dtheta, nullptr, sizeof(float),
-                        &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc, newSplineDataSize, pmeGpu->archSpecific->pmeStream, true);
+    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize,
+                           &currentSizeTemp, &currentSizeTempAlloc, pmeGpu->archSpecific->pmeStream);
+    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize,
+                           &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->pmeStream);
     // the host side reallocation
     if (shouldRealloc)
     {
@@ -266,8 +265,8 @@ void pme_gpu_realloc_grid_indices(const PmeGpu *pmeGpu)
 {
     const size_t newIndicesSize = DIM * pmeGpu->nAtomsAlloc;
     GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
-    cu_realloc_buffered((void **)&pmeGpu->kernelParams->atoms.d_gridlineIndices, nullptr, sizeof(int),
-                        &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, newIndicesSize, pmeGpu->archSpecific->pmeStream, true);
+    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
+                           &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->pmeStream);
     pfree(pmeGpu->staging.h_gridlineIndices);
     pmalloc((void **)&pmeGpu->staging.h_gridlineIndices, newIndicesSize * sizeof(int));
 }
@@ -291,20 +290,17 @@ void pme_gpu_realloc_grids(PmeGpu *pmeGpu)
     if (pmeGpu->archSpecific->performOutOfPlaceFFT)
     {
         /* 2 separate grids */
-        cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_fourierGrid, nullptr, sizeof(float),
-                            &pmeGpu->archSpecific->complexGridSize, &pmeGpu->archSpecific->complexGridSizeAlloc,
-                            newComplexGridSize, pmeGpu->archSpecific->pmeStream, true);
-        cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_realGrid, nullptr, sizeof(float),
-                            &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc,
-                            newRealGridSize, pmeGpu->archSpecific->pmeStream, true);
+        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize,
+                               &pmeGpu->archSpecific->complexGridSize, &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
+        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newRealGridSize,
+                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
     }
     else
     {
         /* A single buffer so that any grid will fit */
         const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
-        cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_realGrid, nullptr, sizeof(float),
-                            &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc,
-                            newGridsSize, pmeGpu->archSpecific->pmeStream, true);
+        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newGridsSize,
+                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
         kernelParamsPtr->grid.d_fourierGrid   = kernelParamsPtr->grid.d_realGrid;
         pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize;
         // the size might get used later for copying the grid
diff --git a/src/gromacs/ewald/pme.cuh b/src/gromacs/ewald/pme.cuh
index 4139d46911..a2fd7cbd09 100644
--- a/src/gromacs/ewald/pme.cuh
+++ b/src/gromacs/ewald/pme.cuh
@@ -232,8 +232,8 @@ struct PmeGpuCuda
     /* GPU arrays element counts (not the arrays sizes in bytes!).
      * They might be larger than the actual meaningful data sizes.
      * These are paired: the actual element count + the maximum element count that can fit in the current allocated memory.
-     * These integer pairs are mostly meaningful for the cu_realloc_buffered calls.
-     * As such, if cu_realloc_buffered is refactored, they can be freely changed, too.
+     * These integer pairs are mostly meaningful for the reallocateDeviceBuffer calls.
+     * As such, if DeviceBuffer is refactored into a class, they can be freely changed, too.
      * The only exceptions are realGridSize and complexGridSize which are also used for grid clearing/copying.
      * TODO: these should live in a clean buffered container type, and be refactored in the NB/cudautils as well.
      */
diff --git a/src/gromacs/gpu_utils/cudautils.cu b/src/gromacs/gpu_utils/cudautils.cu
index 347f853929..b66be40f09 100644
--- a/src/gromacs/gpu_utils/cudautils.cu
+++ b/src/gromacs/gpu_utils/cudautils.cu
@@ -41,10 +41,8 @@
 #include
 
 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
-#include "gromacs/gpu_utils/devicebuffer.h" //TODO remove when removing cu_realloc_buffered
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/utility/gmxassert.h"
-#include "gromacs/utility/smalloc.h"
 
 /*** Generic CUDA data operation wrappers ***/
@@ -136,63 +134,6 @@ int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s
     return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Async, s);
 }
 
-/**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
-
-/*!
- *  Reallocation of the memory pointed by d_ptr and copying of the data from
- *  the location pointed by h_src host-side pointer is done. Allocation is
- *  buffered and therefore freeing is only needed if the previously allocated
- *  space is not enough.
- *  The H2D copy is launched in stream s and can be done synchronously or
- *  asynchronously (the default is the latter).
- */
-void cu_realloc_buffered(void **d_dest, void *h_src,
-                         size_t type_size,
-                         int *curr_size, int *curr_alloc_size,
-                         int req_size,
-                         cudaStream_t s,
-                         bool bAsync = true)
-{
-    cudaError_t stat;
-
-    if (d_dest == NULL || req_size < 0)
-    {
-        return;
-    }
-
-    /* reallocate only if the data does not fit = allocation size is smaller
-       than the current requested size */
-    if (req_size > *curr_alloc_size)
-    {
-        /* only free if the array has already been initialized */
-        if (*curr_alloc_size >= 0)
-        {
-            freeDeviceBuffer(d_dest);
-        }
-
-        *curr_alloc_size = over_alloc_large(req_size);
-
-        stat = cudaMalloc(d_dest, *curr_alloc_size * type_size);
-        CU_RET_ERR(stat, "cudaMalloc failed in cu_realloc_buffered");
-    }
-
-    /* size could have changed without actual reallocation */
-    *curr_size = req_size;
-
-    /* upload to device */
-    if (h_src)
-    {
-        if (bAsync)
-        {
-            cu_copy_H2D_async(*d_dest, h_src, *curr_size * type_size, s);
-        }
-        else
-        {
-            cu_copy_H2D_sync(*d_dest, h_src, *curr_size * type_size);
-        }
-    }
-}
-
 /*! \brief Return whether texture objects are used on this device.
  *
  * \param[in] pointer to the GPU device info structure to inspect for texture objects support
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh
index 5a05e3b1fc..945d7bb1d7 100644
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -202,14 +202,6 @@ int cu_copy_H2D_sync(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/);
 
 /*! Launches asynchronous host to device memory copy in stream s. */
 int cu_copy_H2D_async(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/);
 
-/*! Reallocates the device memory and copies data from the host. */
-void cu_realloc_buffered(void **d_dest, void *h_src,
-                         size_t type_size,
-                         int *curr_size, int *curr_alloc_size,
-                         int req_size,
-                         cudaStream_t s,
-                         bool bAsync);
-
 // TODO: the 2 functions below are pretty much a constructor/destructor of a simple
 // GPU table object. There is also almost self-contained fetchFromParamLookupTable()
 // in cuda_kernel_utils.cuh. They could all live in a separate class/struct file.
-- 
2.22.0
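
For reference, the buffered-reallocation semantics that reallocateDeviceBuffer()
retains from the removed helper can be sketched as below. This is an illustrative
reimplementation only (the function name is invented, and the stream argument
that the real function accepts is ignored here), not the actual code in
src/gromacs/gpu_utils/devicebuffer.h:

    #include "gromacs/gpu_utils/cudautils.cuh"  /* CU_RET_ERR */
    #include "gromacs/gpu_utils/devicebuffer.h" /* freeDeviceBuffer */
    #include "gromacs/utility/smalloc.h"        /* over_alloc_large */

    /* Grow-only, over-allocated device storage with separately tracked
     * logical size and capacity (element counts, not bytes). */
    template <typename ValueType>
    static void reallocateDeviceBufferSketch(ValueType **buffer, int numValues,
                                             int *currentNumValues,
                                             int *currentMaxNumValues,
                                             cudaStream_t /*stream*/)
    {
        /* reallocate only if the requested size exceeds the capacity */
        if (numValues > *currentMaxNumValues)
        {
            /* only free if the buffer has already been initialized */
            if (*currentMaxNumValues >= 0)
            {
                freeDeviceBuffer(buffer);
            }
            /* over-allocate to amortize the cost of future growth */
            *currentMaxNumValues = over_alloc_large(numValues);
            cudaError_t stat = cudaMalloc((void **)buffer,
                                          *currentMaxNumValues * sizeof(ValueType));
            CU_RET_ERR(stat, "cudaMalloc failed in reallocateDeviceBufferSketch");
        }
        /* the logical size can change without an actual reallocation */
        *currentNumValues = numValues;
    }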