From 59ebd3a3bb4a148f5406df7a122f9e955eead120 Mon Sep 17 00:00:00 2001
From: Aleksei Iupinov <a.yupinov@gmail.com>
Date: Tue, 13 Feb 2018 13:08:13 +0100
Subject: [PATCH] DeviceBuffer headers are added

For now they contain templated versions of GPU memory (re-)allocation
and host-to-device copy routines. The DeviceBuffer declarations
and the freeDeviceBuffer() implementations are moved there as well.
Using those, the GPU pairlist allocation code is made look
mostly the same in CUDA and OpenCL. ocl_realloc_buffered() is
removed; cu_realloc_buffered() is to go next.

Change-Id: Ic724113d7d0ed6b4707d094010e11ccbcd5da3c5
---
 src/gromacs/ewald/pme.cu                      |   1 +
 src/gromacs/gpu_utils/cudautils.cu            |   1 +
 src/gromacs/gpu_utils/cudautils.cuh           |  17 --
 src/gromacs/gpu_utils/devicebuffer.cuh        | 144 ++++++++++++++++
 src/gromacs/gpu_utils/devicebuffer.h          | 104 +++++++++++
 src/gromacs/gpu_utils/devicebuffer_ocl.h      | 163 ++++++++++++++++++
 src/gromacs/gpu_utils/gpu_utils.cu            |   2 +-
 src/gromacs/gpu_utils/gpu_utils.h             |   2 +-
 src/gromacs/gpu_utils/gpuregiontimer.cuh      |   9 +
 src/gromacs/gpu_utils/gputraits.cuh           |   5 +-
 src/gromacs/gpu_utils/gputraits_ocl.h         |   5 +-
 src/gromacs/gpu_utils/oclutils.h              |  17 --
 .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu  |  42 ++---
 .../mdlib/nbnxn_cuda/nbnxn_cuda_types.h       |   2 +
 .../mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp   | 110 +++---------
 src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h |   1 +
 16 files changed, 476 insertions(+), 149 deletions(-)
 create mode 100644 src/gromacs/gpu_utils/devicebuffer.cuh
 create mode 100644 src/gromacs/gpu_utils/devicebuffer.h
 create mode 100644 src/gromacs/gpu_utils/devicebuffer_ocl.h
diff --git a/src/gromacs/ewald/pme.cu b/src/gromacs/ewald/pme.cu
index acafc3b94c..e1c3ce14ff 100644
--- a/src/gromacs/ewald/pme.cu
+++ b/src/gromacs/ewald/pme.cu
@@ -48,6 +48,7 @@
 #include "pme.h"
 
 #include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/pmalloc_cuda.h"
 #include "gromacs/utility/gmxassert.h"
 #include "gromacs/utility/smalloc.h"
diff --git a/src/gromacs/gpu_utils/cudautils.cu b/src/gromacs/gpu_utils/cudautils.cu
index e466ef9729..347f853929 100644
--- a/src/gromacs/gpu_utils/cudautils.cu
+++ b/src/gromacs/gpu_utils/cudautils.cu
@@ -41,6 +41,7 @@
 #include <cstdlib>
 
 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+#include "gromacs/gpu_utils/devicebuffer.h" //TODO remove when removing cu_realloc_buffered
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/utility/gmxassert.h"
 #include "gromacs/utility/smalloc.h"
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh
index a91df33e2d..5a05e3b1fc 100644
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -294,21 +294,4 @@ static inline bool haveStreamTasksCompleted(cudaStream_t s)
     return true;
 }
 
-/*! \brief Free a device-side buffer.
- * This does not reset separately stored size/capacity integers,
- * as this is planned to be a destructor of DeviceBuffer as a proper class,
- * and no calls on \p buffer should be made afterwards.
- *
- * \param[in] buffer  Pointer to the buffer to free.
- */
-template <typename DeviceBuffer>
-void freeDeviceBuffer(DeviceBuffer *buffer)
-{
-    GMX_ASSERT(buffer, "needs a buffer pointer");
-    if (*buffer)
-    {
-        GMX_RELEASE_ASSERT(cudaFree(*buffer) == cudaSuccess, "cudaFree failed");
-    }
-}
-
 #endif
diff --git a/src/gromacs/gpu_utils/devicebuffer.cuh b/src/gromacs/gpu_utils/devicebuffer.cuh
new file mode 100644
index 0000000000..9268dba550
--- /dev/null
+++ b/src/gromacs/gpu_utils/devicebuffer.cuh
@@ -0,0 +1,144 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_GPU_UTILS_DEVICEBUFFER_CUH
+#define GMX_GPU_UTILS_DEVICEBUFFER_CUH
+
+/*! \libinternal \file
+ *  \brief Implements the DeviceBuffer type and routines for CUDA.
+ *  Should only be included directly by the main DeviceBuffer file devicebuffer.h.
+ *  TODO: the intent is for DeviceBuffer to become a class.
+ *
+ *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ *
+ *  \inlibraryapi
+ */
+
+#include "gromacs/gpu_utils/gpu_utils.h" //only for GpuApiCallBehavior
+#include "gromacs/gpu_utils/gputraits.cuh"
+#include "gromacs/utility/gmxassert.h"
+
+//! \brief A device-side buffer of ValueTypes
+template<typename ValueType>
+using DeviceBuffer = ValueType *;
+
+/*! \brief
+ * Allocates a device-side buffer.
+ * It is currently a caller's responsibility to call it only on not-yet allocated buffers.
+ *
+ * \tparam        ValueType            Raw value type of the \p buffer.
+ * \param[in,out] buffer               Pointer to the device-side buffer.
+ * \param[in]     numValues            Number of values to accomodate.
+ * \param[in]     context              The buffer's dummy context - not managed explicitly in CUDA RT.
+ */
+template <typename ValueType>
+void allocateDeviceBuffer(DeviceBuffer<ValueType> *buffer,
+                          size_t                   numValues,
+                          Context                  /* context */)
+{
+    GMX_ASSERT(buffer, "needs a buffer pointer");
+    cudaError_t stat = cudaMalloc((void **)buffer, numValues * sizeof(ValueType));
+    GMX_RELEASE_ASSERT(stat == cudaSuccess, "cudaMalloc failure");
+}
+
+/*! \brief
+ * Frees a device-side buffer.
+ * This does not reset separately stored size/capacity integers,
+ * as this is planned to be a destructor of DeviceBuffer as a proper class,
+ * and no calls on \p buffer should be made afterwards.
+ *
+ * \param[in] buffer  Pointer to the buffer to free.
+ */
+template <typename DeviceBuffer>
+void freeDeviceBuffer(DeviceBuffer *buffer)
+{
+    GMX_ASSERT(buffer, "needs a buffer pointer");
+    if (*buffer)
+    {
+        GMX_RELEASE_ASSERT(cudaFree(*buffer) == cudaSuccess, "cudaFree failed");
+    }
+}
+
+/*! \brief
+ * Performs the host-to-device data copy, synchronous or asynchronously on request.
+ *
+ * TODO: This is meant to gradually replace cu/ocl_copy_h2d.
+ *
+ * \tparam        ValueType            Raw value type of the \p buffer.
+ * \param[in,out] buffer               Pointer to the device-side buffer
+ * \param[in]     hostBuffer           Pointer to the raw host-side memory, also typed \p ValueType
+ * \param[in]     startingValueIndex   Offset (in values) at the device-side buffer to copy into.
+ * \param[in]     numValues            Number of values to copy.
+ * \param[in]     stream               GPU stream to perform asynchronous copy in.
+ * \param[in]     transferKind         Copy type: synchronous or asynchronous.
+ * \param[out]    timingEvent          A dummy pointer to the H2D copy timing event to be filled in.
+ *                                     Not used in CUDA implementation.
+ */
+template <typename ValueType>
+void copyToDeviceBuffer(DeviceBuffer<ValueType> *buffer,
+                        const ValueType         *hostBuffer,
+                        size_t                   startingValueIndex,
+                        size_t                   numValues,
+                        CommandStream            stream,
+                        GpuApiCallBehavior       transferKind,
+                        CommandEvent             */*timingEvent*/)
+{
+    if (numValues == 0)
+    {
+        return; // such calls are actually made with empty domains
+    }
+    GMX_ASSERT(buffer, "needs a buffer pointer");
+    GMX_ASSERT(hostBuffer, "needs a host buffer pointer");
+    cudaError_t  stat;
+    const size_t bytes = numValues * sizeof(ValueType);
+
+    switch (transferKind)
+    {
+        case GpuApiCallBehavior::Async:
+            GMX_ASSERT(isHostMemoryPinned(hostBuffer), "Source host buffer was not pinned for CUDA");
+            stat = cudaMemcpyAsync(*((ValueType **)buffer) + startingValueIndex, hostBuffer, bytes, cudaMemcpyHostToDevice, stream);
+            GMX_RELEASE_ASSERT(stat == cudaSuccess, "Asynchronous H2D copy failed");
+            break;
+
+        case GpuApiCallBehavior::Sync:
+            stat = cudaMemcpy(*((ValueType **)buffer) + startingValueIndex, hostBuffer, bytes, cudaMemcpyHostToDevice);
+            GMX_RELEASE_ASSERT(stat == cudaSuccess, "Synchronous H2D copy failed");
+            break;
+
+        default:
+            throw;
+    }
+}
+
+#endif
diff --git a/src/gromacs/gpu_utils/devicebuffer.h b/src/gromacs/gpu_utils/devicebuffer.h
new file mode 100644
index 0000000000..bfd0a7f801
--- /dev/null
+++ b/src/gromacs/gpu_utils/devicebuffer.h
@@ -0,0 +1,104 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_GPU_UTILS_DEVICEBUFFER_H
+#define GMX_GPU_UTILS_DEVICEBUFFER_H
+
+/*! \libinternal \file
+ *  \brief Implements the logic for handling of DeviceBuffer types in OpenCL/CUDA.
+ *  Can only be included on GPU build paths.
+ *
+ *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ *
+ *  \inlibraryapi
+ */
+
+#include "config.h"
+
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/smalloc.h" // TODO: this is only for over_alloc_large
+
+#if GMX_GPU == GMX_GPU_CUDA
+#include "gromacs/gpu_utils/devicebuffer.cuh"
+#elif GMX_GPU == GMX_GPU_OPENCL
+#include "gromacs/gpu_utils/devicebuffer_ocl.h"
+#else
+// cppcheck-suppress preprocessorErrorDirective
+#error "devicebuffer.h included on non-GPU build!"
+#endif
+
+/*! \brief
+ *  Reallocates the device-side buffer.
+ *
+ *  Reallocates the device-side memory pointed by \p buffer.
+ *  Allocation is buffered and therefore freeing is only needed
+ *  if the previously allocated space is not enough.
+ *  \p currentNumValues and \p currentMaxNumValues are updated.
+ *  TODO: \p currentNumValues, \p currentMaxNumValues, \p context
+ *  should all be encapsulated in a host-side class together with the buffer.
+ *
+ *  \tparam        ValueType            Raw value type of the \p buffer.
+ *  \param[in,out] buffer               Pointer to the device-side buffer
+ *  \param[in]     numValues            Number of values to accomodate.
+ *  \param[in,out] currentNumValues     The pointer to the buffer's number of values.
+ *  \param[in,out] currentMaxNumValues  The pointer to the buffer's capacity.
+ *  \param[in]     context              The buffer's context.
+ */
+template <typename ValueType>
+void reallocateDeviceBuffer(DeviceBuffer<ValueType> *buffer,
+                            size_t                   numValues,
+                            int                     *currentNumValues,
+                            int                     *currentMaxNumValues,
+                            Context                  context)
+{
+    GMX_ASSERT(buffer, "needs a buffer pointer");
+    GMX_ASSERT(currentNumValues, "needs a size pointer");
+    GMX_ASSERT(currentMaxNumValues, "needs a capacity pointer");
+
+    /* reallocate only if the data does not fit */
+    if (static_cast<int>(numValues) > *currentMaxNumValues)
+    {
+        if (*currentMaxNumValues >= 0)
+        {
+            freeDeviceBuffer(buffer);
+        }
+
+        *currentMaxNumValues = over_alloc_large(numValues);
+        allocateDeviceBuffer(buffer, *currentMaxNumValues, context);
+    }
+    /* size could have changed without actual reallocation */
+    *currentNumValues = numValues;
+}
+
+#endif
diff --git a/src/gromacs/gpu_utils/devicebuffer_ocl.h b/src/gromacs/gpu_utils/devicebuffer_ocl.h
new file mode 100644
index 0000000000..fb0fbcc27e
--- /dev/null
+++ b/src/gromacs/gpu_utils/devicebuffer_ocl.h
@@ -0,0 +1,163 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_GPU_UTILS_DEVICEBUFFER_OCL_H
+#define GMX_GPU_UTILS_DEVICEBUFFER_OCL_H
+
+/*! \libinternal \file
+ *  \brief Implements the DeviceBuffer type and routines for OpenCL.
+ *  Should only be included directly by the main DeviceBuffer file devicebuffer.h.
+ *  TODO: the intent is for DeviceBuffer to become a class.
+ *
+ *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ *
+ *  \inlibraryapi
+ */
+
+#include "gromacs/gpu_utils/gpu_utils.h" //only for GpuApiCallBehavior
+#include "gromacs/gpu_utils/gputraits_ocl.h"
+#include "gromacs/utility/gmxassert.h"
+
+/*! \libinternal \brief
+ * A minimal cl_mem wrapper that remembers its allocation type.
+ * The only point is making template type deduction possible.
+ */
+template<typename ValueType>
+class TypedClMemory
+{
+    private:
+        //! \brief Underlying data - not nulled right here only because we still have some snew()'s around
+        cl_mem data_;
+    public:
+        //! \brief An assignment operator - the purpose is to make allocation/zeroing work
+        void operator=(cl_mem data){data_ = data; }
+        //! \brief Returns underlying cl_mem transparently
+        operator cl_mem() {return data_; }
+};
+
+//! \libinternal \brief A device-side buffer of ValueTypes
+template<typename ValueType>
+using DeviceBuffer = TypedClMemory<ValueType>;
+
+/*! \libinternal \brief
+ * Allocates a device-side buffer.
+ * It is currently a caller's responsibility to call it only on not-yet allocated buffers.
+ *
+ * \tparam        ValueType            Raw value type of the \p buffer.
+ * \param[in,out] buffer               Pointer to the device-side buffer.
+ * \param[in]     numValues            Number of values to accomodate.
+ * \param[in]     context              The buffer's context-to-be.
+ */
+template <typename ValueType>
+void allocateDeviceBuffer(DeviceBuffer<ValueType> *buffer,
+                          size_t                   numValues,
+                          Context                  context)
+{
+    GMX_ASSERT(buffer, "needs a buffer pointer");
+    void  *hostPtr = nullptr;
+    cl_int clError;
+    *buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, numValues * sizeof(ValueType), hostPtr, &clError);
+    GMX_RELEASE_ASSERT(clError == CL_SUCCESS, "clCreateBuffer failure");
+}
+
+/*! \brief
+ * Frees a device-side buffer.
+ * This does not reset separately stored size/capacity integers,
+ * as this is planned to be a destructor of DeviceBuffer as a proper class,
+ * and no calls on \p buffer should be made afterwards.
+ *
+ * \param[in] buffer  Pointer to the buffer to free.
+ */
+template <typename DeviceBuffer>
+void freeDeviceBuffer(DeviceBuffer *buffer)
+{
+    GMX_ASSERT(buffer, "needs a buffer pointer");
+    if (*buffer)
+    {
+        GMX_RELEASE_ASSERT(clReleaseMemObject(*buffer) == CL_SUCCESS, "clReleaseMemObject failed");
+    }
+}
+
+/*! \brief
+ * Performs the host-to-device data copy, synchronous or asynchronously on request.
+ *
+ * TODO: This is meant to gradually replace cu/ocl_copy_h2d.
+ *
+ * \tparam        ValueType            Raw value type of the \p buffer.
+ * \param[in,out] buffer               Pointer to the device-side buffer
+ * \param[in]     hostBuffer           Pointer to the raw host-side memory, also typed \p ValueType
+ * \param[in]     startingValueIndex   Offset (in values) at the device-side buffer to copy into.
+ * \param[in]     numValues            Number of values to copy.
+ * \param[in]     stream               GPU stream to perform asynchronous copy in.
+ * \param[in]     transferKind         Copy type: synchronous or asynchronous.
+ * \param[out]    timingEvent          A pointer to the H2D copy timing event to be filled in.
+ *                                     If the pointer is not null, the event can further be used
+ *                                     to queue a wait for this operation or to query profiling information.
+ */
+template <typename ValueType>
+void copyToDeviceBuffer(DeviceBuffer<ValueType> *buffer,
+                        const ValueType         *hostBuffer,
+                        size_t                   startingValueIndex,
+                        size_t                   numValues,
+                        CommandStream            stream,
+                        GpuApiCallBehavior       transferKind,
+                        CommandEvent            *timingEvent)
+{
+    if (numValues == 0)
+    {
+        return; // such calls are actually made with empty domains
+    }
+    GMX_ASSERT(buffer, "needs a buffer pointer");
+    GMX_ASSERT(hostBuffer, "needs a host buffer pointer");
+    cl_int       clError;
+    const size_t offset = startingValueIndex * sizeof(ValueType);
+    const size_t bytes  = numValues * sizeof(ValueType);
+    switch (transferKind)
+    {
+        case GpuApiCallBehavior::Async:
+            clError = clEnqueueWriteBuffer(stream, *buffer, CL_FALSE, offset, bytes, hostBuffer, 0, nullptr, timingEvent);
+            GMX_RELEASE_ASSERT(clError == CL_SUCCESS, "Asynchronous H2D copy failed");
+            break;
+
+        case GpuApiCallBehavior::Sync:
+            clError = clEnqueueWriteBuffer(stream, *buffer, CL_TRUE, offset, bytes, hostBuffer, 0, nullptr, timingEvent);
+            GMX_RELEASE_ASSERT(clError == CL_SUCCESS, "Synchronous H2D copy failed");
+            break;
+
+        default:
+            throw;
+    }
+}
+
+#endif
diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu
index 7660d7cc24..c5e367884d 100644
--- a/src/gromacs/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gpu_utils/gpu_utils.cu
@@ -137,7 +137,7 @@ static void checkCompiledTargetCompatibility(const gmx_device_info_t *devInfo)
     }
 }
 
-bool isHostMemoryPinned(void *h_ptr)
+bool isHostMemoryPinned(const void *h_ptr)
 {
     cudaPointerAttributes memoryAttributes;
     cudaError_t           stat = cudaPointerGetAttributes(&memoryAttributes, h_ptr);
diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h
index 9634c38034..35921e8446 100644
--- a/src/gromacs/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gpu_utils/gpu_utils.h
@@ -271,6 +271,6 @@ void stopGpuProfiler(void) CUDA_FUNC_TERM
 
 //! Tells whether the host buffer was pinned for non-blocking transfers. Only implemented for CUDA.
 CUDA_FUNC_QUALIFIER
-bool isHostMemoryPinned(void *CUDA_FUNC_ARGUMENT(h_ptr)) CUDA_FUNC_TERM_WITH_RETURN(false)
+bool isHostMemoryPinned(const void *CUDA_FUNC_ARGUMENT(h_ptr)) CUDA_FUNC_TERM_WITH_RETURN(false)
 
 #endif
diff --git a/src/gromacs/gpu_utils/gpuregiontimer.cuh b/src/gromacs/gpu_utils/gpuregiontimer.cuh
index 17321f7c62..393bd7f726 100644
--- a/src/gromacs/gpu_utils/gpuregiontimer.cuh
+++ b/src/gromacs/gpu_utils/gpuregiontimer.cuh
@@ -102,6 +102,15 @@ class GpuRegionTimerImpl
 
         /*! \brief Resets internal state */
         inline void reset(){}
+
+        /*! \brief Returns a new raw timing event
+         * for passing into individual GPU API calls.
+         * This is just a dummy in CUDA.
+         */
+        inline CommandEvent *fetchNextEvent()
+        {
+            return nullptr;
+        }
 };
 
 //! Short-hand for external use
diff --git a/src/gromacs/gpu_utils/gputraits.cuh b/src/gromacs/gpu_utils/gputraits.cuh
index 9e96830a20..323f80eb9a 100644
--- a/src/gromacs/gpu_utils/gputraits.cuh
+++ b/src/gromacs/gpu_utils/gputraits.cuh
@@ -46,8 +46,7 @@
 using CommandStream = cudaStream_t;
 //! \brief Single GPU call timing event - meaningless in CUDA
 using CommandEvent  = void;
-//! \brief A device-side buffer of ValueTypes
-template<typename ValueType>
-using DeviceBuffer  = ValueType *;
+//! \brief Context used explicitly in OpenCL, does nothing in CUDA
+using Context       = void *;
 
 #endif
diff --git a/src/gromacs/gpu_utils/gputraits_ocl.h b/src/gromacs/gpu_utils/gputraits_ocl.h
index 9d8575788d..adf6f9508f 100644
--- a/src/gromacs/gpu_utils/gputraits_ocl.h
+++ b/src/gromacs/gpu_utils/gputraits_ocl.h
@@ -48,8 +48,7 @@
 using CommandStream = cl_command_queue;
 //! \brief Single GPU call timing event
 using CommandEvent  = cl_event;
-//! \brief A device-side buffer of ValueTypes
-template<typename ValueType>
-using DeviceBuffer  = cl_mem;
+//! \brief Context used explicitly in OpenCL
+using Context       = cl_context;
 
 #endif
diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h
index 093fe31e19..cc71d888f4 100644
--- a/src/gromacs/gpu_utils/oclutils.h
+++ b/src/gromacs/gpu_utils/oclutils.h
@@ -177,21 +177,4 @@ static inline bool haveStreamTasksCompleted(cl_command_queue gmx_unused s)
     return false;
 }
 
-/*! \brief Free a device-side buffer.
- * This does not reset separately stored size/capacity integers,
- * as this is planned to be a destructor of DeviceBuffer as a proper class,
- * and no calls on \p buffer should be made afterwards.
- *
- * \param[in] buffer  Pointer to the buffer to free.
- */
-template <typename DeviceBuffer>
-void freeDeviceBuffer(DeviceBuffer *buffer)
-{
-    GMX_ASSERT(buffer, "needs a buffer pointer");
-    if (*buffer)
-    {
-        GMX_RELEASE_ASSERT(clReleaseMemObject(*buffer) == CL_SUCCESS, "clReleaseMemObject failed");
-    }
-}
-
 #endif
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
index ea9b3b8fe4..e6c8e531ad 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -557,26 +557,28 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t       *nb,
         nb->timers->didPairlistH2D[iloc] = true;
     }
 
-    cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
-                        &d_plist->nsci, &d_plist->sci_nalloc,
-                        h_plist->nsci,
-                        stream, true);
-
-    cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
-                        &d_plist->ncj4, &d_plist->cj4_nalloc,
-                        h_plist->ncj4,
-                        stream, true);
-
-    /* this call only allocates space on the device (no data is transferred) */
-    cu_realloc_buffered((void **)&d_plist->imask, NULL, sizeof(*d_plist->imask),
-                        &d_plist->nimask, &d_plist->imask_nalloc,
-                        h_plist->ncj4*c_nbnxnGpuClusterpairSplit,
-                        stream, true);
-
-    cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
-                        &d_plist->nexcl, &d_plist->excl_nalloc,
-                        h_plist->nexcl,
-                        stream, true);
+    Context context = nullptr;
+
+    reallocateDeviceBuffer(&d_plist->sci, h_plist->nsci,
+                           &d_plist->nsci, &d_plist->sci_nalloc, context);
+    copyToDeviceBuffer(&d_plist->sci, h_plist->sci, 0, h_plist->nsci,
+                       stream, GpuApiCallBehavior::Async,
+                       bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+
+    reallocateDeviceBuffer(&d_plist->cj4, h_plist->ncj4,
+                           &d_plist->ncj4, &d_plist->cj4_nalloc, context);
+    copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4, 0, h_plist->ncj4,
+                       stream, GpuApiCallBehavior::Async,
+                       bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+
+    reallocateDeviceBuffer(&d_plist->imask, h_plist->ncj4*c_nbnxnGpuClusterpairSplit,
+                           &d_plist->nimask, &d_plist->imask_nalloc, context);
+
+    reallocateDeviceBuffer(&d_plist->excl, h_plist->nexcl,
+                           &d_plist->nexcl, &d_plist->excl_nalloc, context);
+    copyToDeviceBuffer(&d_plist->excl, h_plist->excl, 0, h_plist->nexcl,
+                       stream, GpuApiCallBehavior::Async,
+                       bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
 
     if (bDoTime)
     {
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
index c155d5bdba..97e83e87cb 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -48,6 +48,8 @@
 
 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
 #include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/mdlib/nbnxn_consts.h"
 #include "gromacs/mdlib/nbnxn_gpu_types_common.h"
 #include "gromacs/mdlib/nbnxn_pairlist.h"
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
index cb11376666..32ebc57a4e 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
@@ -94,69 +94,6 @@ bool useLjCombRule(int vdwType)
             vdwType == evdwOclCUTCOMBLB);
 }
 
-/*! \brief Reallocation device buffers
- *
- *  Reallocation of the memory pointed by d_ptr and copying of the data from
- *  the location pointed by h_src host-side pointer is done. Allocation is
- *  buffered and therefore freeing is only needed if the previously allocated
- *  space is not enough.
- *  The H2D copy is launched in command queue s and can be done synchronously or
- *  asynchronously (the default is the latter).
- *  If copy_event is not NULL, on return it will contain an event object
- *  identifying the H2D copy. The event can further be used to queue a wait
- *  for this operation or to query profiling information.
- *  OpenCL equivalent of cu_realloc_buffered.
- */
-static void ocl_realloc_buffered(cl_mem *d_dest, void *h_src,
-                                 size_t type_size,
-                                 int *curr_size, int *curr_alloc_size,
-                                 int req_size,
-                                 cl_context context,
-                                 cl_command_queue s,
-                                 bool bAsync = true,
-                                 cl_event *copy_event = NULL)
-{
-    if (d_dest == NULL || req_size < 0)
-    {
-        return;
-    }
-
-    /* reallocate only if the data does not fit = allocation size is smaller
-       than the current requested size */
-    if (req_size > *curr_alloc_size)
-    {
-        cl_int gmx_unused cl_error;
-
-        /* only free if the array has already been initialized */
-        if (*curr_alloc_size >= 0)
-        {
-            freeDeviceBuffer(d_dest);
-        }
-
-        *curr_alloc_size = over_alloc_large(req_size);
-
-        *d_dest = clCreateBuffer(context, CL_MEM_READ_WRITE, *curr_alloc_size * type_size, NULL, &cl_error);
-        assert(cl_error == CL_SUCCESS);
-        // TODO: handle errors, check clCreateBuffer flags
-    }
-
-    /* size could have changed without actual reallocation */
-    *curr_size = req_size;
-
-    /* upload to device */
-    if (h_src)
-    {
-        if (bAsync)
-        {
-            ocl_copy_H2D_async(*d_dest, h_src, 0, *curr_size * type_size, s, copy_event);
-        }
-        else
-        {
-            ocl_copy_H2D_sync(*d_dest, h_src,  0, *curr_size * type_size, s);
-        }
-    }
-}
-
 /*! \brief Tabulates the Ewald Coulomb force and initializes the size/scale
  * and the table GPU array.
  *
@@ -886,30 +823,29 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t        *nb,
         nb->timers->didPairlistH2D[iloc] = true;
     }
 
-    ocl_realloc_buffered(&d_plist->sci, h_plist->sci, sizeof(nbnxn_sci_t),
-                         &d_plist->nsci, &d_plist->sci_nalloc,
-                         h_plist->nsci,
-                         nb->dev_rundata->context,
-                         stream, true, bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
-
-    ocl_realloc_buffered(&d_plist->cj4, h_plist->cj4, sizeof(nbnxn_cj4_t),
-                         &d_plist->ncj4, &d_plist->cj4_nalloc,
-                         h_plist->ncj4,
-                         nb->dev_rundata->context,
-                         stream, true, bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
-
-    /* this call only allocates space on the device (no data is transferred) - no timing as well! */
-    ocl_realloc_buffered(&d_plist->imask, NULL, sizeof(unsigned int),
-                         &d_plist->nimask, &d_plist->imask_nalloc,
-                         h_plist->ncj4*c_nbnxnGpuClusterpairSplit,
-                         nb->dev_rundata->context,
-                         stream, true);
-
-    ocl_realloc_buffered(&d_plist->excl, h_plist->excl, sizeof(nbnxn_excl_t),
-                         &d_plist->nexcl, &d_plist->excl_nalloc,
-                         h_plist->nexcl,
-                         nb->dev_rundata->context,
-                         stream, true, bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+    // TODO most of this function is same in CUDA and OpenCL, move into the header
+    Context context = nb->dev_rundata->context;
+
+    reallocateDeviceBuffer(&d_plist->sci, h_plist->nsci,
+                           &d_plist->nsci, &d_plist->sci_nalloc, context);
+    copyToDeviceBuffer(&d_plist->sci, h_plist->sci, 0, h_plist->nsci,
+                       stream, GpuApiCallBehavior::Async,
+                       bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+
+    reallocateDeviceBuffer(&d_plist->cj4, h_plist->ncj4,
+                           &d_plist->ncj4, &d_plist->cj4_nalloc, context);
+    copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4, 0, h_plist->ncj4,
+                       stream, GpuApiCallBehavior::Async,
+                       bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+
+    reallocateDeviceBuffer(&d_plist->imask, h_plist->ncj4*c_nbnxnGpuClusterpairSplit,
+                           &d_plist->nimask, &d_plist->imask_nalloc, context);
+
+    reallocateDeviceBuffer(&d_plist->excl, h_plist->nexcl,
+                           &d_plist->nexcl, &d_plist->excl_nalloc, context);
+    copyToDeviceBuffer(&d_plist->excl, h_plist->excl, 0, h_plist->nexcl,
+                       stream, GpuApiCallBehavior::Async,
+                       bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
 
     if (bDoTime)
     {
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
index 6b0ee88976..45bff6cd52 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
@@ -45,6 +45,7 @@
 #ifndef NBNXN_OPENCL_TYPES_H
 #define NBNXN_OPENCL_TYPES_H
 
+#include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gmxopencl.h"
 #include "gromacs/gpu_utils/gputraits_ocl.h"
 #include "gromacs/gpu_utils/oclutils.h"
-- 
2.22.0