DeviceBuffer headers are added

author Aleksei Iupinov <a.yupinov@gmail.com>

Tue, 13 Feb 2018 12:08:13 +0000 (13:08 +0100)

committer Mark Abraham <mark.j.abraham@gmail.com>

Mon, 23 Apr 2018 09:22:47 +0000 (11:22 +0200)
author Aleksei Iupinov <a.yupinov@gmail.com>
Tue, 13 Feb 2018 12:08:13 +0000 (13:08 +0100)
committer Mark Abraham <mark.j.abraham@gmail.com>
Mon, 23 Apr 2018 09:22:47 +0000 (11:22 +0200)
diff --git a/src/gromacs/ewald/pme.cu b/src/gromacs/ewald/pme.cu

index acafc3b94cd8539dcf7d637db5839a42218a6c1b..e1c3ce14ff2b29cb6d8aa7e53ecaa660953e5b3a 100644 (file)
--- a/src/gromacs/ewald/pme.cu
+++ b/src/gromacs/ewald/pme.cu
@@ -48,6 +48,7 @@
  #include "pme.h"
  
  #include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/devicebuffer.h"
  #include "gromacs/gpu_utils/pmalloc_cuda.h"
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/smalloc.h"
diff --git a/src/gromacs/gpu_utils/cudautils.cu b/src/gromacs/gpu_utils/cudautils.cu

index e466ef97290d6062a1ace68ac0e50abad77f0a22..347f85392950bffd22b1a2326cb0a47929b7998c 100644 (file)
--- a/src/gromacs/gpu_utils/cudautils.cu
+++ b/src/gromacs/gpu_utils/cudautils.cu
@@ -41,6 +41,7 @@
  #include <cstdlib>
  
  #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+#include "gromacs/gpu_utils/devicebuffer.h" //TODO remove when removing cu_realloc_buffered
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/smalloc.h"
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh

index a91df33e2d7bdf310cb0885a3c7acd38afafc50d..5a05e3b1fcc0b821d808ee11c83bababc2458610 100644 (file)
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -294,21 +294,4 @@ static inline bool haveStreamTasksCompleted(cudaStream_t s)
      return true;
  }
  
-/*! \brief Free a device-side buffer.
- * This does not reset separately stored size/capacity integers,
- * as this is planned to be a destructor of DeviceBuffer as a proper class,
- * and no calls on \p buffer should be made afterwards.
- *
- * \param[in] buffer  Pointer to the buffer to free.
- */
-template <typename DeviceBuffer>
-void freeDeviceBuffer(DeviceBuffer *buffer)
-{
-    GMX_ASSERT(buffer, "needs a buffer pointer");
-    if (*buffer)
-    {
-        GMX_RELEASE_ASSERT(cudaFree(*buffer) == cudaSuccess, "cudaFree failed");
-    }
-}
-
  #endif
diff --git a/src/gromacs/gpu_utils/devicebuffer.cuh b/src/gromacs/gpu_utils/devicebuffer.cuh

new file mode 100644 (file)

index 0000000..9268dba
--- /dev/null
+++ b/src/gromacs/gpu_utils/devicebuffer.cuh
@@ -0,0 +1,144 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_GPU_UTILS_DEVICEBUFFER_CUH
+#define GMX_GPU_UTILS_DEVICEBUFFER_CUH
+
+/*! \libinternal \file
+ *  \brief Implements the DeviceBuffer type and routines for CUDA.
+ *  Should only be included directly by the main DeviceBuffer file devicebuffer.h.
+ *  TODO: the intent is for DeviceBuffer to become a class.
+ *
+ *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ *
+ *  \inlibraryapi
+ */
+
+#include "gromacs/gpu_utils/gpu_utils.h" //only for GpuApiCallBehavior
+#include "gromacs/gpu_utils/gputraits.cuh"
+#include "gromacs/utility/gmxassert.h"
+
+//! \brief A device-side buffer of ValueTypes
+template<typename ValueType>
+using DeviceBuffer = ValueType *;
+
+/*! \brief
+ * Allocates a device-side buffer.
+ * It is currently a caller's responsibility to call it only on not-yet allocated buffers.
+ *
+ * \tparam        ValueType            Raw value type of the \p buffer.
+ * \param[in,out] buffer               Pointer to the device-side buffer.
+ * \param[in]     numValues            Number of values to accomodate.
+ * \param[in]     context              The buffer's dummy context - not managed explicitly in CUDA RT.
+ */
+template <typename ValueType>
+void allocateDeviceBuffer(DeviceBuffer<ValueType> *buffer,
+                          size_t                   numValues,
+                          Context                  /* context */)
+{
+    GMX_ASSERT(buffer, "needs a buffer pointer");
+    cudaError_t stat = cudaMalloc((void **)buffer, numValues * sizeof(ValueType));
+    GMX_RELEASE_ASSERT(stat == cudaSuccess, "cudaMalloc failure");
+}
+
+/*! \brief
+ * Frees a device-side buffer.
+ * This does not reset separately stored size/capacity integers,
+ * as this is planned to be a destructor of DeviceBuffer as a proper class,
+ * and no calls on \p buffer should be made afterwards.
+ *
+ * \param[in] buffer  Pointer to the buffer to free.
+ */
+template <typename DeviceBuffer>
+void freeDeviceBuffer(DeviceBuffer *buffer)
+{
+    GMX_ASSERT(buffer, "needs a buffer pointer");
+    if (*buffer)
+    {
+        GMX_RELEASE_ASSERT(cudaFree(*buffer) == cudaSuccess, "cudaFree failed");
+    }
+}
+
+/*! \brief
+ * Performs the host-to-device data copy, synchronous or asynchronously on request.
+ *
+ * TODO: This is meant to gradually replace cu/ocl_copy_h2d.
+ *
+ * \tparam        ValueType            Raw value type of the \p buffer.
+ * \param[in,out] buffer               Pointer to the device-side buffer
+ * \param[in]     hostBuffer           Pointer to the raw host-side memory, also typed \p ValueType
+ * \param[in]     startingValueIndex   Offset (in values) at the device-side buffer to copy into.
+ * \param[in]     numValues            Number of values to copy.
+ * \param[in]     stream               GPU stream to perform asynchronous copy in.
+ * \param[in]     transferKind         Copy type: synchronous or asynchronous.
+ * \param[out]    timingEvent          A dummy pointer to the H2D copy timing event to be filled in.
+ *                                     Not used in CUDA implementation.
+ */
+template <typename ValueType>
+void copyToDeviceBuffer(DeviceBuffer<ValueType> *buffer,
+                        const ValueType         *hostBuffer,
+                        size_t                   startingValueIndex,
+                        size_t                   numValues,
+                        CommandStream            stream,
+                        GpuApiCallBehavior       transferKind,
+                        CommandEvent             */*timingEvent*/)
+{
+    if (numValues == 0)
+    {
+        return; // such calls are actually made with empty domains
+    }
+    GMX_ASSERT(buffer, "needs a buffer pointer");
+    GMX_ASSERT(hostBuffer, "needs a host buffer pointer");
+    cudaError_t  stat;
+    const size_t bytes = numValues * sizeof(ValueType);
+
+    switch (transferKind)
+    {
+        case GpuApiCallBehavior::Async:
+            GMX_ASSERT(isHostMemoryPinned(hostBuffer), "Source host buffer was not pinned for CUDA");
+            stat = cudaMemcpyAsync(*((ValueType **)buffer) + startingValueIndex, hostBuffer, bytes, cudaMemcpyHostToDevice, stream);
+            GMX_RELEASE_ASSERT(stat == cudaSuccess, "Asynchronous H2D copy failed");
+            break;
+
+        case GpuApiCallBehavior::Sync:
+            stat = cudaMemcpy(*((ValueType **)buffer) + startingValueIndex, hostBuffer, bytes, cudaMemcpyHostToDevice);
+            GMX_RELEASE_ASSERT(stat == cudaSuccess, "Synchronous H2D copy failed");
+            break;
+
+        default:
+            throw;
+    }
+}
+
+#endif
diff --git a/src/gromacs/gpu_utils/devicebuffer.h b/src/gromacs/gpu_utils/devicebuffer.h

new file mode 100644 (file)

index 0000000..bfd0a7f
--- /dev/null
+++ b/src/gromacs/gpu_utils/devicebuffer.h
@@ -0,0 +1,104 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_GPU_UTILS_DEVICEBUFFER_H
+#define GMX_GPU_UTILS_DEVICEBUFFER_H
+
+/*! \libinternal \file
+ *  \brief Implements the logic for handling of DeviceBuffer types in OpenCL/CUDA.
+ *  Can only be included on GPU build paths.
+ *
+ *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ *
+ *  \inlibraryapi
+ */
+
+#include "config.h"
+
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/smalloc.h" // TODO: this is only for over_alloc_large
+
+#if GMX_GPU == GMX_GPU_CUDA
+#include "gromacs/gpu_utils/devicebuffer.cuh"
+#elif GMX_GPU == GMX_GPU_OPENCL
+#include "gromacs/gpu_utils/devicebuffer_ocl.h"
+#else
+// cppcheck-suppress preprocessorErrorDirective
+#error "devicebuffer.h included on non-GPU build!"
+#endif
+
+/*! \brief
+ *  Reallocates the device-side buffer.
+ *
+ *  Reallocates the device-side memory pointed by \p buffer.
+ *  Allocation is buffered and therefore freeing is only needed
+ *  if the previously allocated space is not enough.
+ *  \p currentNumValues and \p currentMaxNumValues are updated.
+ *  TODO: \p currentNumValues, \p currentMaxNumValues, \p context
+ *  should all be encapsulated in a host-side class together with the buffer.
+ *
+ *  \tparam        ValueType            Raw value type of the \p buffer.
+ *  \param[in,out] buffer               Pointer to the device-side buffer
+ *  \param[in]     numValues            Number of values to accomodate.
+ *  \param[in,out] currentNumValues     The pointer to the buffer's number of values.
+ *  \param[in,out] currentMaxNumValues  The pointer to the buffer's capacity.
+ *  \param[in]     context              The buffer's context.
+ */
+template <typename ValueType>
+void reallocateDeviceBuffer(DeviceBuffer<ValueType> *buffer,
+                            size_t                   numValues,
+                            int                     *currentNumValues,
+                            int                     *currentMaxNumValues,
+                            Context                  context)
+{
+    GMX_ASSERT(buffer, "needs a buffer pointer");
+    GMX_ASSERT(currentNumValues, "needs a size pointer");
+    GMX_ASSERT(currentMaxNumValues, "needs a capacity pointer");
+
+    /* reallocate only if the data does not fit */
+    if (static_cast<int>(numValues) > *currentMaxNumValues)
+    {
+        if (*currentMaxNumValues >= 0)
+        {
+            freeDeviceBuffer(buffer);
+        }
+
+        *currentMaxNumValues = over_alloc_large(numValues);
+        allocateDeviceBuffer(buffer, *currentMaxNumValues, context);
+    }
+    /* size could have changed without actual reallocation */
+    *currentNumValues = numValues;
+}
+
+#endif
diff --git a/src/gromacs/gpu_utils/devicebuffer_ocl.h b/src/gromacs/gpu_utils/devicebuffer_ocl.h

new file mode 100644 (file)

index 0000000..fb0fbcc
--- /dev/null
+++ b/src/gromacs/gpu_utils/devicebuffer_ocl.h
@@ -0,0 +1,163 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_GPU_UTILS_DEVICEBUFFER_OCL_H
+#define GMX_GPU_UTILS_DEVICEBUFFER_OCL_H
+
+/*! \libinternal \file
+ *  \brief Implements the DeviceBuffer type and routines for OpenCL.
+ *  Should only be included directly by the main DeviceBuffer file devicebuffer.h.
+ *  TODO: the intent is for DeviceBuffer to become a class.
+ *
+ *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ *
+ *  \inlibraryapi
+ */
+
+#include "gromacs/gpu_utils/gpu_utils.h" //only for GpuApiCallBehavior
+#include "gromacs/gpu_utils/gputraits_ocl.h"
+#include "gromacs/utility/gmxassert.h"
+
+/*! \libinternal \brief
+ * A minimal cl_mem wrapper that remembers its allocation type.
+ * The only point is making template type deduction possible.
+ */
+template<typename ValueType>
+class TypedClMemory
+{
+    private:
+        //! \brief Underlying data - not nulled right here only because we still have some snew()'s around
+        cl_mem data_;
+    public:
+        //! \brief An assignment operator - the purpose is to make allocation/zeroing work
+        void operator=(cl_mem data){data_ = data; }
+        //! \brief Returns underlying cl_mem transparently
+        operator cl_mem() {return data_; }
+};
+
+//! \libinternal \brief A device-side buffer of ValueTypes
+template<typename ValueType>
+using DeviceBuffer = TypedClMemory<ValueType>;
+
+/*! \libinternal \brief
+ * Allocates a device-side buffer.
+ * It is currently a caller's responsibility to call it only on not-yet allocated buffers.
+ *
+ * \tparam        ValueType            Raw value type of the \p buffer.
+ * \param[in,out] buffer               Pointer to the device-side buffer.
+ * \param[in]     numValues            Number of values to accomodate.
+ * \param[in]     context              The buffer's context-to-be.
+ */
+template <typename ValueType>
+void allocateDeviceBuffer(DeviceBuffer<ValueType> *buffer,
+                          size_t                   numValues,
+                          Context                  context)
+{
+    GMX_ASSERT(buffer, "needs a buffer pointer");
+    void  *hostPtr = nullptr;
+    cl_int clError;
+    *buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, numValues * sizeof(ValueType), hostPtr, &clError);
+    GMX_RELEASE_ASSERT(clError == CL_SUCCESS, "clCreateBuffer failure");
+}
+
+/*! \brief
+ * Frees a device-side buffer.
+ * This does not reset separately stored size/capacity integers,
+ * as this is planned to be a destructor of DeviceBuffer as a proper class,
+ * and no calls on \p buffer should be made afterwards.
+ *
+ * \param[in] buffer  Pointer to the buffer to free.
+ */
+template <typename DeviceBuffer>
+void freeDeviceBuffer(DeviceBuffer *buffer)
+{
+    GMX_ASSERT(buffer, "needs a buffer pointer");
+    if (*buffer)
+    {
+        GMX_RELEASE_ASSERT(clReleaseMemObject(*buffer) == CL_SUCCESS, "clReleaseMemObject failed");
+    }
+}
+
+/*! \brief
+ * Performs the host-to-device data copy, synchronous or asynchronously on request.
+ *
+ * TODO: This is meant to gradually replace cu/ocl_copy_h2d.
+ *
+ * \tparam        ValueType            Raw value type of the \p buffer.
+ * \param[in,out] buffer               Pointer to the device-side buffer
+ * \param[in]     hostBuffer           Pointer to the raw host-side memory, also typed \p ValueType
+ * \param[in]     startingValueIndex   Offset (in values) at the device-side buffer to copy into.
+ * \param[in]     numValues            Number of values to copy.
+ * \param[in]     stream               GPU stream to perform asynchronous copy in.
+ * \param[in]     transferKind         Copy type: synchronous or asynchronous.
+ * \param[out]    timingEvent          A pointer to the H2D copy timing event to be filled in.
+ *                                     If the pointer is not null, the event can further be used
+ *                                     to queue a wait for this operation or to query profiling information.
+ */
+template <typename ValueType>
+void copyToDeviceBuffer(DeviceBuffer<ValueType> *buffer,
+                        const ValueType         *hostBuffer,
+                        size_t                   startingValueIndex,
+                        size_t                   numValues,
+                        CommandStream            stream,
+                        GpuApiCallBehavior       transferKind,
+                        CommandEvent            *timingEvent)
+{
+    if (numValues == 0)
+    {
+        return; // such calls are actually made with empty domains
+    }
+    GMX_ASSERT(buffer, "needs a buffer pointer");
+    GMX_ASSERT(hostBuffer, "needs a host buffer pointer");
+    cl_int       clError;
+    const size_t offset = startingValueIndex * sizeof(ValueType);
+    const size_t bytes  = numValues * sizeof(ValueType);
+    switch (transferKind)
+    {
+        case GpuApiCallBehavior::Async:
+            clError = clEnqueueWriteBuffer(stream, *buffer, CL_FALSE, offset, bytes, hostBuffer, 0, nullptr, timingEvent);
+            GMX_RELEASE_ASSERT(clError == CL_SUCCESS, "Asynchronous H2D copy failed");
+            break;
+
+        case GpuApiCallBehavior::Sync:
+            clError = clEnqueueWriteBuffer(stream, *buffer, CL_TRUE, offset, bytes, hostBuffer, 0, nullptr, timingEvent);
+            GMX_RELEASE_ASSERT(clError == CL_SUCCESS, "Synchronous H2D copy failed");
+            break;
+
+        default:
+            throw;
+    }
+}
+
+#endif
diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu

index 7660d7cc249315dd5b2571a89c01ab862ae106c0..c5e367884dca489e5aa38b9d4ec1412d2e38e625 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gpu_utils/gpu_utils.cu
@@ -137,7 +137,7 @@ static void checkCompiledTargetCompatibility(const gmx_device_info_t *devInfo)
      }
  }
  
-bool isHostMemoryPinned(void *h_ptr)
+bool isHostMemoryPinned(const void *h_ptr)
  {
      cudaPointerAttributes memoryAttributes;
      cudaError_t           stat = cudaPointerGetAttributes(&memoryAttributes, h_ptr);
diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h

index 9634c380343e97ea0b0531dbcdf9c70627f78bb0..35921e84465d9eb13f45da387b6fe26eaa6927d9 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gpu_utils/gpu_utils.h
@@ -271,6 +271,6 @@ void stopGpuProfiler(void) CUDA_FUNC_TERM
  
  //! Tells whether the host buffer was pinned for non-blocking transfers. Only implemented for CUDA.
  CUDA_FUNC_QUALIFIER
-bool isHostMemoryPinned(void *CUDA_FUNC_ARGUMENT(h_ptr)) CUDA_FUNC_TERM_WITH_RETURN(false)
+bool isHostMemoryPinned(const void *CUDA_FUNC_ARGUMENT(h_ptr)) CUDA_FUNC_TERM_WITH_RETURN(false)
  
  #endif
diff --git a/src/gromacs/gpu_utils/gpuregiontimer.cuh b/src/gromacs/gpu_utils/gpuregiontimer.cuh

index 17321f7c62381f964033759a47c9b6af6ea774d5..393bd7f7263b1583ce23534c76c0176f3dcb28a6 100644 (file)
--- a/src/gromacs/gpu_utils/gpuregiontimer.cuh
+++ b/src/gromacs/gpu_utils/gpuregiontimer.cuh
@@ -102,6 +102,15 @@ class GpuRegionTimerImpl
  
          /*! \brief Resets internal state */
          inline void reset(){}
+
+        /*! \brief Returns a new raw timing event
+         * for passing into individual GPU API calls.
+         * This is just a dummy in CUDA.
+         */
+        inline CommandEvent *fetchNextEvent()
+        {
+            return nullptr;
+        }
  };
  
  //! Short-hand for external use
diff --git a/src/gromacs/gpu_utils/gputraits.cuh b/src/gromacs/gpu_utils/gputraits.cuh

index 9e96830a202949635af0eeac6aefc70d060386d7..323f80eb9ad087a42909e47739313e6c51ae7760 100644 (file)
--- a/src/gromacs/gpu_utils/gputraits.cuh
+++ b/src/gromacs/gpu_utils/gputraits.cuh
@@ -46,8 +46,7 @@
  using CommandStream = cudaStream_t;
  //! \brief Single GPU call timing event - meaningless in CUDA
  using CommandEvent  = void;
-//! \brief A device-side buffer of ValueTypes
-template<typename ValueType>
-using DeviceBuffer  = ValueType *;
+//! \brief Context used explicitly in OpenCL, does nothing in CUDA
+using Context       = void *;
  
  #endif
diff --git a/src/gromacs/gpu_utils/gputraits_ocl.h b/src/gromacs/gpu_utils/gputraits_ocl.h

index 9d8575788d77cfe9c93d557f0b14a3f54cb638c9..adf6f9508f6f43fc144e126e78f5dc95932355e0 100644 (file)
--- a/src/gromacs/gpu_utils/gputraits_ocl.h
+++ b/src/gromacs/gpu_utils/gputraits_ocl.h
@@ -48,8 +48,7 @@
  using CommandStream = cl_command_queue;
  //! \brief Single GPU call timing event
  using CommandEvent  = cl_event;
-//! \brief A device-side buffer of ValueTypes
-template<typename ValueType>
-using DeviceBuffer  = cl_mem;
+//! \brief Context used explicitly in OpenCL
+using Context       = cl_context;
  
  #endif
diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h

index 093fe31e19f61fc708bf595278742209955c969b..cc71d888f4e17d672d4a84e475b9ddea4e4ad7ae 100644 (file)
--- a/src/gromacs/gpu_utils/oclutils.h
+++ b/src/gromacs/gpu_utils/oclutils.h
@@ -177,21 +177,4 @@ static inline bool haveStreamTasksCompleted(cl_command_queue gmx_unused s)
      return false;
  }
  
-/*! \brief Free a device-side buffer.
- * This does not reset separately stored size/capacity integers,
- * as this is planned to be a destructor of DeviceBuffer as a proper class,
- * and no calls on \p buffer should be made afterwards.
- *
- * \param[in] buffer  Pointer to the buffer to free.
- */
-template <typename DeviceBuffer>
-void freeDeviceBuffer(DeviceBuffer *buffer)
-{
-    GMX_ASSERT(buffer, "needs a buffer pointer");
-    if (*buffer)
-    {
-        GMX_RELEASE_ASSERT(clReleaseMemObject(*buffer) == CL_SUCCESS, "clReleaseMemObject failed");
-    }
-}
-
  #endif
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu

index ea9b3b8fe4e616059cfe05c793f7c6a8167d9f84..e6c8e531adca000842f0d55915e7ca992ffd97ce 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -557,26 +557,28 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t       *nb,
          nb->timers->didPairlistH2D[iloc] = true;
      }
  
-    cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
-                        &d_plist->nsci, &d_plist->sci_nalloc,
-                        h_plist->nsci,
-                        stream, true);
-
-    cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
-                        &d_plist->ncj4, &d_plist->cj4_nalloc,
-                        h_plist->ncj4,
-                        stream, true);
-
-    /* this call only allocates space on the device (no data is transferred) */
-    cu_realloc_buffered((void **)&d_plist->imask, NULL, sizeof(*d_plist->imask),
-                        &d_plist->nimask, &d_plist->imask_nalloc,
-                        h_plist->ncj4*c_nbnxnGpuClusterpairSplit,
-                        stream, true);
-
-    cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
-                        &d_plist->nexcl, &d_plist->excl_nalloc,
-                        h_plist->nexcl,
-                        stream, true);
+    Context context = nullptr;
+
+    reallocateDeviceBuffer(&d_plist->sci, h_plist->nsci,
+                           &d_plist->nsci, &d_plist->sci_nalloc, context);
+    copyToDeviceBuffer(&d_plist->sci, h_plist->sci, 0, h_plist->nsci,
+                       stream, GpuApiCallBehavior::Async,
+                       bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+
+    reallocateDeviceBuffer(&d_plist->cj4, h_plist->ncj4,
+                           &d_plist->ncj4, &d_plist->cj4_nalloc, context);
+    copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4, 0, h_plist->ncj4,
+                       stream, GpuApiCallBehavior::Async,
+                       bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+
+    reallocateDeviceBuffer(&d_plist->imask, h_plist->ncj4*c_nbnxnGpuClusterpairSplit,
+                           &d_plist->nimask, &d_plist->imask_nalloc, context);
+
+    reallocateDeviceBuffer(&d_plist->excl, h_plist->nexcl,
+                           &d_plist->nexcl, &d_plist->excl_nalloc, context);
+    copyToDeviceBuffer(&d_plist->excl, h_plist->excl, 0, h_plist->nexcl,
+                       stream, GpuApiCallBehavior::Async,
+                       bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
  
      if (bDoTime)
      {
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h

index c155d5bdba6bc02ec97f6cd9ebb5219868f54994..97e83e87cb7303b608852d81020ccbcb00725737 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -48,6 +48,8 @@
  
  #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
  #include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/gputraits.cuh"
  #include "gromacs/mdlib/nbnxn_consts.h"
  #include "gromacs/mdlib/nbnxn_gpu_types_common.h"
  #include "gromacs/mdlib/nbnxn_pairlist.h"
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp

index cb11376666ab5ad12ba61e2f3ad945742403b7d4..32ebc57a4e1bebc369fda1e84d47ca9daf1ad9d7 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
@@ -94,69 +94,6 @@ bool useLjCombRule(int vdwType)
              vdwType == evdwOclCUTCOMBLB);
  }
  
-/*! \brief Reallocation device buffers
- *
- *  Reallocation of the memory pointed by d_ptr and copying of the data from
- *  the location pointed by h_src host-side pointer is done. Allocation is
- *  buffered and therefore freeing is only needed if the previously allocated
- *  space is not enough.
- *  The H2D copy is launched in command queue s and can be done synchronously or
- *  asynchronously (the default is the latter).
- *  If copy_event is not NULL, on return it will contain an event object
- *  identifying the H2D copy. The event can further be used to queue a wait
- *  for this operation or to query profiling information.
- *  OpenCL equivalent of cu_realloc_buffered.
- */
-static void ocl_realloc_buffered(cl_mem *d_dest, void *h_src,
-                                 size_t type_size,
-                                 int *curr_size, int *curr_alloc_size,
-                                 int req_size,
-                                 cl_context context,
-                                 cl_command_queue s,
-                                 bool bAsync = true,
-                                 cl_event *copy_event = NULL)
-{
-    if (d_dest == NULL || req_size < 0)
-    {
-        return;
-    }
-
-    /* reallocate only if the data does not fit = allocation size is smaller
-       than the current requested size */
-    if (req_size > *curr_alloc_size)
-    {
-        cl_int gmx_unused cl_error;
-
-        /* only free if the array has already been initialized */
-        if (*curr_alloc_size >= 0)
-        {
-            freeDeviceBuffer(d_dest);
-        }
-
-        *curr_alloc_size = over_alloc_large(req_size);
-
-        *d_dest = clCreateBuffer(context, CL_MEM_READ_WRITE, *curr_alloc_size * type_size, NULL, &cl_error);
-        assert(cl_error == CL_SUCCESS);
-        // TODO: handle errors, check clCreateBuffer flags
-    }
-
-    /* size could have changed without actual reallocation */
-    *curr_size = req_size;
-
-    /* upload to device */
-    if (h_src)
-    {
-        if (bAsync)
-        {
-            ocl_copy_H2D_async(*d_dest, h_src, 0, *curr_size * type_size, s, copy_event);
-        }
-        else
-        {
-            ocl_copy_H2D_sync(*d_dest, h_src,  0, *curr_size * type_size, s);
-        }
-    }
-}
-
  /*! \brief Tabulates the Ewald Coulomb force and initializes the size/scale
   * and the table GPU array.
   *
@@ -886,30 +823,29 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t        *nb,
          nb->timers->didPairlistH2D[iloc] = true;
      }
  
-    ocl_realloc_buffered(&d_plist->sci, h_plist->sci, sizeof(nbnxn_sci_t),
-                         &d_plist->nsci, &d_plist->sci_nalloc,
-                         h_plist->nsci,
-                         nb->dev_rundata->context,
-                         stream, true, bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
-
-    ocl_realloc_buffered(&d_plist->cj4, h_plist->cj4, sizeof(nbnxn_cj4_t),
-                         &d_plist->ncj4, &d_plist->cj4_nalloc,
-                         h_plist->ncj4,
-                         nb->dev_rundata->context,
-                         stream, true, bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
-
-    /* this call only allocates space on the device (no data is transferred) - no timing as well! */
-    ocl_realloc_buffered(&d_plist->imask, NULL, sizeof(unsigned int),
-                         &d_plist->nimask, &d_plist->imask_nalloc,
-                         h_plist->ncj4*c_nbnxnGpuClusterpairSplit,
-                         nb->dev_rundata->context,
-                         stream, true);
-
-    ocl_realloc_buffered(&d_plist->excl, h_plist->excl, sizeof(nbnxn_excl_t),
-                         &d_plist->nexcl, &d_plist->excl_nalloc,
-                         h_plist->nexcl,
-                         nb->dev_rundata->context,
-                         stream, true, bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+    // TODO most of this function is same in CUDA and OpenCL, move into the header
+    Context context = nb->dev_rundata->context;
+
+    reallocateDeviceBuffer(&d_plist->sci, h_plist->nsci,
+                           &d_plist->nsci, &d_plist->sci_nalloc, context);
+    copyToDeviceBuffer(&d_plist->sci, h_plist->sci, 0, h_plist->nsci,
+                       stream, GpuApiCallBehavior::Async,
+                       bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+
+    reallocateDeviceBuffer(&d_plist->cj4, h_plist->ncj4,
+                           &d_plist->ncj4, &d_plist->cj4_nalloc, context);
+    copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4, 0, h_plist->ncj4,
+                       stream, GpuApiCallBehavior::Async,
+                       bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+
+    reallocateDeviceBuffer(&d_plist->imask, h_plist->ncj4*c_nbnxnGpuClusterpairSplit,
+                           &d_plist->nimask, &d_plist->imask_nalloc, context);
+
+    reallocateDeviceBuffer(&d_plist->excl, h_plist->nexcl,
+                           &d_plist->nexcl, &d_plist->excl_nalloc, context);
+    copyToDeviceBuffer(&d_plist->excl, h_plist->excl, 0, h_plist->nexcl,
+                       stream, GpuApiCallBehavior::Async,
+                       bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
  
      if (bDoTime)
      {
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h

index 6b0ee889760e1869d12d69795f564622dc90e5b0..45bff6cd52f3a45b8dfcb29944ab1ac01767b09e 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
@@ -45,6 +45,7 @@
  #ifndef NBNXN_OPENCL_TYPES_H
  #define NBNXN_OPENCL_TYPES_H
  
+#include "gromacs/gpu_utils/devicebuffer.h"
  #include "gromacs/gpu_utils/gmxopencl.h"
  #include "gromacs/gpu_utils/gputraits_ocl.h"
  #include "gromacs/gpu_utils/oclutils.h"
author	Aleksei Iupinov <a.yupinov@gmail.com>
	Tue, 13 Feb 2018 12:08:13 +0000 (13:08 +0100)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Mon, 23 Apr 2018 09:22:47 +0000 (11:22 +0200)
src/gromacs/ewald/pme.cu		patch \| blob \| history
src/gromacs/gpu_utils/cudautils.cu		patch \| blob \| history
src/gromacs/gpu_utils/cudautils.cuh		patch \| blob \| history
src/gromacs/gpu_utils/devicebuffer.cuh	[new file with mode: 0644]	patch \| blob
src/gromacs/gpu_utils/devicebuffer.h	[new file with mode: 0644]	patch \| blob
src/gromacs/gpu_utils/devicebuffer_ocl.h	[new file with mode: 0644]	patch \| blob
src/gromacs/gpu_utils/gpu_utils.cu		patch \| blob \| history
src/gromacs/gpu_utils/gpu_utils.h		patch \| blob \| history
src/gromacs/gpu_utils/gpuregiontimer.cuh		patch \| blob \| history
src/gromacs/gpu_utils/gputraits.cuh		patch \| blob \| history
src/gromacs/gpu_utils/gputraits_ocl.h		patch \| blob \| history
src/gromacs/gpu_utils/oclutils.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h		patch \| blob \| history