Add GpuEventSynchronizer class

author Aleksei Iupinov <a.yupinov@gmail.com>

Mon, 16 Apr 2018 15:54:07 +0000 (17:54 +0200)

committer Aleksei Iupinov <a.yupinov@gmail.com>

Thu, 31 May 2018 09:17:35 +0000 (11:17 +0200)
author Aleksei Iupinov <a.yupinov@gmail.com>
Mon, 16 Apr 2018 15:54:07 +0000 (17:54 +0200)
committer Aleksei Iupinov <a.yupinov@gmail.com>
Thu, 31 May 2018 09:17:35 +0000 (11:17 +0200)
diff --git a/src/gromacs/ewald/pme-gpu-internal.cpp b/src/gromacs/ewald/pme-gpu-internal.cpp

index c3fc34281b73d8bcd0683e4b6f6fa3e0c98f4529..17a792eaaf0ba2affa545eadce65a60b1726e81a 100644 (file)
--- a/src/gromacs/ewald/pme-gpu-internal.cpp
+++ b/src/gromacs/ewald/pme-gpu-internal.cpp
@@ -241,7 +241,6 @@ static void pme_gpu_init(gmx_pme_t          *pme,
      pmeGpu->programHandle_ = pmeGpuProgram;
  
      pme_gpu_init_internal(pmeGpu);
-    pme_gpu_init_sync_events(pmeGpu);
      pme_gpu_alloc_energy_virial(pmeGpu);
  
      pme_gpu_copy_common_data_from(pme);
@@ -373,7 +372,6 @@ void pme_gpu_destroy(PmeGpu *pmeGpu)
      pme_gpu_free_grids(pmeGpu);
  
      pme_gpu_destroy_3dfft(pmeGpu);
-    pme_gpu_destroy_sync_events(pmeGpu);
  
      /* Free the GPU-framework specific data last */
      pme_gpu_destroy_specific(pmeGpu);
diff --git a/src/gromacs/ewald/pme-gpu-internal.h b/src/gromacs/ewald/pme-gpu-internal.h

index 4a3549a265b6e104340e1583036ce56bed49c9d6..ccd2797b049e9474b495370e5520bf7531cecaac 100644 (file)
--- a/src/gromacs/ewald/pme-gpu-internal.h
+++ b/src/gromacs/ewald/pme-gpu-internal.h
@@ -344,20 +344,6 @@ void pme_gpu_init_internal(PmeGpu *pmeGpu);
   */
  void pme_gpu_destroy_specific(const PmeGpu *pmeGpu);
  
-/*! \libinternal \brief
- * Initializes the PME GPU synchronization events.
- *
- * \param[in] pmeGpu  The PME GPU structure.
- */
-void pme_gpu_init_sync_events(const PmeGpu *pmeGpu);
-
-/*! \libinternal \brief
- * Destroys the PME GPU synchronization events.
- *
- * \param[in] pmeGpu  The PME GPU structure.
- */
-void pme_gpu_destroy_sync_events(const PmeGpu *pmeGpu);
-
  /*! \libinternal \brief
   * Initializes the CUDA FFT structures.
   *
diff --git a/src/gromacs/ewald/pme.cu b/src/gromacs/ewald/pme.cu

index 415982f319cee043956f651e80022259ecfd07fa..5ba1e4fdc7068abb5f7ef076578f0de39d8aacce 100644 (file)
--- a/src/gromacs/ewald/pme.cu
+++ b/src/gromacs/ewald/pme.cu
@@ -384,8 +384,7 @@ void pme_gpu_copy_output_spread_grid(const PmeGpu *pmeGpu, float *h_grid)
      copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid,
                           0, pmeGpu->archSpecific->realGridSize,
                           pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
-    cudaError_t  stat = cudaEventRecord(pmeGpu->archSpecific->syncSpreadGridD2H, pmeGpu->archSpecific->pmeStream);
-    CU_RET_ERR(stat, "PME spread grid sync event record failure");
+    pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream);
  }
  
  void pme_gpu_copy_output_spread_atom_data(const PmeGpu *pmeGpu)
@@ -434,8 +433,7 @@ void pme_gpu_copy_input_gather_atom_data(const PmeGpu *pmeGpu)
  
  void pme_gpu_sync_spread_grid(const PmeGpu *pmeGpu)
  {
-    cudaError_t stat = cudaEventSynchronize(pmeGpu->archSpecific->syncSpreadGridD2H);
-    CU_RET_ERR(stat, "Error while waiting for the PME GPU spread grid to be copied to the host");
+    pmeGpu->archSpecific->syncSpreadGridD2H.waitForEvent();
  }
  
  void pme_gpu_init_internal(PmeGpu *pmeGpu)
@@ -482,17 +480,6 @@ void pme_gpu_destroy_specific(const PmeGpu *pmeGpu)
      CU_RET_ERR(stat, "PME cudaStreamDestroy error");
  }
  
-void pme_gpu_init_sync_events(const PmeGpu *pmeGpu)
-{
-    const auto  eventFlags = cudaEventDisableTiming;
-    CU_RET_ERR(cudaEventCreateWithFlags(&pmeGpu->archSpecific->syncSpreadGridD2H, eventFlags), "cudaEventCreate on syncSpreadGridD2H failed");
-}
-
-void pme_gpu_destroy_sync_events(const PmeGpu *pmeGpu)
-{
-    CU_RET_ERR(cudaEventDestroy(pmeGpu->archSpecific->syncSpreadGridD2H), "cudaEventDestroy failed on syncSpreadGridD2H");
-}
-
  void pme_gpu_reinit_3dfft(const PmeGpu *pmeGpu)
  {
      if (pme_gpu_performs_FFT(pmeGpu))
diff --git a/src/gromacs/ewald/pme.cuh b/src/gromacs/ewald/pme.cuh

index bff9164e0bce571ec40fe2417a5fdeecfef4f1b1..42ebb1463563fa0c172a83d253840ae99ef22ad6 100644 (file)
--- a/src/gromacs/ewald/pme.cuh
+++ b/src/gromacs/ewald/pme.cuh
@@ -50,6 +50,8 @@
  #include <array>
  #include <set>
  
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+
  #include "pme-gpu-constants.h"
  #include "pme-gpu-internal.h"
  #include "pme-gpu-types.h"
@@ -166,7 +168,7 @@ struct PmeGpuCuda
  
      /* Synchronization events */
      /*! \brief Triggered after the grid has been copied to the host (after the spreading stage). */
-    cudaEvent_t syncSpreadGridD2H;
+    GpuEventSynchronizer syncSpreadGridD2H;
  
      // TODO: consider moving some things below into the non-CUDA struct.
  
diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer.cuh b/src/gromacs/gpu_utils/gpueventsynchronizer.cuh

new file mode 100644 (file)

index 0000000..c7495d1
--- /dev/null
+++ b/src/gromacs/gpu_utils/gpueventsynchronizer.cuh
@@ -0,0 +1,99 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *  \brief Implements a GpuEventSynchronizer class for CUDA.
+ *
+ *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ *  \inlibraryapi
+ */
+#ifndef GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_CUH
+#define GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_CUH
+
+#include "gromacs/gpu_utils/gputraits.cuh"
+#include "gromacs/utility/gmxassert.h"
+
+/*! \libinternal \brief
+ * A class which allows for CPU thread to mark and wait for certain GPU stream execution point.
+ * The event can be put into the stream with markEvent() and then later waited on with waitForEvent().
+ * This can be repeated as necessary, but the current implementation does not allow waiting on
+ * completed event more than once, expecting only exact pairs of markEvent(stream); waitForEvent().
+ * The class generally attempts to track the correctness of its state transitions, but
+ * please note that calling waitForEvent() right after the construction will succeed with CUDA but fail with OpenCL.
+ *
+ * Another possible mode of operation can be implemented if needed:
+ * multiple calls to waitForEvent() after a single markEvent().
+ * For this, only some small alterations to gpueventsynchronizer_ocl.h need to be made.
+ * This was tested to work both with CUDA and NVidia OpenCL, but not with AMD/Intel OpenCL.
+ */
+class GpuEventSynchronizer
+{
+    public:
+        GpuEventSynchronizer()
+        {
+            cudaError_t stat = cudaEventCreateWithFlags(&event_, cudaEventDisableTiming);
+            GMX_RELEASE_ASSERT(stat == cudaSuccess, "cudaEventCreate failed");
+        }
+        ~GpuEventSynchronizer()
+        {
+            cudaError_t stat = cudaEventDestroy(event_);
+            GMX_ASSERT(stat == cudaSuccess, "cudaEventDestroy failed");
+        }
+        //! No copying
+        GpuEventSynchronizer(const GpuEventSynchronizer &)       = delete;
+        //! No assignment
+        GpuEventSynchronizer &operator = (GpuEventSynchronizer &&) = delete;
+        //! Moving is disabled but can be considered in the future if needed
+        GpuEventSynchronizer(GpuEventSynchronizer &&)            = delete;
+
+        /*! \brief Marks the synchronization point in the \p stream.
+         * Should be followed by waitForEvent().
+         */
+        inline void markEvent(CommandStream stream)
+        {
+            cudaError_t stat = cudaEventRecord(event_, stream);
+            GMX_ASSERT(stat == cudaSuccess, "cudaEventRecord failed");
+        }
+        /*! \brief Synchronizes the host thread on the marked event. */
+        inline void waitForEvent()
+        {
+            cudaError_t stat = cudaEventSynchronize(event_);
+            GMX_ASSERT(stat == cudaSuccess, "cudaEventSynchronize failed");
+        }
+
+    private:
+        cudaEvent_t event_;
+};
+
+#endif
diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h b/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h

new file mode 100644 (file)

index 0000000..3f31384
--- /dev/null
+++ b/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h
@@ -0,0 +1,120 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *  \brief Implements a GpuEventSynchronizer class for OpenCL.
+ *
+ *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \inlibraryapi
+ */
+#ifndef GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_OCL_H
+#define GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_OCL_H
+
+#include "gromacs/gpu_utils/gputraits_ocl.h"
+#include "gromacs/gpu_utils/oclutils.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+
+/*! \libinternal \brief
+ * A class which allows for CPU thread to mark and wait for certain GPU stream execution point.
+ * The event can be put into the stream with markEvent() and then later waited on with waitForEvent().
+ * This can be repeated as necessary, but the current implementation does not allow waiting on
+ * completed event more than once, expecting only exact pairs of markEvent(stream); waitForEvent().
+ * The class generally attempts to track the correctness of its state transitions, but
+ * please note that calling waitForEvent() right after the construction will fail with OpenCL but succeed with CUDA.
+ *
+ * Another possible mode of operation can be implemented if needed:
+ * multiple calls to waitForEvent() after a single markEvent(). For this, clReleaseEvent() call
+ * from waitForEvent() should instead happen conditionally at the beginning of markEvent(), replacing
+ * the GMX_ASSERT(). This was tested to work both with CUDA and NVidia OpenCL, but not with AMD/Intel OpenCl.
+ */
+class GpuEventSynchronizer
+{
+    public:
+        //! A constructor
+        GpuEventSynchronizer() : event_(nullptr){}
+        //! A destructor
+        ~GpuEventSynchronizer()
+        {
+            // This additional code only prevents cl_event leak in an unlikely situation of destructor
+            // being called after markEvent() but before waitForEvent().
+            if (event_)
+            {
+                ensureReferenceCount(event_, 1);
+                clReleaseEvent(event_);
+            }
+        }
+        //! No copying
+        GpuEventSynchronizer(const GpuEventSynchronizer &)       = delete;
+        //! No assignment
+        GpuEventSynchronizer &operator=(GpuEventSynchronizer &&) = delete;
+        //! Moving is disabled but can be considered in the future if needed
+        GpuEventSynchronizer(GpuEventSynchronizer &&)            = delete;
+
+        /*! \brief Marks the synchronization point in the \p stream.
+         * Should be called first and then followed by waitForEvent().
+         */
+        inline void markEvent(CommandStream stream)
+        {
+            GMX_ASSERT(nullptr == event_, "Do not call markEvent more than once!");
+            cl_int clError = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &event_);
+            if (CL_SUCCESS != clError)
+            {
+                GMX_THROW(gmx::InternalError("Failed to enqueue the GPU synchronization event: " + ocl_get_error_string(clError)));
+            }
+        }
+        /*! \brief Synchronizes the host thread on the marked event. */
+        inline void waitForEvent()
+        {
+            cl_int clError = clWaitForEvents(1, &event_);
+            if (CL_SUCCESS != clError)
+            {
+                GMX_THROW(gmx::InternalError("Failed to synchronize on the GPU event: " + ocl_get_error_string(clError)));
+            }
+
+            // Reference count can't be checked after the event's released, it seems (segfault on NVIDIA).
+            ensureReferenceCount(event_, 1);
+            clError = clReleaseEvent(event_);
+            if (CL_SUCCESS != clError)
+            {
+                GMX_THROW(gmx::InternalError("Failed to release the GPU event: " + ocl_get_error_string(clError)));
+            }
+            event_ = nullptr;
+        }
+
+    private:
+        cl_event event_;
+};
+
+#endif
diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h

index ef7de9aa54dc60ac84e24ab651cdd0ef83d748b3..5ed6617cdf48c4b15257e51789094d836f666c87 100644 (file)
--- a/src/gromacs/gpu_utils/oclutils.h
+++ b/src/gromacs/gpu_utils/oclutils.h
@@ -167,6 +167,19 @@ static inline void gpuStreamSynchronize(cl_command_queue s)
                         ("Error caught during clFinish:" + ocl_get_error_string(cl_error)).c_str());
  }
  
+//! A debug checker to track cl_events being released correctly
+inline void ensureReferenceCount(const cl_event &event, unsigned int refCount)
+{
+#ifndef NDEBUG
+    cl_int clError = clGetEventInfo(event, CL_EVENT_REFERENCE_COUNT, sizeof(refCount), &refCount, nullptr);
+    GMX_ASSERT(CL_SUCCESS == clError, ocl_get_error_string(clError).c_str());
+    GMX_ASSERT(refCount == refCount, "Unexpected reference count");
+#else
+    GMX_UNUSED_VALUE(event);
+    GMX_UNUSED_VALUE(refCount);
+#endif
+}
+
  /*! \brief Pretend to synchronize an OpenCL stream (dummy implementation).
   *
   * \param[in] s queue to check
author	Aleksei Iupinov <a.yupinov@gmail.com>
	Mon, 16 Apr 2018 15:54:07 +0000 (17:54 +0200)
committer	Aleksei Iupinov <a.yupinov@gmail.com>
	Thu, 31 May 2018 09:17:35 +0000 (11:17 +0200)
src/gromacs/ewald/pme-gpu-internal.cpp		patch \| blob \| history
src/gromacs/ewald/pme-gpu-internal.h		patch \| blob \| history
src/gromacs/ewald/pme.cu		patch \| blob \| history
src/gromacs/ewald/pme.cuh		patch \| blob \| history
src/gromacs/gpu_utils/gpueventsynchronizer.cuh	[new file with mode: 0644]	patch \| blob
src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h	[new file with mode: 0644]	patch \| blob
src/gromacs/gpu_utils/oclutils.h		patch \| blob \| history