pmeGpu->programHandle_ = pmeGpuProgram;
pme_gpu_init_internal(pmeGpu);
- pme_gpu_init_sync_events(pmeGpu);
pme_gpu_alloc_energy_virial(pmeGpu);
pme_gpu_copy_common_data_from(pme);
pme_gpu_free_grids(pmeGpu);
pme_gpu_destroy_3dfft(pmeGpu);
- pme_gpu_destroy_sync_events(pmeGpu);
/* Free the GPU-framework specific data last */
pme_gpu_destroy_specific(pmeGpu);
*/
void pme_gpu_destroy_specific(const PmeGpu *pmeGpu);
-/*! \libinternal \brief
- * Initializes the PME GPU synchronization events.
- *
- * \param[in] pmeGpu The PME GPU structure.
- */
-void pme_gpu_init_sync_events(const PmeGpu *pmeGpu);
-
-/*! \libinternal \brief
- * Destroys the PME GPU synchronization events.
- *
- * \param[in] pmeGpu The PME GPU structure.
- */
-void pme_gpu_destroy_sync_events(const PmeGpu *pmeGpu);
-
/*! \libinternal \brief
* Initializes the CUDA FFT structures.
*
copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid,
0, pmeGpu->archSpecific->realGridSize,
pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
- cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncSpreadGridD2H, pmeGpu->archSpecific->pmeStream);
- CU_RET_ERR(stat, "PME spread grid sync event record failure");
+ pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_copy_output_spread_atom_data(const PmeGpu *pmeGpu)
void pme_gpu_sync_spread_grid(const PmeGpu *pmeGpu)
{
- cudaError_t stat = cudaEventSynchronize(pmeGpu->archSpecific->syncSpreadGridD2H);
- CU_RET_ERR(stat, "Error while waiting for the PME GPU spread grid to be copied to the host");
+ pmeGpu->archSpecific->syncSpreadGridD2H.waitForEvent();
}
void pme_gpu_init_internal(PmeGpu *pmeGpu)
CU_RET_ERR(stat, "PME cudaStreamDestroy error");
}
-void pme_gpu_init_sync_events(const PmeGpu *pmeGpu)
-{
- const auto eventFlags = cudaEventDisableTiming;
- CU_RET_ERR(cudaEventCreateWithFlags(&pmeGpu->archSpecific->syncSpreadGridD2H, eventFlags), "cudaEventCreate on syncSpreadGridD2H failed");
-}
-
-void pme_gpu_destroy_sync_events(const PmeGpu *pmeGpu)
-{
- CU_RET_ERR(cudaEventDestroy(pmeGpu->archSpecific->syncSpreadGridD2H), "cudaEventDestroy failed on syncSpreadGridD2H");
-}
-
void pme_gpu_reinit_3dfft(const PmeGpu *pmeGpu)
{
if (pme_gpu_performs_FFT(pmeGpu))
#include <array>
#include <set>
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+
#include "pme-gpu-constants.h"
#include "pme-gpu-internal.h"
#include "pme-gpu-types.h"
/* Synchronization events */
/*! \brief Triggered after the grid has been copied to the host (after the spreading stage). */
- cudaEvent_t syncSpreadGridD2H;
+ GpuEventSynchronizer syncSpreadGridD2H;
// TODO: consider moving some things below into the non-CUDA struct.
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ * \brief Implements a GpuEventSynchronizer class for CUDA.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \inlibraryapi
+ */
+#ifndef GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_CUH
+#define GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_CUH
+
+#include "gromacs/gpu_utils/gputraits.cuh"
+#include "gromacs/utility/gmxassert.h"
+
+/*! \libinternal \brief
+ * A class which allows for CPU thread to mark and wait for certain GPU stream execution point.
+ * The event can be put into the stream with markEvent() and then later waited on with waitForEvent().
+ * This can be repeated as necessary, but the current implementation does not allow waiting on
+ * completed event more than once, expecting only exact pairs of markEvent(stream); waitForEvent().
+ * The class generally attempts to track the correctness of its state transitions, but
+ * please note that calling waitForEvent() right after the construction will succeed with CUDA but fail with OpenCL.
+ *
+ * Another possible mode of operation can be implemented if needed:
+ * multiple calls to waitForEvent() after a single markEvent().
+ * For this, only some small alterations to gpueventsynchronizer_ocl.h need to be made.
+ * This was tested to work both with CUDA and NVidia OpenCL, but not with AMD/Intel OpenCL.
+ */
+class GpuEventSynchronizer
+{
+ public:
+ GpuEventSynchronizer()
+ {
+ cudaError_t stat = cudaEventCreateWithFlags(&event_, cudaEventDisableTiming);
+ GMX_RELEASE_ASSERT(stat == cudaSuccess, "cudaEventCreate failed");
+ }
+ ~GpuEventSynchronizer()
+ {
+ cudaError_t stat = cudaEventDestroy(event_);
+ GMX_ASSERT(stat == cudaSuccess, "cudaEventDestroy failed");
+ }
+ //! No copying
+ GpuEventSynchronizer(const GpuEventSynchronizer &) = delete;
+ //! No assignment
+ GpuEventSynchronizer &operator = (GpuEventSynchronizer &&) = delete;
+ //! Moving is disabled but can be considered in the future if needed
+ GpuEventSynchronizer(GpuEventSynchronizer &&) = delete;
+
+ /*! \brief Marks the synchronization point in the \p stream.
+ * Should be followed by waitForEvent().
+ */
+ inline void markEvent(CommandStream stream)
+ {
+ cudaError_t stat = cudaEventRecord(event_, stream);
+ GMX_ASSERT(stat == cudaSuccess, "cudaEventRecord failed");
+ }
+ /*! \brief Synchronizes the host thread on the marked event. */
+ inline void waitForEvent()
+ {
+ cudaError_t stat = cudaEventSynchronize(event_);
+ GMX_ASSERT(stat == cudaSuccess, "cudaEventSynchronize failed");
+ }
+
+ private:
+ cudaEvent_t event_;
+};
+
+#endif
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ * \brief Implements a GpuEventSynchronizer class for OpenCL.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \inlibraryapi
+ */
+#ifndef GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_OCL_H
+#define GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_OCL_H
+
+#include "gromacs/gpu_utils/gputraits_ocl.h"
+#include "gromacs/gpu_utils/oclutils.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+
+/*! \libinternal \brief
+ * A class which allows for CPU thread to mark and wait for certain GPU stream execution point.
+ * The event can be put into the stream with markEvent() and then later waited on with waitForEvent().
+ * This can be repeated as necessary, but the current implementation does not allow waiting on
+ * completed event more than once, expecting only exact pairs of markEvent(stream); waitForEvent().
+ * The class generally attempts to track the correctness of its state transitions, but
+ * please note that calling waitForEvent() right after the construction will fail with OpenCL but succeed with CUDA.
+ *
+ * Another possible mode of operation can be implemented if needed:
+ * multiple calls to waitForEvent() after a single markEvent(). For this, clReleaseEvent() call
+ * from waitForEvent() should instead happen conditionally at the beginning of markEvent(), replacing
+ * the GMX_ASSERT(). This was tested to work both with CUDA and NVidia OpenCL, but not with AMD/Intel OpenCl.
+ */
+class GpuEventSynchronizer
+{
+ public:
+ //! A constructor
+ GpuEventSynchronizer() : event_(nullptr){}
+ //! A destructor
+ ~GpuEventSynchronizer()
+ {
+ // This additional code only prevents cl_event leak in an unlikely situation of destructor
+ // being called after markEvent() but before waitForEvent().
+ if (event_)
+ {
+ ensureReferenceCount(event_, 1);
+ clReleaseEvent(event_);
+ }
+ }
+ //! No copying
+ GpuEventSynchronizer(const GpuEventSynchronizer &) = delete;
+ //! No assignment
+ GpuEventSynchronizer &operator=(GpuEventSynchronizer &&) = delete;
+ //! Moving is disabled but can be considered in the future if needed
+ GpuEventSynchronizer(GpuEventSynchronizer &&) = delete;
+
+ /*! \brief Marks the synchronization point in the \p stream.
+ * Should be called first and then followed by waitForEvent().
+ */
+ inline void markEvent(CommandStream stream)
+ {
+ GMX_ASSERT(nullptr == event_, "Do not call markEvent more than once!");
+ cl_int clError = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &event_);
+ if (CL_SUCCESS != clError)
+ {
+ GMX_THROW(gmx::InternalError("Failed to enqueue the GPU synchronization event: " + ocl_get_error_string(clError)));
+ }
+ }
+ /*! \brief Synchronizes the host thread on the marked event. */
+ inline void waitForEvent()
+ {
+ cl_int clError = clWaitForEvents(1, &event_);
+ if (CL_SUCCESS != clError)
+ {
+ GMX_THROW(gmx::InternalError("Failed to synchronize on the GPU event: " + ocl_get_error_string(clError)));
+ }
+
+ // Reference count can't be checked after the event's released, it seems (segfault on NVIDIA).
+ ensureReferenceCount(event_, 1);
+ clError = clReleaseEvent(event_);
+ if (CL_SUCCESS != clError)
+ {
+ GMX_THROW(gmx::InternalError("Failed to release the GPU event: " + ocl_get_error_string(clError)));
+ }
+ event_ = nullptr;
+ }
+
+ private:
+ cl_event event_;
+};
+
+#endif
("Error caught during clFinish:" + ocl_get_error_string(cl_error)).c_str());
}
+//! A debug checker to track cl_events being released correctly
+inline void ensureReferenceCount(const cl_event &event, unsigned int refCount)
+{
+#ifndef NDEBUG
+ cl_int clError = clGetEventInfo(event, CL_EVENT_REFERENCE_COUNT, sizeof(refCount), &refCount, nullptr);
+ GMX_ASSERT(CL_SUCCESS == clError, ocl_get_error_string(clError).c_str());
+ GMX_ASSERT(refCount == refCount, "Unexpected reference count");
+#else
+ GMX_UNUSED_VALUE(event);
+ GMX_UNUSED_VALUE(refCount);
+#endif
+}
+
/*! \brief Pretend to synchronize an OpenCL stream (dummy implementation).
*
* \param[in] s queue to check