From 8f6ea9eb6cf1f862d1223e9929a0e6fb75e6053e Mon Sep 17 00:00:00 2001 From: Andrey Alekseenko Date: Wed, 18 Aug 2021 08:52:57 +0000 Subject: [PATCH] GpuEventSynchronizer: extract backend-specific functionality No new functionality or added checks. Preparatory work for adding advanced event accounting logic to GpuEventSynchronizer. DeviceEvent class contains only backend-specific functions and minimal sanity checks. The more advanced logic (the one we're trying to fix in #3988) is kept in GpuEventSynchronizer, and currently left unchanged (1:1 for OpenCL and SYCL, very relaxed rules for CUDA). Refs #2527, #3988. --- src/gromacs/gpu_utils/device_event.cuh | 118 ++++++++++++++++ src/gromacs/gpu_utils/device_event.h | 119 +++++++++++++++++ ...tsynchronizer_ocl.h => device_event_ocl.h} | 119 ++++++++--------- ...ynchronizer_sycl.h => device_event_sycl.h} | 108 ++++++--------- .../gpu_utils/gpueventsynchronizer.cuh | 126 ------------------ src/gromacs/gpu_utils/gpueventsynchronizer.h | 110 +++++++++++++-- .../gpu_utils/tests/gpueventsynchronizer.cpp | 22 +++ 7 files changed, 455 insertions(+), 267 deletions(-) create mode 100644 src/gromacs/gpu_utils/device_event.cuh create mode 100644 src/gromacs/gpu_utils/device_event.h rename src/gromacs/gpu_utils/{gpueventsynchronizer_ocl.h => device_event_ocl.h} (55%) rename src/gromacs/gpu_utils/{gpueventsynchronizer_sycl.h => device_event_sycl.h} (60%) delete mode 100644 src/gromacs/gpu_utils/gpueventsynchronizer.cuh diff --git a/src/gromacs/gpu_utils/device_event.cuh b/src/gromacs/gpu_utils/device_event.cuh new file mode 100644 index 0000000000..e2fe4c577a --- /dev/null +++ b/src/gromacs/gpu_utils/device_event.cuh @@ -0,0 +1,118 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \libinternal \file + * \brief Implements a DeviceEvent class for CUDA. + * + * \author Aleksei Iupinov + * \author Andrey Alekseenko + * \inlibraryapi + */ +#ifndef GMX_GPU_UTILS_DEVICE_EVENT_CUH +#define GMX_GPU_UTILS_DEVICE_EVENT_CUH + +#include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/device_stream.h" +#include "gromacs/gpu_utils/gputraits.cuh" +#include "gromacs/utility/gmxassert.h" + +#ifndef DOXYGEN + +class DeviceEvent +{ +public: + DeviceEvent() + { + cudaError_t stat = cudaEventCreateWithFlags(&event_, cudaEventDisableTiming); + if (stat != cudaSuccess) + { + GMX_THROW(gmx::InternalError("cudaEventCreate failed: " + gmx::getDeviceErrorString(stat))); + } + } + ~DeviceEvent() { cudaEventDestroy(event_); } + // Disable copy, move, and assignment. Move can be allowed, but not needed yet. + DeviceEvent& operator=(const DeviceEvent&) = delete; + DeviceEvent(const DeviceEvent&) = delete; + DeviceEvent& operator=(DeviceEvent&&) = delete; + DeviceEvent(DeviceEvent&&) = delete; + + /*! \brief Marks the synchronization point in the \p stream. + * Should be followed by waitForEvent(). + */ + inline void mark(const DeviceStream& deviceStream) + { + cudaError_t stat = cudaEventRecord(event_, deviceStream.stream()); + if (stat != cudaSuccess) + { + GMX_THROW(gmx::InternalError("cudaEventRecord failed: " + gmx::getDeviceErrorString(stat))); + } + } + //! Synchronizes the host thread on the marked event. + inline void wait() + { + cudaError_t gmx_used_in_debug stat = cudaEventSynchronize(event_); + if (stat != cudaSuccess) + { + GMX_THROW(gmx::InternalError("cudaEventSynchronize failed: " + gmx::getDeviceErrorString(stat))); + } + } + //! Checks the completion of the underlying event. + inline bool isReady() + { + cudaError_t stat = cudaEventQuery(event_); + if (stat != cudaSuccess && stat != cudaErrorNotReady) + { + GMX_THROW(gmx::InternalError("cudaEventQuery failed: " + gmx::getDeviceErrorString(stat))); + } + return (stat == cudaSuccess); + } + //! Enqueues a wait for the recorded event in stream \p stream + inline void enqueueWait(const DeviceStream& deviceStream) + { + cudaError_t stat = cudaStreamWaitEvent(deviceStream.stream(), event_, 0); + if (stat != cudaSuccess) + { + GMX_THROW(gmx::InternalError("cudaStreamWaitEvent failed: " + gmx::getDeviceErrorString(stat))); + } + } + //! Reset the event (not needed in CUDA) + inline void reset() {} + +private: + cudaEvent_t event_; +}; + +#endif + +#endif \ No newline at end of file diff --git a/src/gromacs/gpu_utils/device_event.h b/src/gromacs/gpu_utils/device_event.h new file mode 100644 index 0000000000..9d479049f3 --- /dev/null +++ b/src/gromacs/gpu_utils/device_event.h @@ -0,0 +1,119 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \libinternal \file + * \brief Declares DeviceEvent for all build configuraitons + * + * This header may be included from any build configuration and + * defers valid GPU declarations to headers valid only in such + * build configurations. + * + * \author Mark Abraham + * \inlibraryapi + */ +#ifndef GMX_GPU_UTILS_DEVICE_EVENT_H +#define GMX_GPU_UTILS_DEVICE_EVENT_H + +#include "config.h" + +#include "gromacs/utility/exceptions.h" + +#if !GMX_GPU || defined(DOXYGEN) + +class DeviceStream; + +// [[noreturn]] attributes must be added to the methods, so it's +// easier to silence the warning here and avoid them appearing in +// the Doxygen +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wmissing-noreturn" + +class DeviceEvent +{ +public: + DeviceEvent() = default; + // Disable copy, move, and assignment. Move can be allowed, but not needed yet. + DeviceEvent& operator=(const DeviceEvent&) = delete; + DeviceEvent(const DeviceEvent&) = delete; + DeviceEvent& operator=(DeviceEvent&&) = delete; + DeviceEvent(DeviceEvent&&) = delete; + + /*! \brief Marks the synchronization point in the \p stream. + * Should be followed by waitForEvent(). + */ + inline void mark(const DeviceStream& /*deviceStream*/) // NOLINT readability-convert-member-functions-to-static + { + GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build")); + } + //! Synchronizes the host thread on the marked event. + inline void wait() // NOLINT readability-convert-member-functions-to-static + { + GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build")); + } + //! Checks the completion of the underlying event. + inline bool isReady() // NOLINT readability-convert-member-functions-to-static + { + GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build")); + } + //! Enqueues a wait for the recorded event in stream \p stream + // NOLINTNEXTLINE readability-convert-member-functions-to-static + inline void enqueueWait(const DeviceStream& /*deviceStream*/) + { + GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build")); + } + //! Checks whether this object encapsulates an underlying event. + inline bool isMarked() // NOLINT readability-convert-member-functions-to-static + { + GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build")); + } + + //! Reset the event (not needed in CUDA) + // NOLINTNEXTLINE readability-convert-member-functions-to-static + inline void reset() // NOLINT readability-convert-member-functions-to-static + { + GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build")); + } +}; + +# pragma clang diagnostic pop + +#elif GMX_GPU_CUDA +# include "device_event.cuh" +#elif GMX_GPU_OPENCL +# include "device_event_ocl.h" +#elif GMX_GPU_SYCL +# include "device_event_sycl.h" +#endif + +#endif // GMX_GPU_UTILS_DEVICE_EVENT_H diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h b/src/gromacs/gpu_utils/device_event_ocl.h similarity index 55% rename from src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h rename to src/gromacs/gpu_utils/device_event_ocl.h index 0b9905450f..99bd5a43d7 100644 --- a/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h +++ b/src/gromacs/gpu_utils/device_event_ocl.h @@ -33,62 +33,49 @@ * the research papers on the package. Check out http://www.gromacs.org. */ /*! \libinternal \file - * \brief Implements a GpuEventSynchronizer class for OpenCL. + * \brief Implements a DeviceEvent class for OpenCL. * * \author Aleksei Iupinov + * \author Andrey Alekseenko * \inlibraryapi */ -#ifndef GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_OCL_H -#define GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_OCL_H +#ifndef GMX_GPU_UTILS_DEVICE_EVENT_OCL_H +#define GMX_GPU_UTILS_DEVICE_EVENT_OCL_H -#ifndef DOXYGEN +#include "gromacs/gpu_utils/gputraits_ocl.h" +#include "gromacs/gpu_utils/oclutils.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/gmxassert.h" -# include "gromacs/gpu_utils/gputraits_ocl.h" -# include "gromacs/gpu_utils/oclutils.h" -# include "gromacs/utility/exceptions.h" -# include "gromacs/utility/gmxassert.h" +#ifndef DOXYGEN -/*! \libinternal \brief - * A class which allows for CPU thread to mark and wait for certain GPU stream execution point. - * The event can be put into the stream with markEvent() and then later waited on with waitForEvent(). - * This can be repeated as necessary, but the current implementation does not allow waiting on - * completed event more than once, expecting only exact pairs of markEvent(stream); waitForEvent(). - * The class generally attempts to track the correctness of its state transitions, but - * please note that calling waitForEvent() right after the construction will fail with OpenCL but succeed with CUDA. - * - * Another possible mode of operation can be implemented if needed: - * multiple calls to waitForEvent() after a single markEvent(). For this, clReleaseEvent() call - * from waitForEvent() should instead happen conditionally at the beginning of markEvent(), replacing - * the GMX_ASSERT(). This was tested to work both with CUDA and NVidia OpenCL, but not with AMD/Intel OpenCl. - */ -class GpuEventSynchronizer +class DeviceEvent { public: //! A constructor - GpuEventSynchronizer() : event_(nullptr) {} + DeviceEvent() : event_(sc_nullEvent) {} + DeviceEvent(cl_event event) : event_(event) {} //! A destructor - ~GpuEventSynchronizer() + ~DeviceEvent() { - // This additional code only prevents cl_event leak in an unlikely situation of destructor - // being called after markEvent() but before waitForEvent() / enqueueWaitEvent(). - if (event_) + if (isMarked()) { + // Can not throw in destructor, so not checking for any error clReleaseEvent(event_); } } - //! No copying - GpuEventSynchronizer(const GpuEventSynchronizer&) = delete; - //! No assignment - GpuEventSynchronizer& operator=(GpuEventSynchronizer&&) = delete; - //! Moving is disabled but can be considered in the future if needed - GpuEventSynchronizer(GpuEventSynchronizer&&) = delete; + // Disable copy, move, and assignment. Move can be allowed, but not needed yet. + DeviceEvent& operator=(const DeviceEvent&) = delete; + DeviceEvent(const DeviceEvent&) = delete; + DeviceEvent& operator=(DeviceEvent&&) = delete; + DeviceEvent(DeviceEvent&&) = delete; /*! \brief Marks the synchronization point in the \p stream. - * Should be called first and then followed by waitForEvent(). + * Should be called first and then followed by wait(). */ - inline void markEvent(const DeviceStream& deviceStream) + inline void mark(const DeviceStream& deviceStream) { - GMX_ASSERT(nullptr == event_, "Do not call markEvent more than once!"); + reset(); cl_int clError = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &event_); if (CL_SUCCESS != clError) { @@ -96,21 +83,35 @@ public: + ocl_get_error_string(clError))); } } + /*! \brief Synchronizes the host thread on the marked event. */ - inline void waitForEvent() + inline void wait() { + GMX_RELEASE_ASSERT(isMarked(), "Can not wait for an unmarked event"); cl_int clError = clWaitForEvents(1, &event_); if (CL_SUCCESS != clError) { GMX_THROW(gmx::InternalError("Failed to synchronize on the GPU event: " + ocl_get_error_string(clError))); } + } - reset(); + /*! \brief Enqueues a wait for the recorded event in stream \p stream. */ + inline void enqueueWait(const DeviceStream& deviceStream) + { + GMX_RELEASE_ASSERT(isMarked(), "Can not enqueue an unmarked event"); + cl_int clError = clEnqueueBarrierWithWaitList(deviceStream.stream(), 1, &event_, nullptr); + if (CL_SUCCESS != clError) + { + GMX_THROW(gmx::InternalError("Failed to enqueue device barrier for the GPU event: " + + ocl_get_error_string(clError))); + } } - /*! \brief Checks the completion of the underlying event and resets the object if it was. */ + + //! Checks the completion of the underlying event. inline bool isReady() { + GMX_RELEASE_ASSERT(isMarked(), "Can not check the status of unmarked event"); cl_int result; cl_int clError = clGetEventInfo( event_, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &result, nullptr); @@ -118,44 +119,32 @@ public: { GMX_THROW(gmx::InternalError("Failed to retrieve event info: " + ocl_get_error_string(clError))); } - bool hasTriggered = (result == CL_COMPLETE); - if (hasTriggered) - { - reset(); - } - return hasTriggered; + return (result == CL_COMPLETE); } - /*! \brief Enqueues a wait for the recorded event in stream \p stream - * - * After enqueue, the associated event is released, so this method should - * be only called once per markEvent() call. - */ - inline void enqueueWaitEvent(const DeviceStream& deviceStream) - { - cl_int clError = clEnqueueBarrierWithWaitList(deviceStream.stream(), 1, &event_, nullptr); - if (CL_SUCCESS != clError) - { - GMX_THROW(gmx::InternalError("Failed to enqueue device barrier for the GPU event: " - + ocl_get_error_string(clError))); - } - reset(); - } + //! Checks whether this object encapsulates an underlying event. + inline bool isMarked() { return event_ != sc_nullEvent; } //! Reset (release) the event to unmarked state. inline void reset() { - cl_int clError = clReleaseEvent(event_); - if (CL_SUCCESS != clError) + if (isMarked()) { - GMX_THROW(gmx::InternalError("Failed to release the GPU event: " - + ocl_get_error_string(clError))); + cl_int clError = clReleaseEvent(event_); + if (CL_SUCCESS != clError) + { + GMX_THROW(gmx::InternalError("Failed to release the GPU event: " + + ocl_get_error_string(clError))); + } } - event_ = nullptr; + event_ = sc_nullEvent; } private: cl_event event_; + + //! Magic value to indicate uninitialized state. + static constexpr cl_event sc_nullEvent = nullptr; }; #endif diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h b/src/gromacs/gpu_utils/device_event_sycl.h similarity index 60% rename from src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h rename to src/gromacs/gpu_utils/device_event_sycl.h index dd12a8bd7d..f750776bcd 100644 --- a/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h +++ b/src/gromacs/gpu_utils/device_event_sycl.h @@ -47,10 +47,10 @@ * \author Andrey Alekseenko * \inlibraryapi */ -#ifndef GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_SYCL_H -#define GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_SYCL_H +#ifndef GMX_GPU_UTILS_DEVICE_EVENT_SYCL_H +#define GMX_GPU_UTILS_DEVICE_EVENT_SYCL_H -#include +#include #include "gromacs/gpu_utils/device_stream.h" #include "gromacs/gpu_utils/gmxsycl.h" @@ -58,51 +58,31 @@ #include "gromacs/utility/gmxassert.h" #ifndef DOXYGEN -/*! \libinternal \brief - * A class which allows for CPU thread to mark and wait for certain GPU stream execution point. - * The event can be put into the stream with markEvent() and then later waited on with waitForEvent(). - * This can be repeated as necessary, but the current implementation does not allow waiting on - * completed event more than once, expecting only exact pairs of markEvent(stream); waitForEvent(). - * The class generally attempts to track the correctness of its state transitions, but - * please note that calling waitForEvent() right after the construction will fail with OpenCL - * and SYCL but succeed with CUDA. - * - * Another possible mode of operation can be implemented if needed: - * multiple calls to waitForEvent() after a single markEvent(). For this, event.reset() call - * from waitForEvent() should instead happen conditionally at the beginning of markEvent(), replacing - * the GMX_ASSERT(). This was tested to work both with CUDA, NVidia OpenCL, and Intel SYCL, - * but not with AMD/Intel OpenCl. - * - * \warning This class is offered for uniformity with other GPU implementations, but expect it to - * be deprecated in the future. - * - */ -class GpuEventSynchronizer + +class DeviceEvent { public: //! A constructor. - GpuEventSynchronizer() + DeviceEvent() { doNotSynchronizeBetweenStreams_ = (std::getenv("GMX_GPU_SYCL_NO_SYNCHRONIZE") != nullptr); events_.reserve(1); } //! A constructor from an existing event. - GpuEventSynchronizer(const cl::sycl::event& event) : events_{ event } {} + DeviceEvent(const cl::sycl::event& event) : events_{ event } {} //! A destructor. - ~GpuEventSynchronizer() = default; - //! No copying - GpuEventSynchronizer(const GpuEventSynchronizer&) = delete; - //! No assignment - GpuEventSynchronizer& operator=(GpuEventSynchronizer&&) = delete; - //! Moving is disabled but can be considered in the future if needed - GpuEventSynchronizer(GpuEventSynchronizer&&) = delete; + ~DeviceEvent() = default; + // Disable copy, move, and assignment. They all can be allowed, but not needed yet. + DeviceEvent& operator=(const DeviceEvent&) = delete; + DeviceEvent(const DeviceEvent&) = delete; + DeviceEvent& operator=(DeviceEvent&&) = delete; + DeviceEvent(DeviceEvent&&) = delete; /*! \brief Marks the synchronization point in the \p deviceStream. - * Should be called first and then followed by waitForEvent() or enqueueWaitEvent(). + * Should be called first and then followed by wait() or enqueueWait(). */ - inline void markEvent(const DeviceStream& deviceStream) + inline void mark(const DeviceStream& deviceStream) { - GMX_ASSERT(!isMarked(), "Do not call markEvent more than once!"); # if GMX_SYCL_HIPSYCL // Relies on HIPSYCL_EXT_QUEUE_WAIT_LIST extension events_ = deviceStream.stream().get_wait_list(); @@ -111,39 +91,21 @@ public: events_ = { deviceStream.stream().submit_barrier() }; # endif } - /*! \brief Synchronizes the host thread on the marked event. - * As in the OpenCL implementation, the event is released. - */ - inline void waitForEvent() + + //! Synchronizes the host thread on the marked event. + inline void wait() { - GMX_ASSERT(isMarked(), "Don't call waitForEvent before marking the event!"); # if GMX_SYCL_DPCPP - GMX_ASSERT(events_.size() == 1, "One event expected in DPCPP, but we have several!"); + // Note: this is not to prevent use-before-marking, but for checking the DPC++ vs hipSYCL consistency + GMX_ASSERT(events_.size() <= 1, "One event expected in DPC++, but we have several!"); # endif for (auto& event : events_) { event.wait_and_throw(); } - reset(); - } - /*! \brief Checks the completion of the underlying event and resets the object if it was. */ - inline bool isReady() - { - bool allReady = std::all_of(events_.begin(), events_.end(), [](cl::sycl::event& event) { - auto info = event.get_info(); - bool isComplete = (info == cl::sycl::info::event_command_status::complete); - return isComplete; - }); - if (allReady) - { - reset(); - } - return allReady; } - /*! \brief Enqueues a wait for the recorded event in stream \p deviceStream. - * As in the OpenCL implementation, the event is released. - */ - inline void enqueueWaitEvent(const DeviceStream& deviceStream) + + inline void enqueueWait(const DeviceStream& deviceStream) { if (!doNotSynchronizeBetweenStreams_) { @@ -151,21 +113,33 @@ public: // Submit an empty kernel that depends on all the events recorded. deviceStream.stream().single_task(events_, [=]() {}); # else + GMX_ASSERT(events_.size() <= 1, "One event expected in DPC++, but we have several!"); // Relies on SYCL_INTEL_enqueue_barrier extensions - GMX_ASSERT(events_.size() == 1, "Only one event expected in DPCPP!"); deviceStream.stream().submit_barrier(events_); # endif } - reset(); } + + //! Checks the completion of the underlying event. + inline bool isReady() + { + bool allReady = std::all_of(events_.begin(), events_.end(), [](cl::sycl::event& event) { + auto info = event.get_info(); + bool isComplete = (info == cl::sycl::info::event_command_status::complete); + return isComplete; + }); + return allReady; + } + + //! Checks whether this object encapsulates an underlying event. + inline bool isMarked() { return !events_.empty(); } + //! Reset the event to unmarked state. inline void reset() { events_.clear(); } - //! Check if the event is marked. Needed for some workarounds for #3988 - inline bool isMarked() const { return !events_.empty(); } private: std::vector events_; - /*! \brief Dev. setting to no-op enqueueWaitEvent + /*! \brief Dev. setting to no-op enqueueWait * * In SYCL, dependencies between the GPU tasks are managed by the runtime, so manual * synchronization between GPU streams should be redundant, but we keep it on by default. @@ -176,6 +150,6 @@ private: bool doNotSynchronizeBetweenStreams_; }; -#endif // !defined DOXYGEN +#endif // DOXYGEN -#endif // GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_SYCL_H +#endif // GMX_GPU_UTILS_DEVICE_EVENT_SYCL_H diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer.cuh b/src/gromacs/gpu_utils/gpueventsynchronizer.cuh deleted file mode 100644 index f40c7d446f..0000000000 --- a/src/gromacs/gpu_utils/gpueventsynchronizer.cuh +++ /dev/null @@ -1,126 +0,0 @@ -/* - * This file is part of the GROMACS molecular simulation package. - * - * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by - * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, - * and including many others, as listed in the AUTHORS file in the - * top-level source directory and at http://www.gromacs.org. - * - * GROMACS is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * - * GROMACS is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with GROMACS; if not, see - * http://www.gnu.org/licenses, or write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * - * If you want to redistribute modifications to GROMACS, please - * consider that scientific software is very special. Version - * control is crucial - bugs must be traceable. We will be happy to - * consider code for inclusion in the official distribution, but - * derived work must not be called official GROMACS. Details are found - * in the README & COPYING files - if they are missing, get the - * official version at http://www.gromacs.org. - * - * To help us fund GROMACS development, we humbly ask that you cite - * the research papers on the package. Check out http://www.gromacs.org. - */ -/*! \libinternal \file - * \brief Implements a GpuEventSynchronizer class for CUDA. - * - * \author Aleksei Iupinov - * \inlibraryapi - */ -#ifndef GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_CUH -#define GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_CUH - -#include "gromacs/gpu_utils/cudautils.cuh" -#include "gromacs/gpu_utils/device_stream.h" -#include "gromacs/gpu_utils/gputraits.cuh" -#include "gromacs/utility/gmxassert.h" - -#ifndef DOXYGEN - -/*! \libinternal \brief - * A class which allows for CPU thread to mark and wait for certain GPU stream execution point. - * The event can be put into the stream with markEvent() and then later waited on with waitForEvent(). - * This can be repeated as necessary, but the current implementation does not allow waiting on - * completed event more than once, expecting only exact pairs of markEvent(stream); waitForEvent(). - * The class generally attempts to track the correctness of its state transitions, but - * please note that calling waitForEvent() right after the construction will succeed with CUDA but fail with OpenCL. - * - * Another possible mode of operation can be implemented if needed: - * multiple calls to waitForEvent() after a single markEvent(). - * For this, only some small alterations to gpueventsynchronizer_ocl.h need to be made. - * This was tested to work both with CUDA and NVidia OpenCL, but not with AMD/Intel OpenCL. - */ -class GpuEventSynchronizer -{ -public: - GpuEventSynchronizer() - { - cudaError_t gmx_used_in_debug stat = cudaEventCreateWithFlags(&event_, cudaEventDisableTiming); - GMX_RELEASE_ASSERT(stat == cudaSuccess, - ("cudaEventCreate failed. " + gmx::getDeviceErrorString(stat)).c_str()); - } - ~GpuEventSynchronizer() - { - cudaError_t gmx_used_in_debug stat = cudaEventDestroy(event_); - GMX_RELEASE_ASSERT(stat == cudaSuccess, - ("cudaEventDestroy failed. " + gmx::getDeviceErrorString(stat)).c_str()); - } - //! No copying - GpuEventSynchronizer(const GpuEventSynchronizer&) = delete; - //! No assignment - GpuEventSynchronizer& operator=(GpuEventSynchronizer&&) = delete; - //! Moving is disabled but can be considered in the future if needed - GpuEventSynchronizer(GpuEventSynchronizer&&) = delete; - - /*! \brief Marks the synchronization point in the \p stream. - * Should be followed by waitForEvent(). - */ - inline void markEvent(const DeviceStream& deviceStream) - { - cudaError_t gmx_used_in_debug stat = cudaEventRecord(event_, deviceStream.stream()); - GMX_ASSERT(stat == cudaSuccess, - ("cudaEventRecord failed. " + gmx::getDeviceErrorString(stat)).c_str()); - } - /*! \brief Synchronizes the host thread on the marked event. */ - inline void waitForEvent() - { - cudaError_t gmx_used_in_debug stat = cudaEventSynchronize(event_); - GMX_ASSERT(stat == cudaSuccess, - ("cudaEventSynchronize failed. " + gmx::getDeviceErrorString(stat)).c_str()); - } - /*! \brief Checks the completion of the underlying event and resets the object if it was. */ - inline bool isReady() - { - cudaError_t gmx_used_in_debug stat = cudaEventQuery(event_); - GMX_ASSERT((stat == cudaSuccess) || (stat == cudaErrorNotReady), - ("cudaEventQuery failed. " + gmx::getDeviceErrorString(stat)).c_str()); - return (stat == cudaSuccess); - } - /*! \brief Enqueues a wait for the recorded event in stream \p stream */ - inline void enqueueWaitEvent(const DeviceStream& deviceStream) - { - cudaError_t gmx_used_in_debug stat = cudaStreamWaitEvent(deviceStream.stream(), event_, 0); - GMX_ASSERT(stat == cudaSuccess, - ("cudaStreamWaitEvent failed. " + gmx::getDeviceErrorString(stat)).c_str()); - } - //! Reset the event (not needed in CUDA) - inline void reset() {} - -private: - cudaEvent_t event_; -}; - -#endif - -#endif diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer.h b/src/gromacs/gpu_utils/gpueventsynchronizer.h index 3045c34e38..d88dba533f 100644 --- a/src/gromacs/gpu_utils/gpueventsynchronizer.h +++ b/src/gromacs/gpu_utils/gpueventsynchronizer.h @@ -33,22 +33,114 @@ * the research papers on the package. Check out http://www.gromacs.org. */ /*! \libinternal \file - * \brief Implements a GpuEventSynchronizer class for CUDA. + * \brief Implements a GpuEventSynchronizer class. * + * \author Andrey Alekseenko + * \author Artem Zhmurov * \author Aleksei Iupinov - * \inlibraryapi + * \inlibraryapi */ #ifndef GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_H #define GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_H #include "config.h" -#if GMX_GPU_CUDA -# include "gromacs/gpu_utils/gpueventsynchronizer.cuh" -#elif GMX_GPU_OPENCL -# include "gromacs/gpu_utils/gpueventsynchronizer_ocl.h" -#elif GMX_GPU_SYCL -# include "gromacs/gpu_utils/gpueventsynchronizer_sycl.h" +#include "gromacs/utility/classhelpers.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/gmxassert.h" + +#include "device_event.h" + +/*! \libinternal \brief + * A class which allows for CPU thread to mark and wait for certain GPU stream execution point. + * The event can be put into the stream with \ref markEvent() and then later waited on with \ref waitForEvent(). + * This can be repeated as necessary, but the current implementation does not allow waiting on + * completed event more than once, expecting only exact pairs of markEvent(stream); waitForEvent(). + * The class generally attempts to track the correctness of its state transitions, but + * please note that calling waitForEvent() right after the construction will fail with OpenCL but succeed with CUDA. + */ +class GpuEventSynchronizer +{ +public: + //! A constructor + GpuEventSynchronizer() = default; + //! A destructor + ~GpuEventSynchronizer() = default; + //! Remove copy assignment, because we can not copy the underlying event object. + GpuEventSynchronizer& operator=(const GpuEventSynchronizer&) = delete; + //! Remove copy constructor, because we can not copy the underlying event object. + GpuEventSynchronizer(const GpuEventSynchronizer&) = delete; + //! Remove move assignment, because we don't allow moving the underlying event object. + GpuEventSynchronizer& operator=(GpuEventSynchronizer&&) = delete; + //! Remove move constructor, because we don't allow moving the underlying event object. + GpuEventSynchronizer(GpuEventSynchronizer&&) = delete; + + /*! \brief Marks the synchronization point in the \p stream. + * Should be called first and then followed by \ref waitForEvent(). + */ + inline void markEvent(const DeviceStream& deviceStream) + { +#if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA + if (event_.isMarked()) + { + GMX_THROW(gmx::InternalError("Trying to mark event before first consuming it")); + } +#endif + event_.mark(deviceStream); + } + /*! \brief Synchronizes the host thread on the marked event. */ + inline void waitForEvent() + { +#if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA + if (!event_.isMarked()) + { + GMX_THROW(gmx::InternalError( + "Trying to wait for event before marking it or after fully consuming it")); + } #endif + event_.wait(); + reset(); + } + /*! \brief Checks the completion of the underlying event and resets the object if it was. */ + inline bool isReady() + { +#if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA + if (!event_.isMarked()) + { + GMX_THROW(gmx::InternalError("Trying to check the status of event before marking it")); + } +#endif + bool isReady = event_.isReady(); + if (isReady) + { + reset(); + } + return isReady; + } + /*! \brief Enqueues a wait for the recorded event in stream \p stream + * + * After enqueue, the associated event is released, so this method should + * be only called once per \ref markEvent() call (not enforced in CUDA yet). + */ + inline void enqueueWaitEvent(const DeviceStream& deviceStream) + { +#if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA + if (!event_.isMarked()) + { + GMX_THROW( + gmx::InternalError("Trying to enqueue wait for event before marking it or " + "after fully consuming it")); + } +#endif + event_.enqueueWait(deviceStream); + reset(); + } + + //! Resets the event to unmarked state, releasing the underlying event object if needed. + inline void reset() { event_.reset(); } -#endif // GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_H +private: + DeviceEvent event_; +}; + +#endif diff --git a/src/gromacs/gpu_utils/tests/gpueventsynchronizer.cpp b/src/gromacs/gpu_utils/tests/gpueventsynchronizer.cpp index 7f28867762..dc1412c170 100644 --- a/src/gromacs/gpu_utils/tests/gpueventsynchronizer.cpp +++ b/src/gromacs/gpu_utils/tests/gpueventsynchronizer.cpp @@ -41,6 +41,8 @@ */ #include "gmxpre.h" +#include "config.h" + #include "gromacs/gpu_utils/gpueventsynchronizer.h" #include @@ -100,6 +102,26 @@ TEST(GpuEventSynchronizerTest, BasicFunctionality) gpuEventSynchronizer.markEvent(streamB); gpuEventSynchronizer.waitForEvent(); } + +# if !GMX_GPU_CUDA // CUDA has very lax rules for event consumption. See Issues #2527 and #3988. + { + SCOPED_TRACE("Wait before marking"); + GpuEventSynchronizer gpuEventSynchronizer; + EXPECT_THROW(gpuEventSynchronizer.waitForEvent(), gmx::InternalError); + } + { + SCOPED_TRACE("enqueueWait before marking"); + GpuEventSynchronizer gpuEventSynchronizer; + EXPECT_THROW(gpuEventSynchronizer.enqueueWaitEvent(streamA), gmx::InternalError); + } + { + SCOPED_TRACE("Wait twice after marking"); + GpuEventSynchronizer gpuEventSynchronizer; + gpuEventSynchronizer.markEvent(streamA); + gpuEventSynchronizer.waitForEvent(); + EXPECT_THROW(gpuEventSynchronizer.waitForEvent(), gmx::InternalError); + } +# endif } } #endif // GMX_GPU -- 2.22.0