From: Andrey Alekseenko Date: Tue, 14 Sep 2021 13:40:18 +0000 (+0000) Subject: Add advanced event consumption accounting to GpuEventSynchronizer X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=2436419e537701ae0d611c4c9a3ff5d5665e72fa;p=alexxy%2Fgromacs.git Add advanced event consumption accounting to GpuEventSynchronizer --- diff --git a/src/gromacs/gpu_utils/device_event.cuh b/src/gromacs/gpu_utils/device_event.cuh index 41029d7e80..fee0237046 100644 --- a/src/gromacs/gpu_utils/device_event.cuh +++ b/src/gromacs/gpu_utils/device_event.cuh @@ -52,7 +52,7 @@ class DeviceEvent { public: - DeviceEvent() + DeviceEvent() : isMarked_(false) { cudaError_t stat = cudaEventCreateWithFlags(&event_, cudaEventDisableTiming); if (stat != cudaSuccess) @@ -77,6 +77,7 @@ public: { GMX_THROW(gmx::InternalError("cudaEventRecord failed: " + gmx::getDeviceErrorString(stat))); } + isMarked_ = true; } //! Synchronizes the host thread on the marked event. inline void wait() @@ -97,6 +98,8 @@ public: } return (stat == cudaSuccess); } + //! Check if this event was marked + inline bool isMarked() const { return isMarked_; } //! Enqueues a wait for the recorded event in stream \p stream inline void enqueueWait(const DeviceStream& deviceStream) { @@ -106,11 +109,12 @@ public: GMX_THROW(gmx::InternalError("cudaStreamWaitEvent failed: " + gmx::getDeviceErrorString(stat))); } } - //! Reset the event (not needed in CUDA) - inline void reset() {} + //! Reset the event + inline void reset() { isMarked_ = false; } private: cudaEvent_t event_; + bool isMarked_; }; #endif diff --git a/src/gromacs/gpu_utils/device_event.h b/src/gromacs/gpu_utils/device_event.h index 2d52a2caba..1082b913aa 100644 --- a/src/gromacs/gpu_utils/device_event.h +++ b/src/gromacs/gpu_utils/device_event.h @@ -94,8 +94,8 @@ public: { GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build")); } - //! Checks whether this object encapsulates an underlying event. - inline bool isMarked() // NOLINT readability-convert-member-functions-to-static + //! Checks whether the underlying event was marked. + inline bool isMarked() const // NOLINT readability-convert-member-functions-to-static { GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build")); } diff --git a/src/gromacs/gpu_utils/device_event_ocl.h b/src/gromacs/gpu_utils/device_event_ocl.h index 99bd5a43d7..2942c1f38b 100644 --- a/src/gromacs/gpu_utils/device_event_ocl.h +++ b/src/gromacs/gpu_utils/device_event_ocl.h @@ -123,7 +123,7 @@ public: } //! Checks whether this object encapsulates an underlying event. - inline bool isMarked() { return event_ != sc_nullEvent; } + inline bool isMarked() const { return event_ != sc_nullEvent; } //! Reset (release) the event to unmarked state. inline void reset() diff --git a/src/gromacs/gpu_utils/device_event_sycl.h b/src/gromacs/gpu_utils/device_event_sycl.h index f750776bcd..f5228130f3 100644 --- a/src/gromacs/gpu_utils/device_event_sycl.h +++ b/src/gromacs/gpu_utils/device_event_sycl.h @@ -85,7 +85,8 @@ public: { # if GMX_SYCL_HIPSYCL // Relies on HIPSYCL_EXT_QUEUE_WAIT_LIST extension - events_ = deviceStream.stream().get_wait_list(); + events_ = deviceStream.stream().get_wait_list(); + isMarked_ = true; # else // Relies on SYCL_INTEL_enqueue_barrier events_ = { deviceStream.stream().submit_barrier() }; @@ -132,13 +133,33 @@ public: } //! Checks whether this object encapsulates an underlying event. - inline bool isMarked() { return !events_.empty(); } + inline bool isMarked() const + { +# if GMX_SYCL_HIPSYCL + return isMarked_; +# else + return !events_.empty(); +# endif + } //! Reset the event to unmarked state. - inline void reset() { events_.clear(); } + inline void reset() + { + events_.clear(); +# if GMX_SYCL_HIPSYCL + isMarked_ = false; +# endif + } private: std::vector events_; +# if GMX_SYCL_HIPSYCL + /*! \brief Flag to track event marking in hipSYCL. + * + * In hipSYCL, we can have empty \ref events_ after marking if there were no pending tasks in + * the queue. So, we use an explicit flag to check the event state. */ + bool isMarked_ = false; +# endif /*! \brief Dev. setting to no-op enqueueWait * * In SYCL, dependencies between the GPU tasks are managed by the runtime, so manual diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer.h b/src/gromacs/gpu_utils/gpueventsynchronizer.h index d88dba533f..d79e107e06 100644 --- a/src/gromacs/gpu_utils/gpueventsynchronizer.h +++ b/src/gromacs/gpu_utils/gpueventsynchronizer.h @@ -53,17 +53,50 @@ /*! \libinternal \brief * A class which allows for CPU thread to mark and wait for certain GPU stream execution point. - * The event can be put into the stream with \ref markEvent() and then later waited on with \ref waitForEvent(). - * This can be repeated as necessary, but the current implementation does not allow waiting on - * completed event more than once, expecting only exact pairs of markEvent(stream); waitForEvent(). - * The class generally attempts to track the correctness of its state transitions, but - * please note that calling waitForEvent() right after the construction will fail with OpenCL but succeed with CUDA. + * + * The event can be put into the stream with \ref markEvent and then later waited on with \ref + * waitForEvent or \ref enqueueWaitEvent. + * + * Additionally, this class offers facilities for runtime checking of correctness by counting + * how many times each marked event is used as a synchronization point. + * + * - When the class is constructed, a required minimal (\c minConsumptionCount) and maximal (\c maxConsumptionCount) number of + * consumptions can be specified. By default, both are set to 1. + * - The event is considered fully consumed if its current number of consumptions \c c equals + * \c maxConsumptionCount. + * - The event is considered sufficiently consumed if minConsumptionCount <= c <= maxConsumptionCount. + * - The class is initialized in the fully consumed state, so it can not be consumed right away. + * - Consuming the event is only possible if it is not fully consumed (c < maxConsumptionCount). + * Consuming the event increments \c c by 1. Trying to consume fully consumed event + * throws \ref gmx::InternalError. + * - \ref reset returns object into the initial fully consumed state. + * This function is intended to manually override the consumption limits. + * - \ref consume \em consumes the event, without doing anything else. + * This function is intended to manually override the consumption limits. + * - \ref markEvent enqueues new event into the provided stream, and sets \c to 0. Marking is only + * possible if the event is sufficiently consumed, otherwise \ref gmx::InternalError + * is thrown. + * - \ref waitForEvent \em consumes the event and blocks the host thread until the event + * is ready (complete). + * - \ref enqueueWaitEvent \em consumes the event and blocks the inserts a blocking barrier + * into the provided stream which blocks the execution of all tasks later submitted to this + * stream until the event is ready (completes). + * + * Default minConsumptionCount=maxConsumptionCount=1 limits mean that each call to \ref markEvent must be followed + * by exactly one \ref enqueueWaitEvent or \ref enqueueWaitEvent. This is the recommended pattern + * for most use cases. By providing other constructor arguments, this requirement can be relaxed + * as needed. */ class GpuEventSynchronizer { public: //! A constructor - GpuEventSynchronizer() = default; + GpuEventSynchronizer(int minConsumptionCount, int maxConsumptionCount) : + minConsumptionCount_(minConsumptionCount), maxConsumptionCount_(maxConsumptionCount) + { + reset(); + } + GpuEventSynchronizer() : GpuEventSynchronizer(1, 1) {} //! A destructor ~GpuEventSynchronizer() = default; //! Remove copy assignment, because we can not copy the underlying event object. @@ -75,72 +108,93 @@ public: //! Remove move constructor, because we don't allow moving the underlying event object. GpuEventSynchronizer(GpuEventSynchronizer&&) = delete; - /*! \brief Marks the synchronization point in the \p stream. - * Should be called first and then followed by \ref waitForEvent(). + /*! \brief Marks the synchronization point in the \p stream and reset the consumption counter. + * + * Should be called before implicitly consuming actions (\ref waitForEvent() or \ref enqueueWaitEvent()) are executed or explicit \ref consume() calls are made. + * + * If the event has been marked before and not fully consumed, throws \ref gmx::InternalError. */ inline void markEvent(const DeviceStream& deviceStream) { #if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA - if (event_.isMarked()) + if (consumptionCount_ < minConsumptionCount_) { - GMX_THROW(gmx::InternalError("Trying to mark event before first consuming it")); + GMX_THROW(gmx::InternalError("Trying to mark event before fully consuming it")); } #endif event_.mark(deviceStream); + consumptionCount_ = 0; } - /*! \brief Synchronizes the host thread on the marked event. */ + /*! \brief Synchronizes the host thread on the marked event. + * + * Consumes the event if able, otherwise throws \ref gmx::InternalError. + */ inline void waitForEvent() { -#if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA - if (!event_.isMarked()) - { - GMX_THROW(gmx::InternalError( - "Trying to wait for event before marking it or after fully consuming it")); - } -#endif + consume(); event_.wait(); - reset(); + resetIfFullyConsumed(); } - /*! \brief Checks the completion of the underlying event and resets the object if it was. */ + //! Checks the completion of the underlying event and consumes the event if it is ready. inline bool isReady() { -#if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA - if (!event_.isMarked()) - { - GMX_THROW(gmx::InternalError("Trying to check the status of event before marking it")); - } -#endif bool isReady = event_.isReady(); if (isReady) { - reset(); + consume(); + resetIfFullyConsumed(); } return isReady; } - /*! \brief Enqueues a wait for the recorded event in stream \p stream + //! Checks whether the event was marked (and was not reset since then). + inline bool isMarked() const { return event_.isMarked(); } + /*! \brief Manually consume the event without waiting for it. * - * After enqueue, the associated event is released, so this method should - * be only called once per \ref markEvent() call (not enforced in CUDA yet). + * If the event is already fully consumed, throws \ref gmx::InternalError. */ - inline void enqueueWaitEvent(const DeviceStream& deviceStream) + inline void consume() { #if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA - if (!event_.isMarked()) + if (consumptionCount_ >= maxConsumptionCount_) { - GMX_THROW( - gmx::InternalError("Trying to enqueue wait for event before marking it or " - "after fully consuming it")); + GMX_THROW(gmx::InternalError( + "Trying to consume an event before marking it or after fully consuming it")); } #endif + consumptionCount_++; + } + //! Helper function to reset the event when it is fully consumed. + inline void resetIfFullyConsumed() + { + if (consumptionCount_ == maxConsumptionCount_) + { + event_.reset(); + } + } + /*! \brief Enqueues a wait for the recorded event in stream \p deviceStream. + * + * Consumes the event if able, otherwise throws \ref gmx::InternalError. + */ + inline void enqueueWaitEvent(const DeviceStream& deviceStream) + { + consume(); event_.enqueueWait(deviceStream); - reset(); + resetIfFullyConsumed(); } //! Resets the event to unmarked state, releasing the underlying event object if needed. - inline void reset() { event_.reset(); } + inline void reset() + { + // Set such that we can mark new event without triggering an exception, but can not consume. + consumptionCount_ = maxConsumptionCount_; + event_.reset(); + } private: DeviceEvent event_; + int consumptionCount_; + int minConsumptionCount_; + int maxConsumptionCount_; }; #endif diff --git a/src/gromacs/gpu_utils/tests/gpueventsynchronizer.cpp b/src/gromacs/gpu_utils/tests/gpueventsynchronizer.cpp index dc1412c170..317ba72c7f 100644 --- a/src/gromacs/gpu_utils/tests/gpueventsynchronizer.cpp +++ b/src/gromacs/gpu_utils/tests/gpueventsynchronizer.cpp @@ -86,6 +86,16 @@ TEST(GpuEventSynchronizerTest, BasicFunctionality) gpuEventSynchronizer.waitForEvent(); // Should return immediately } + { + SCOPED_TRACE("isMarked"); + GpuEventSynchronizer gpuEventSynchronizer; + EXPECT_FALSE(gpuEventSynchronizer.isMarked()); + gpuEventSynchronizer.markEvent(streamA); + EXPECT_TRUE(gpuEventSynchronizer.isMarked()); + gpuEventSynchronizer.reset(); + EXPECT_FALSE(gpuEventSynchronizer.isMarked()); + } + { SCOPED_TRACE("Mark and enqueueWait"); GpuEventSynchronizer gpuEventSynchronizer; @@ -121,6 +131,33 @@ TEST(GpuEventSynchronizerTest, BasicFunctionality) gpuEventSynchronizer.waitForEvent(); EXPECT_THROW(gpuEventSynchronizer.waitForEvent(), gmx::InternalError); } + { + SCOPED_TRACE("Marking before consuming"); + GpuEventSynchronizer gpuEventSynchronizer; + gpuEventSynchronizer.markEvent(streamA); + EXPECT_THROW(gpuEventSynchronizer.markEvent(streamB), gmx::InternalError); + } + { + SCOPED_TRACE("Wait trice after marking"); + GpuEventSynchronizer gpuEventSynchronizer(0, 2); + gpuEventSynchronizer.markEvent(streamA); + gpuEventSynchronizer.waitForEvent(); + gpuEventSynchronizer.waitForEvent(); + EXPECT_THROW(gpuEventSynchronizer.waitForEvent(), gmx::InternalError); + } + { + SCOPED_TRACE("Allowed underconsume"); + GpuEventSynchronizer gpuEventSynchronizer(0, 1); + gpuEventSynchronizer.markEvent(streamA); + gpuEventSynchronizer.markEvent(streamB); + } + { + SCOPED_TRACE("Forbidden underconsume"); + GpuEventSynchronizer gpuEventSynchronizer(2, 2); + gpuEventSynchronizer.markEvent(streamA); + gpuEventSynchronizer.waitForEvent(); + EXPECT_THROW(gpuEventSynchronizer.markEvent(streamB), gmx::InternalError); + } # endif } }