From: Andrey Alekseenko <al42and@gmail.com>
Date: Tue, 14 Sep 2021 13:40:18 +0000 (+0000)
Subject: Add advanced event consumption accounting to GpuEventSynchronizer
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=2436419e537701ae0d611c4c9a3ff5d5665e72fa;p=alexxy%2Fgromacs.git

Add advanced event consumption accounting to GpuEventSynchronizer
---

diff --git a/src/gromacs/gpu_utils/device_event.cuh b/src/gromacs/gpu_utils/device_event.cuh
index 41029d7e80..fee0237046 100644
--- a/src/gromacs/gpu_utils/device_event.cuh
+++ b/src/gromacs/gpu_utils/device_event.cuh
@@ -52,7 +52,7 @@
 class DeviceEvent
 {
 public:
-    DeviceEvent()
+    DeviceEvent() : isMarked_(false)
     {
         cudaError_t stat = cudaEventCreateWithFlags(&event_, cudaEventDisableTiming);
         if (stat != cudaSuccess)
@@ -77,6 +77,7 @@ public:
         {
             GMX_THROW(gmx::InternalError("cudaEventRecord failed: " + gmx::getDeviceErrorString(stat)));
         }
+        isMarked_ = true;
     }
     //! Synchronizes the host thread on the marked event.
     inline void wait()
@@ -97,6 +98,8 @@ public:
         }
         return (stat == cudaSuccess);
     }
+    //! Check if this event was marked
+    inline bool isMarked() const { return isMarked_; }
     //! Enqueues a wait for the recorded event in stream \p stream
     inline void enqueueWait(const DeviceStream& deviceStream)
     {
@@ -106,11 +109,12 @@ public:
             GMX_THROW(gmx::InternalError("cudaStreamWaitEvent failed: " + gmx::getDeviceErrorString(stat)));
         }
     }
-    //! Reset the event (not needed in CUDA)
-    inline void reset() {}
+    //! Reset the event
+    inline void reset() { isMarked_ = false; }
 
 private:
     cudaEvent_t event_;
+    bool        isMarked_;
 };
 
 #endif
diff --git a/src/gromacs/gpu_utils/device_event.h b/src/gromacs/gpu_utils/device_event.h
index 2d52a2caba..1082b913aa 100644
--- a/src/gromacs/gpu_utils/device_event.h
+++ b/src/gromacs/gpu_utils/device_event.h
@@ -94,8 +94,8 @@ public:
     {
         GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build"));
     }
-    //! Checks whether this object encapsulates an underlying event.
-    inline bool isMarked() // NOLINT readability-convert-member-functions-to-static
+    //! Checks whether the underlying event was marked.
+    inline bool isMarked() const // NOLINT readability-convert-member-functions-to-static
     {
         GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build"));
     }
diff --git a/src/gromacs/gpu_utils/device_event_ocl.h b/src/gromacs/gpu_utils/device_event_ocl.h
index 99bd5a43d7..2942c1f38b 100644
--- a/src/gromacs/gpu_utils/device_event_ocl.h
+++ b/src/gromacs/gpu_utils/device_event_ocl.h
@@ -123,7 +123,7 @@ public:
     }
 
     //! Checks whether this object encapsulates an underlying event.
-    inline bool isMarked() { return event_ != sc_nullEvent; }
+    inline bool isMarked() const { return event_ != sc_nullEvent; }
 
     //! Reset (release) the event to unmarked state.
     inline void reset()
diff --git a/src/gromacs/gpu_utils/device_event_sycl.h b/src/gromacs/gpu_utils/device_event_sycl.h
index f750776bcd..f5228130f3 100644
--- a/src/gromacs/gpu_utils/device_event_sycl.h
+++ b/src/gromacs/gpu_utils/device_event_sycl.h
@@ -85,7 +85,8 @@ public:
     {
 #    if GMX_SYCL_HIPSYCL
         // Relies on HIPSYCL_EXT_QUEUE_WAIT_LIST extension
-        events_ = deviceStream.stream().get_wait_list();
+        events_   = deviceStream.stream().get_wait_list();
+        isMarked_ = true;
 #    else
         // Relies on SYCL_INTEL_enqueue_barrier
         events_ = { deviceStream.stream().submit_barrier() };
@@ -132,13 +133,33 @@ public:
     }
 
     //! Checks whether this object encapsulates an underlying event.
-    inline bool isMarked() { return !events_.empty(); }
+    inline bool isMarked() const
+    {
+#    if GMX_SYCL_HIPSYCL
+        return isMarked_;
+#    else
+        return !events_.empty();
+#    endif
+    }
 
     //! Reset the event to unmarked state.
-    inline void reset() { events_.clear(); }
+    inline void reset()
+    {
+        events_.clear();
+#    if GMX_SYCL_HIPSYCL
+        isMarked_ = false;
+#    endif
+    }
 
 private:
     std::vector<cl::sycl::event> events_;
+#    if GMX_SYCL_HIPSYCL
+    /*! \brief Flag to track event marking in hipSYCL.
+     *
+     * In hipSYCL, we can have empty \ref events_ after marking if there were no pending tasks in
+     * the queue. So, we use an explicit flag to check the event state. */
+    bool isMarked_ = false;
+#    endif
     /*! \brief Dev. setting to no-op enqueueWait
      *
      * In SYCL, dependencies between the GPU tasks are managed by the runtime, so manual
diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer.h b/src/gromacs/gpu_utils/gpueventsynchronizer.h
index d88dba533f..d79e107e06 100644
--- a/src/gromacs/gpu_utils/gpueventsynchronizer.h
+++ b/src/gromacs/gpu_utils/gpueventsynchronizer.h
@@ -53,17 +53,50 @@
 
 /*! \libinternal \brief
  * A class which allows for CPU thread to mark and wait for certain GPU stream execution point.
- * The event can be put into the stream with \ref markEvent() and then later waited on with \ref waitForEvent().
- * This can be repeated as necessary, but the current implementation does not allow waiting on
- * completed event more than once, expecting only exact pairs of markEvent(stream); waitForEvent().
- * The class generally attempts to track the correctness of its state transitions, but
- * please note that calling waitForEvent() right after the construction will fail with OpenCL but succeed with CUDA.
+ *
+ * The event can be put into the stream with \ref markEvent and then later waited on with \ref
+ * waitForEvent or \ref enqueueWaitEvent.
+ *
+ * Additionally, this class offers facilities for runtime checking of correctness by counting
+ * how many times each marked event is used as a synchronization point.
+ *
+ * - When the class is constructed, a required minimal (\c minConsumptionCount) and maximal (\c maxConsumptionCount) number of
+ * consumptions can be specified. By default, both are set to 1.
+ * - The event is considered <em>fully consumed</em> if its current number of consumptions \c c equals
+ * \c maxConsumptionCount.
+ * - The event is considered <em>sufficiently consumed</em> if <tt>minConsumptionCount <= c <= maxConsumptionCount</tt>.
+ * - The class is initialized in the <em>fully consumed</em> state, so it can not be consumed right away.
+ * - Consuming the event is only possible if it is not <em>fully consumed</em> (<tt>c < maxConsumptionCount</tt>).
+ * Consuming the event increments \c c by 1. Trying to consume <em>fully consumed</em> event
+ * throws \ref gmx::InternalError.
+ * - \ref reset returns object into the initial <em>fully consumed</em> state.
+ * This function is intended to manually override the consumption limits.
+ * - \ref consume \em consumes the event, without doing anything else.
+ * This function is intended to manually override the consumption limits.
+ * - \ref markEvent enqueues new event into the provided stream, and sets \c to 0. Marking is only
+ * possible if the event is <em>sufficiently consumed</em>, otherwise \ref gmx::InternalError
+ * is thrown.
+ * - \ref waitForEvent \em consumes the event and blocks the host thread until the event
+ * is ready (complete).
+ * - \ref enqueueWaitEvent \em consumes the event and blocks the inserts a blocking barrier
+ * into the provided stream which blocks the execution of all tasks later submitted to this
+ * stream until the event is ready (completes).
+ *
+ * Default <tt>minConsumptionCount=maxConsumptionCount=1</tt> limits mean that each call to \ref markEvent must be followed
+ * by exactly one \ref enqueueWaitEvent or \ref enqueueWaitEvent. This is the recommended pattern
+ * for most use cases. By providing other constructor arguments, this requirement can be relaxed
+ * as needed.
  */
 class GpuEventSynchronizer
 {
 public:
     //! A constructor
-    GpuEventSynchronizer() = default;
+    GpuEventSynchronizer(int minConsumptionCount, int maxConsumptionCount) :
+        minConsumptionCount_(minConsumptionCount), maxConsumptionCount_(maxConsumptionCount)
+    {
+        reset();
+    }
+    GpuEventSynchronizer() : GpuEventSynchronizer(1, 1) {}
     //! A destructor
     ~GpuEventSynchronizer() = default;
     //! Remove copy assignment, because we can not copy the underlying event object.
@@ -75,72 +108,93 @@ public:
     //! Remove move constructor, because we don't allow moving the underlying event object.
     GpuEventSynchronizer(GpuEventSynchronizer&&) = delete;
 
-    /*! \brief Marks the synchronization point in the \p stream.
-     * Should be called first and then followed by \ref waitForEvent().
+    /*! \brief Marks the synchronization point in the \p stream and reset the consumption counter.
+     *
+     * Should be called before implicitly consuming actions (\ref waitForEvent() or \ref enqueueWaitEvent()) are executed or explicit \ref consume() calls are made.
+     *
+     * If the event has been marked before and not fully consumed, throws \ref gmx::InternalError.
      */
     inline void markEvent(const DeviceStream& deviceStream)
     {
 #if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA
-        if (event_.isMarked())
+        if (consumptionCount_ < minConsumptionCount_)
         {
-            GMX_THROW(gmx::InternalError("Trying to mark event before first consuming it"));
+            GMX_THROW(gmx::InternalError("Trying to mark event before fully consuming it"));
         }
 #endif
         event_.mark(deviceStream);
+        consumptionCount_ = 0;
     }
-    /*! \brief Synchronizes the host thread on the marked event. */
+    /*! \brief Synchronizes the host thread on the marked event.
+     *
+     * Consumes the event if able, otherwise throws \ref gmx::InternalError.
+     */
     inline void waitForEvent()
     {
-#if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA
-        if (!event_.isMarked())
-        {
-            GMX_THROW(gmx::InternalError(
-                    "Trying to wait for event before marking it or after fully consuming it"));
-        }
-#endif
+        consume();
         event_.wait();
-        reset();
+        resetIfFullyConsumed();
     }
-    /*! \brief Checks the completion of the underlying event and resets the object if it was. */
+    //! Checks the completion of the underlying event and consumes the event if it is ready.
     inline bool isReady()
     {
-#if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA
-        if (!event_.isMarked())
-        {
-            GMX_THROW(gmx::InternalError("Trying to check the status of event before marking it"));
-        }
-#endif
         bool isReady = event_.isReady();
         if (isReady)
         {
-            reset();
+            consume();
+            resetIfFullyConsumed();
         }
         return isReady;
     }
-    /*! \brief Enqueues a wait for the recorded event in stream \p stream
+    //! Checks whether the event was marked (and was not reset since then).
+    inline bool isMarked() const { return event_.isMarked(); }
+    /*! \brief Manually consume the event without waiting for it.
      *
-     *  After enqueue, the associated event is released, so this method should
-     *  be only called once per \ref markEvent() call (not enforced in CUDA yet).
+     * If the event is already fully consumed, throws \ref gmx::InternalError.
      */
-    inline void enqueueWaitEvent(const DeviceStream& deviceStream)
+    inline void consume()
     {
 #if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA
-        if (!event_.isMarked())
+        if (consumptionCount_ >= maxConsumptionCount_)
         {
-            GMX_THROW(
-                    gmx::InternalError("Trying to enqueue wait for event before marking it or "
-                                       "after fully consuming it"));
+            GMX_THROW(gmx::InternalError(
+                    "Trying to consume an event before marking it or after fully consuming it"));
         }
 #endif
+        consumptionCount_++;
+    }
+    //! Helper function to reset the event when it is fully consumed.
+    inline void resetIfFullyConsumed()
+    {
+        if (consumptionCount_ == maxConsumptionCount_)
+        {
+            event_.reset();
+        }
+    }
+    /*! \brief Enqueues a wait for the recorded event in stream \p deviceStream.
+     *
+     * Consumes the event if able, otherwise throws \ref gmx::InternalError.
+     */
+    inline void enqueueWaitEvent(const DeviceStream& deviceStream)
+    {
+        consume();
         event_.enqueueWait(deviceStream);
-        reset();
+        resetIfFullyConsumed();
     }
 
     //! Resets the event to unmarked state, releasing the underlying event object if needed.
-    inline void reset() { event_.reset(); }
+    inline void reset()
+    {
+        // Set such that we can mark new event without triggering an exception, but can not consume.
+        consumptionCount_ = maxConsumptionCount_;
+        event_.reset();
+    }
 
 private:
     DeviceEvent event_;
+    int         consumptionCount_;
+    int         minConsumptionCount_;
+    int         maxConsumptionCount_;
 };
 
 #endif
diff --git a/src/gromacs/gpu_utils/tests/gpueventsynchronizer.cpp b/src/gromacs/gpu_utils/tests/gpueventsynchronizer.cpp
index dc1412c170..317ba72c7f 100644
--- a/src/gromacs/gpu_utils/tests/gpueventsynchronizer.cpp
+++ b/src/gromacs/gpu_utils/tests/gpueventsynchronizer.cpp
@@ -86,6 +86,16 @@ TEST(GpuEventSynchronizerTest, BasicFunctionality)
             gpuEventSynchronizer.waitForEvent(); // Should return immediately
         }
 
+        {
+            SCOPED_TRACE("isMarked");
+            GpuEventSynchronizer gpuEventSynchronizer;
+            EXPECT_FALSE(gpuEventSynchronizer.isMarked());
+            gpuEventSynchronizer.markEvent(streamA);
+            EXPECT_TRUE(gpuEventSynchronizer.isMarked());
+            gpuEventSynchronizer.reset();
+            EXPECT_FALSE(gpuEventSynchronizer.isMarked());
+        }
+
         {
             SCOPED_TRACE("Mark and enqueueWait");
             GpuEventSynchronizer gpuEventSynchronizer;
@@ -121,6 +131,33 @@ TEST(GpuEventSynchronizerTest, BasicFunctionality)
             gpuEventSynchronizer.waitForEvent();
             EXPECT_THROW(gpuEventSynchronizer.waitForEvent(), gmx::InternalError);
         }
+        {
+            SCOPED_TRACE("Marking before consuming");
+            GpuEventSynchronizer gpuEventSynchronizer;
+            gpuEventSynchronizer.markEvent(streamA);
+            EXPECT_THROW(gpuEventSynchronizer.markEvent(streamB), gmx::InternalError);
+        }
+        {
+            SCOPED_TRACE("Wait trice after marking");
+            GpuEventSynchronizer gpuEventSynchronizer(0, 2);
+            gpuEventSynchronizer.markEvent(streamA);
+            gpuEventSynchronizer.waitForEvent();
+            gpuEventSynchronizer.waitForEvent();
+            EXPECT_THROW(gpuEventSynchronizer.waitForEvent(), gmx::InternalError);
+        }
+        {
+            SCOPED_TRACE("Allowed underconsume");
+            GpuEventSynchronizer gpuEventSynchronizer(0, 1);
+            gpuEventSynchronizer.markEvent(streamA);
+            gpuEventSynchronizer.markEvent(streamB);
+        }
+        {
+            SCOPED_TRACE("Forbidden underconsume");
+            GpuEventSynchronizer gpuEventSynchronizer(2, 2);
+            gpuEventSynchronizer.markEvent(streamA);
+            gpuEventSynchronizer.waitForEvent();
+            EXPECT_THROW(gpuEventSynchronizer.markEvent(streamB), gmx::InternalError);
+        }
 #    endif
     }
 }