From 8f6ea9eb6cf1f862d1223e9929a0e6fb75e6053e Mon Sep 17 00:00:00 2001
From: Andrey Alekseenko <al42and@gmail.com>
Date: Wed, 18 Aug 2021 08:52:57 +0000
Subject: [PATCH] GpuEventSynchronizer: extract backend-specific functionality

No new functionality or added checks.

Preparatory work for adding advanced event accounting logic to GpuEventSynchronizer.

DeviceEvent class contains only backend-specific functions and minimal sanity checks.
The more advanced logic (the one we're trying to fix in #3988) is kept in
GpuEventSynchronizer, and currently left unchanged (1:1 for OpenCL and SYCL,
very relaxed rules for CUDA).

Refs #2527, #3988.
---
 src/gromacs/gpu_utils/device_event.cuh        | 118 ++++++++++++++++
 src/gromacs/gpu_utils/device_event.h          | 119 +++++++++++++++++
 ...tsynchronizer_ocl.h => device_event_ocl.h} | 119 ++++++++---------
 ...ynchronizer_sycl.h => device_event_sycl.h} | 108 ++++++---------
 .../gpu_utils/gpueventsynchronizer.cuh        | 126 ------------------
 src/gromacs/gpu_utils/gpueventsynchronizer.h  | 110 +++++++++++++--
 .../gpu_utils/tests/gpueventsynchronizer.cpp  |  22 +++
 7 files changed, 455 insertions(+), 267 deletions(-)
 create mode 100644 src/gromacs/gpu_utils/device_event.cuh
 create mode 100644 src/gromacs/gpu_utils/device_event.h
 rename src/gromacs/gpu_utils/{gpueventsynchronizer_ocl.h => device_event_ocl.h} (55%)
 rename src/gromacs/gpu_utils/{gpueventsynchronizer_sycl.h => device_event_sycl.h} (60%)
 delete mode 100644 src/gromacs/gpu_utils/gpueventsynchronizer.cuh

diff --git a/src/gromacs/gpu_utils/device_event.cuh b/src/gromacs/gpu_utils/device_event.cuh
new file mode 100644
index 0000000000..e2fe4c577a
--- /dev/null
+++ b/src/gromacs/gpu_utils/device_event.cuh
@@ -0,0 +1,118 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *  \brief Implements a DeviceEvent class for CUDA.
+ *
+ *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ *  \author Andrey Alekseenko <al42and@gmail.com>
+ *  \inlibraryapi
+ */
+#ifndef GMX_GPU_UTILS_DEVICE_EVENT_CUH
+#define GMX_GPU_UTILS_DEVICE_EVENT_CUH
+
+#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/gpu_utils/gputraits.cuh"
+#include "gromacs/utility/gmxassert.h"
+
+#ifndef DOXYGEN
+
+class DeviceEvent
+{
+public:
+    DeviceEvent()
+    {
+        cudaError_t stat = cudaEventCreateWithFlags(&event_, cudaEventDisableTiming);
+        if (stat != cudaSuccess)
+        {
+            GMX_THROW(gmx::InternalError("cudaEventCreate failed: " + gmx::getDeviceErrorString(stat)));
+        }
+    }
+    ~DeviceEvent() { cudaEventDestroy(event_); }
+    // Disable copy, move, and assignment. Move can be allowed, but not needed yet.
+    DeviceEvent& operator=(const DeviceEvent&) = delete;
+    DeviceEvent(const DeviceEvent&)            = delete;
+    DeviceEvent& operator=(DeviceEvent&&) = delete;
+    DeviceEvent(DeviceEvent&&)            = delete;
+
+    /*! \brief Marks the synchronization point in the \p stream.
+     * Should be followed by waitForEvent().
+     */
+    inline void mark(const DeviceStream& deviceStream)
+    {
+        cudaError_t stat = cudaEventRecord(event_, deviceStream.stream());
+        if (stat != cudaSuccess)
+        {
+            GMX_THROW(gmx::InternalError("cudaEventRecord failed: " + gmx::getDeviceErrorString(stat)));
+        }
+    }
+    //! Synchronizes the host thread on the marked event.
+    inline void wait()
+    {
+        cudaError_t gmx_used_in_debug stat = cudaEventSynchronize(event_);
+        if (stat != cudaSuccess)
+        {
+            GMX_THROW(gmx::InternalError("cudaEventSynchronize failed: " + gmx::getDeviceErrorString(stat)));
+        }
+    }
+    //! Checks the completion of the underlying event.
+    inline bool isReady()
+    {
+        cudaError_t stat = cudaEventQuery(event_);
+        if (stat != cudaSuccess && stat != cudaErrorNotReady)
+        {
+            GMX_THROW(gmx::InternalError("cudaEventQuery failed: " + gmx::getDeviceErrorString(stat)));
+        }
+        return (stat == cudaSuccess);
+    }
+    //! Enqueues a wait for the recorded event in stream \p stream
+    inline void enqueueWait(const DeviceStream& deviceStream)
+    {
+        cudaError_t stat = cudaStreamWaitEvent(deviceStream.stream(), event_, 0);
+        if (stat != cudaSuccess)
+        {
+            GMX_THROW(gmx::InternalError("cudaStreamWaitEvent failed: " + gmx::getDeviceErrorString(stat)));
+        }
+    }
+    //! Reset the event (not needed in CUDA)
+    inline void reset() {}
+
+private:
+    cudaEvent_t event_;
+};
+
+#endif
+
+#endif
\ No newline at end of file
diff --git a/src/gromacs/gpu_utils/device_event.h b/src/gromacs/gpu_utils/device_event.h
new file mode 100644
index 0000000000..9d479049f3
--- /dev/null
+++ b/src/gromacs/gpu_utils/device_event.h
@@ -0,0 +1,119 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *  \brief Declares DeviceEvent for all build configuraitons
+ *
+ *  This header may be included from any build configuration and
+ *  defers valid GPU declarations to headers valid only in such
+ *  build configurations.
+ *
+ *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \inlibraryapi
+ */
+#ifndef GMX_GPU_UTILS_DEVICE_EVENT_H
+#define GMX_GPU_UTILS_DEVICE_EVENT_H
+
+#include "config.h"
+
+#include "gromacs/utility/exceptions.h"
+
+#if !GMX_GPU || defined(DOXYGEN)
+
+class DeviceStream;
+
+// [[noreturn]] attributes must be added to the methods, so it's
+// easier to silence the warning here and avoid them appearing in
+// the Doxygen
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wmissing-noreturn"
+
+class DeviceEvent
+{
+public:
+    DeviceEvent() = default;
+    // Disable copy, move, and assignment. Move can be allowed, but not needed yet.
+    DeviceEvent& operator=(const DeviceEvent&) = delete;
+    DeviceEvent(const DeviceEvent&)            = delete;
+    DeviceEvent& operator=(DeviceEvent&&) = delete;
+    DeviceEvent(DeviceEvent&&)            = delete;
+
+    /*! \brief Marks the synchronization point in the \p stream.
+     * Should be followed by waitForEvent().
+     */
+    inline void mark(const DeviceStream& /*deviceStream*/) // NOLINT readability-convert-member-functions-to-static
+    {
+        GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build"));
+    }
+    //! Synchronizes the host thread on the marked event.
+    inline void wait() // NOLINT readability-convert-member-functions-to-static
+    {
+        GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build"));
+    }
+    //! Checks the completion of the underlying event.
+    inline bool isReady() // NOLINT readability-convert-member-functions-to-static
+    {
+        GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build"));
+    }
+    //! Enqueues a wait for the recorded event in stream \p stream
+    // NOLINTNEXTLINE readability-convert-member-functions-to-static
+    inline void enqueueWait(const DeviceStream& /*deviceStream*/)
+    {
+        GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build"));
+    }
+    //! Checks whether this object encapsulates an underlying event.
+    inline bool isMarked() // NOLINT readability-convert-member-functions-to-static
+    {
+        GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build"));
+    }
+
+    //! Reset the event (not needed in CUDA)
+    // NOLINTNEXTLINE readability-convert-member-functions-to-static
+    inline void reset() // NOLINT readability-convert-member-functions-to-static
+    {
+        GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build"));
+    }
+};
+
+#    pragma clang diagnostic pop
+
+#elif GMX_GPU_CUDA
+#    include "device_event.cuh"
+#elif GMX_GPU_OPENCL
+#    include "device_event_ocl.h"
+#elif GMX_GPU_SYCL
+#    include "device_event_sycl.h"
+#endif
+
+#endif // GMX_GPU_UTILS_DEVICE_EVENT_H
diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h b/src/gromacs/gpu_utils/device_event_ocl.h
similarity index 55%
rename from src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h
rename to src/gromacs/gpu_utils/device_event_ocl.h
index 0b9905450f..99bd5a43d7 100644
--- a/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h
+++ b/src/gromacs/gpu_utils/device_event_ocl.h
@@ -33,62 +33,49 @@
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 /*! \libinternal \file
- *  \brief Implements a GpuEventSynchronizer class for OpenCL.
+ *  \brief Implements a DeviceEvent class for OpenCL.
  *
  *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ *  \author Andrey Alekseenko <al42and@gmail.com>
  * \inlibraryapi
  */
-#ifndef GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_OCL_H
-#define GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_OCL_H
+#ifndef GMX_GPU_UTILS_DEVICE_EVENT_OCL_H
+#define GMX_GPU_UTILS_DEVICE_EVENT_OCL_H
 
-#ifndef DOXYGEN
+#include "gromacs/gpu_utils/gputraits_ocl.h"
+#include "gromacs/gpu_utils/oclutils.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
 
-#    include "gromacs/gpu_utils/gputraits_ocl.h"
-#    include "gromacs/gpu_utils/oclutils.h"
-#    include "gromacs/utility/exceptions.h"
-#    include "gromacs/utility/gmxassert.h"
+#ifndef DOXYGEN
 
-/*! \libinternal \brief
- * A class which allows for CPU thread to mark and wait for certain GPU stream execution point.
- * The event can be put into the stream with markEvent() and then later waited on with waitForEvent().
- * This can be repeated as necessary, but the current implementation does not allow waiting on
- * completed event more than once, expecting only exact pairs of markEvent(stream); waitForEvent().
- * The class generally attempts to track the correctness of its state transitions, but
- * please note that calling waitForEvent() right after the construction will fail with OpenCL but succeed with CUDA.
- *
- * Another possible mode of operation can be implemented if needed:
- * multiple calls to waitForEvent() after a single markEvent(). For this, clReleaseEvent() call
- * from waitForEvent() should instead happen conditionally at the beginning of markEvent(), replacing
- * the GMX_ASSERT(). This was tested to work both with CUDA and NVidia OpenCL, but not with AMD/Intel OpenCl.
- */
-class GpuEventSynchronizer
+class DeviceEvent
 {
 public:
     //! A constructor
-    GpuEventSynchronizer() : event_(nullptr) {}
+    DeviceEvent() : event_(sc_nullEvent) {}
+    DeviceEvent(cl_event event) : event_(event) {}
     //! A destructor
-    ~GpuEventSynchronizer()
+    ~DeviceEvent()
     {
-        // This additional code only prevents cl_event leak in an unlikely situation of destructor
-        // being called after markEvent() but before waitForEvent() / enqueueWaitEvent().
-        if (event_)
+        if (isMarked())
         {
+            // Can not throw in destructor, so not checking for any error
             clReleaseEvent(event_);
         }
     }
-    //! No copying
-    GpuEventSynchronizer(const GpuEventSynchronizer&) = delete;
-    //! No assignment
-    GpuEventSynchronizer& operator=(GpuEventSynchronizer&&) = delete;
-    //! Moving is disabled but can be considered in the future if needed
-    GpuEventSynchronizer(GpuEventSynchronizer&&) = delete;
+    // Disable copy, move, and assignment. Move can be allowed, but not needed yet.
+    DeviceEvent& operator=(const DeviceEvent&) = delete;
+    DeviceEvent(const DeviceEvent&)            = delete;
+    DeviceEvent& operator=(DeviceEvent&&) = delete;
+    DeviceEvent(DeviceEvent&&)            = delete;
 
     /*! \brief Marks the synchronization point in the \p stream.
-     * Should be called first and then followed by waitForEvent().
+     * Should be called first and then followed by wait().
      */
-    inline void markEvent(const DeviceStream& deviceStream)
+    inline void mark(const DeviceStream& deviceStream)
     {
-        GMX_ASSERT(nullptr == event_, "Do not call markEvent more than once!");
+        reset();
         cl_int clError = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &event_);
         if (CL_SUCCESS != clError)
         {
@@ -96,21 +83,35 @@ public:
                                          + ocl_get_error_string(clError)));
         }
     }
+
     /*! \brief Synchronizes the host thread on the marked event. */
-    inline void waitForEvent()
+    inline void wait()
     {
+        GMX_RELEASE_ASSERT(isMarked(), "Can not wait for an unmarked event");
         cl_int clError = clWaitForEvents(1, &event_);
         if (CL_SUCCESS != clError)
         {
             GMX_THROW(gmx::InternalError("Failed to synchronize on the GPU event: "
                                          + ocl_get_error_string(clError)));
         }
+    }
 
-        reset();
+    /*! \brief Enqueues a wait for the recorded event in stream \p stream. */
+    inline void enqueueWait(const DeviceStream& deviceStream)
+    {
+        GMX_RELEASE_ASSERT(isMarked(), "Can not enqueue an unmarked event");
+        cl_int clError = clEnqueueBarrierWithWaitList(deviceStream.stream(), 1, &event_, nullptr);
+        if (CL_SUCCESS != clError)
+        {
+            GMX_THROW(gmx::InternalError("Failed to enqueue device barrier for the GPU event: "
+                                         + ocl_get_error_string(clError)));
+        }
     }
-    /*! \brief Checks the completion of the underlying event and resets the object if it was. */
+
+    //!  Checks the completion of the underlying event.
     inline bool isReady()
     {
+        GMX_RELEASE_ASSERT(isMarked(), "Can not check the status of unmarked event");
         cl_int result;
         cl_int clError = clGetEventInfo(
                 event_, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &result, nullptr);
@@ -118,44 +119,32 @@ public:
         {
             GMX_THROW(gmx::InternalError("Failed to retrieve event info: " + ocl_get_error_string(clError)));
         }
-        bool hasTriggered = (result == CL_COMPLETE);
-        if (hasTriggered)
-        {
-            reset();
-        }
-        return hasTriggered;
+        return (result == CL_COMPLETE);
     }
-    /*! \brief Enqueues a wait for the recorded event in stream \p stream
-     *
-     *  After enqueue, the associated event is released, so this method should
-     *  be only called once per markEvent() call.
-     */
-    inline void enqueueWaitEvent(const DeviceStream& deviceStream)
-    {
-        cl_int clError = clEnqueueBarrierWithWaitList(deviceStream.stream(), 1, &event_, nullptr);
-        if (CL_SUCCESS != clError)
-        {
-            GMX_THROW(gmx::InternalError("Failed to enqueue device barrier for the GPU event: "
-                                         + ocl_get_error_string(clError)));
-        }
 
-        reset();
-    }
+    //! Checks whether this object encapsulates an underlying event.
+    inline bool isMarked() { return event_ != sc_nullEvent; }
 
     //! Reset (release) the event to unmarked state.
     inline void reset()
     {
-        cl_int clError = clReleaseEvent(event_);
-        if (CL_SUCCESS != clError)
+        if (isMarked())
         {
-            GMX_THROW(gmx::InternalError("Failed to release the GPU event: "
-                                         + ocl_get_error_string(clError)));
+            cl_int clError = clReleaseEvent(event_);
+            if (CL_SUCCESS != clError)
+            {
+                GMX_THROW(gmx::InternalError("Failed to release the GPU event: "
+                                             + ocl_get_error_string(clError)));
+            }
         }
-        event_ = nullptr;
+        event_ = sc_nullEvent;
     }
 
 private:
     cl_event event_;
+
+    //! Magic value to indicate uninitialized state.
+    static constexpr cl_event sc_nullEvent = nullptr;
 };
 
 #endif
diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h b/src/gromacs/gpu_utils/device_event_sycl.h
similarity index 60%
rename from src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h
rename to src/gromacs/gpu_utils/device_event_sycl.h
index dd12a8bd7d..f750776bcd 100644
--- a/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h
+++ b/src/gromacs/gpu_utils/device_event_sycl.h
@@ -47,10 +47,10 @@
  *  \author Andrey Alekseenko <al42and@gmail.com>
  * \inlibraryapi
  */
-#ifndef GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_SYCL_H
-#define GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_SYCL_H
+#ifndef GMX_GPU_UTILS_DEVICE_EVENT_SYCL_H
+#define GMX_GPU_UTILS_DEVICE_EVENT_SYCL_H
 
-#include <optional>
+#include <vector>
 
 #include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/gmxsycl.h"
@@ -58,51 +58,31 @@
 #include "gromacs/utility/gmxassert.h"
 
 #ifndef DOXYGEN
-/*! \libinternal \brief
- * A class which allows for CPU thread to mark and wait for certain GPU stream execution point.
- * The event can be put into the stream with markEvent() and then later waited on with waitForEvent().
- * This can be repeated as necessary, but the current implementation does not allow waiting on
- * completed event more than once, expecting only exact pairs of markEvent(stream); waitForEvent().
- * The class generally attempts to track the correctness of its state transitions, but
- * please note that calling waitForEvent() right after the construction will fail with OpenCL
- * and SYCL but succeed with CUDA.
- *
- * Another possible mode of operation can be implemented if needed:
- * multiple calls to waitForEvent() after a single markEvent(). For this, event.reset() call
- * from waitForEvent() should instead happen conditionally at the beginning of markEvent(), replacing
- * the GMX_ASSERT(). This was tested to work both with CUDA, NVidia OpenCL, and Intel SYCL,
- * but not with AMD/Intel OpenCl.
- *
- *  \warning This class is offered for uniformity with other GPU implementations, but expect it to
- *  be deprecated in the future.
- *
- */
-class GpuEventSynchronizer
+
+class DeviceEvent
 {
 public:
     //! A constructor.
-    GpuEventSynchronizer()
+    DeviceEvent()
     {
         doNotSynchronizeBetweenStreams_ = (std::getenv("GMX_GPU_SYCL_NO_SYNCHRONIZE") != nullptr);
         events_.reserve(1);
     }
     //! A constructor from an existing event.
-    GpuEventSynchronizer(const cl::sycl::event& event) : events_{ event } {}
+    DeviceEvent(const cl::sycl::event& event) : events_{ event } {}
     //! A destructor.
-    ~GpuEventSynchronizer() = default;
-    //! No copying
-    GpuEventSynchronizer(const GpuEventSynchronizer&) = delete;
-    //! No assignment
-    GpuEventSynchronizer& operator=(GpuEventSynchronizer&&) = delete;
-    //! Moving is disabled but can be considered in the future if needed
-    GpuEventSynchronizer(GpuEventSynchronizer&&) = delete;
+    ~DeviceEvent() = default;
+    // Disable copy, move, and assignment. They all can be allowed, but not needed yet.
+    DeviceEvent& operator=(const DeviceEvent&) = delete;
+    DeviceEvent(const DeviceEvent&)            = delete;
+    DeviceEvent& operator=(DeviceEvent&&) = delete;
+    DeviceEvent(DeviceEvent&&)            = delete;
 
     /*! \brief Marks the synchronization point in the \p deviceStream.
-     * Should be called first and then followed by waitForEvent() or enqueueWaitEvent().
+     * Should be called first and then followed by wait() or enqueueWait().
      */
-    inline void markEvent(const DeviceStream& deviceStream)
+    inline void mark(const DeviceStream& deviceStream)
     {
-        GMX_ASSERT(!isMarked(), "Do not call markEvent more than once!");
 #    if GMX_SYCL_HIPSYCL
         // Relies on HIPSYCL_EXT_QUEUE_WAIT_LIST extension
         events_ = deviceStream.stream().get_wait_list();
@@ -111,39 +91,21 @@ public:
         events_ = { deviceStream.stream().submit_barrier() };
 #    endif
     }
-    /*! \brief Synchronizes the host thread on the marked event.
-     * As in the OpenCL implementation, the event is released.
-     */
-    inline void waitForEvent()
+
+    //! Synchronizes the host thread on the marked event.
+    inline void wait()
     {
-        GMX_ASSERT(isMarked(), "Don't call waitForEvent before marking the event!");
 #    if GMX_SYCL_DPCPP
-        GMX_ASSERT(events_.size() == 1, "One event expected in DPCPP, but we have several!");
+        // Note: this is not to prevent use-before-marking, but for checking the DPC++ vs hipSYCL consistency
+        GMX_ASSERT(events_.size() <= 1, "One event expected in DPC++, but we have several!");
 #    endif
         for (auto& event : events_)
         {
             event.wait_and_throw();
         }
-        reset();
-    }
-    /*! \brief Checks the completion of the underlying event and resets the object if it was. */
-    inline bool isReady()
-    {
-        bool allReady = std::all_of(events_.begin(), events_.end(), [](cl::sycl::event& event) {
-            auto info       = event.get_info<cl::sycl::info::event::command_execution_status>();
-            bool isComplete = (info == cl::sycl::info::event_command_status::complete);
-            return isComplete;
-        });
-        if (allReady)
-        {
-            reset();
-        }
-        return allReady;
     }
-    /*! \brief Enqueues a wait for the recorded event in stream \p deviceStream.
-     * As in the OpenCL implementation, the event is released.
-     */
-    inline void enqueueWaitEvent(const DeviceStream& deviceStream)
+
+    inline void enqueueWait(const DeviceStream& deviceStream)
     {
         if (!doNotSynchronizeBetweenStreams_)
         {
@@ -151,21 +113,33 @@ public:
             // Submit an empty kernel that depends on all the events recorded.
             deviceStream.stream().single_task(events_, [=]() {});
 #    else
+            GMX_ASSERT(events_.size() <= 1, "One event expected in DPC++, but we have several!");
             // Relies on SYCL_INTEL_enqueue_barrier extensions
-            GMX_ASSERT(events_.size() == 1, "Only one event expected in DPCPP!");
             deviceStream.stream().submit_barrier(events_);
 #    endif
         }
-        reset();
     }
+
+    //! Checks the completion of the underlying event.
+    inline bool isReady()
+    {
+        bool allReady = std::all_of(events_.begin(), events_.end(), [](cl::sycl::event& event) {
+            auto info       = event.get_info<cl::sycl::info::event::command_execution_status>();
+            bool isComplete = (info == cl::sycl::info::event_command_status::complete);
+            return isComplete;
+        });
+        return allReady;
+    }
+
+    //! Checks whether this object encapsulates an underlying event.
+    inline bool isMarked() { return !events_.empty(); }
+
     //! Reset the event to unmarked state.
     inline void reset() { events_.clear(); }
-    //! Check if the event is marked. Needed for some workarounds for #3988
-    inline bool isMarked() const { return !events_.empty(); }
 
 private:
     std::vector<cl::sycl::event> events_;
-    /*! \brief Dev. setting to no-op enqueueWaitEvent
+    /*! \brief Dev. setting to no-op enqueueWait
      *
      * In SYCL, dependencies between the GPU tasks are managed by the runtime, so manual
      * synchronization between GPU streams should be redundant, but we keep it on by default.
@@ -176,6 +150,6 @@ private:
     bool doNotSynchronizeBetweenStreams_;
 };
 
-#endif // !defined DOXYGEN
+#endif // DOXYGEN
 
-#endif // GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_SYCL_H
+#endif // GMX_GPU_UTILS_DEVICE_EVENT_SYCL_H
diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer.cuh b/src/gromacs/gpu_utils/gpueventsynchronizer.cuh
deleted file mode 100644
index f40c7d446f..0000000000
--- a/src/gromacs/gpu_utils/gpueventsynchronizer.cuh
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-/*! \libinternal \file
- *  \brief Implements a GpuEventSynchronizer class for CUDA.
- *
- *  \author Aleksei Iupinov <a.yupinov@gmail.com>
- *  \inlibraryapi
- */
-#ifndef GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_CUH
-#define GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_CUH
-
-#include "gromacs/gpu_utils/cudautils.cuh"
-#include "gromacs/gpu_utils/device_stream.h"
-#include "gromacs/gpu_utils/gputraits.cuh"
-#include "gromacs/utility/gmxassert.h"
-
-#ifndef DOXYGEN
-
-/*! \libinternal \brief
- * A class which allows for CPU thread to mark and wait for certain GPU stream execution point.
- * The event can be put into the stream with markEvent() and then later waited on with waitForEvent().
- * This can be repeated as necessary, but the current implementation does not allow waiting on
- * completed event more than once, expecting only exact pairs of markEvent(stream); waitForEvent().
- * The class generally attempts to track the correctness of its state transitions, but
- * please note that calling waitForEvent() right after the construction will succeed with CUDA but fail with OpenCL.
- *
- * Another possible mode of operation can be implemented if needed:
- * multiple calls to waitForEvent() after a single markEvent().
- * For this, only some small alterations to gpueventsynchronizer_ocl.h need to be made.
- * This was tested to work both with CUDA and NVidia OpenCL, but not with AMD/Intel OpenCL.
- */
-class GpuEventSynchronizer
-{
-public:
-    GpuEventSynchronizer()
-    {
-        cudaError_t gmx_used_in_debug stat = cudaEventCreateWithFlags(&event_, cudaEventDisableTiming);
-        GMX_RELEASE_ASSERT(stat == cudaSuccess,
-                           ("cudaEventCreate failed. " + gmx::getDeviceErrorString(stat)).c_str());
-    }
-    ~GpuEventSynchronizer()
-    {
-        cudaError_t gmx_used_in_debug stat = cudaEventDestroy(event_);
-        GMX_RELEASE_ASSERT(stat == cudaSuccess,
-                           ("cudaEventDestroy failed. " + gmx::getDeviceErrorString(stat)).c_str());
-    }
-    //! No copying
-    GpuEventSynchronizer(const GpuEventSynchronizer&) = delete;
-    //! No assignment
-    GpuEventSynchronizer& operator=(GpuEventSynchronizer&&) = delete;
-    //! Moving is disabled but can be considered in the future if needed
-    GpuEventSynchronizer(GpuEventSynchronizer&&) = delete;
-
-    /*! \brief Marks the synchronization point in the \p stream.
-     * Should be followed by waitForEvent().
-     */
-    inline void markEvent(const DeviceStream& deviceStream)
-    {
-        cudaError_t gmx_used_in_debug stat = cudaEventRecord(event_, deviceStream.stream());
-        GMX_ASSERT(stat == cudaSuccess,
-                   ("cudaEventRecord failed. " + gmx::getDeviceErrorString(stat)).c_str());
-    }
-    /*! \brief Synchronizes the host thread on the marked event. */
-    inline void waitForEvent()
-    {
-        cudaError_t gmx_used_in_debug stat = cudaEventSynchronize(event_);
-        GMX_ASSERT(stat == cudaSuccess,
-                   ("cudaEventSynchronize failed. " + gmx::getDeviceErrorString(stat)).c_str());
-    }
-    /*! \brief Checks the completion of the underlying event and resets the object if it was. */
-    inline bool isReady()
-    {
-        cudaError_t gmx_used_in_debug stat = cudaEventQuery(event_);
-        GMX_ASSERT((stat == cudaSuccess) || (stat == cudaErrorNotReady),
-                   ("cudaEventQuery failed. " + gmx::getDeviceErrorString(stat)).c_str());
-        return (stat == cudaSuccess);
-    }
-    /*! \brief Enqueues a wait for the recorded event in stream \p stream */
-    inline void enqueueWaitEvent(const DeviceStream& deviceStream)
-    {
-        cudaError_t gmx_used_in_debug stat = cudaStreamWaitEvent(deviceStream.stream(), event_, 0);
-        GMX_ASSERT(stat == cudaSuccess,
-                   ("cudaStreamWaitEvent failed. " + gmx::getDeviceErrorString(stat)).c_str());
-    }
-    //! Reset the event (not needed in CUDA)
-    inline void reset() {}
-
-private:
-    cudaEvent_t event_;
-};
-
-#endif
-
-#endif
diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer.h b/src/gromacs/gpu_utils/gpueventsynchronizer.h
index 3045c34e38..d88dba533f 100644
--- a/src/gromacs/gpu_utils/gpueventsynchronizer.h
+++ b/src/gromacs/gpu_utils/gpueventsynchronizer.h
@@ -33,22 +33,114 @@
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 /*! \libinternal \file
- *  \brief Implements a GpuEventSynchronizer class for CUDA.
+ *  \brief Implements a GpuEventSynchronizer class.
  *
+ *  \author Andrey Alekseenko <al42and@gmail.com>
+ *  \author Artem Zhmurov <zhmurov@gmail.com>
  *  \author Aleksei Iupinov <a.yupinov@gmail.com>
- *  \inlibraryapi
+ * \inlibraryapi
  */
 #ifndef GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_H
 #define GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_H
 
 #include "config.h"
 
-#if GMX_GPU_CUDA
-#    include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
-#elif GMX_GPU_OPENCL
-#    include "gromacs/gpu_utils/gpueventsynchronizer_ocl.h"
-#elif GMX_GPU_SYCL
-#    include "gromacs/gpu_utils/gpueventsynchronizer_sycl.h"
+#include "gromacs/utility/classhelpers.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+
+#include "device_event.h"
+
+/*! \libinternal \brief
+ * A class which allows for CPU thread to mark and wait for certain GPU stream execution point.
+ * The event can be put into the stream with \ref markEvent() and then later waited on with \ref waitForEvent().
+ * This can be repeated as necessary, but the current implementation does not allow waiting on
+ * completed event more than once, expecting only exact pairs of markEvent(stream); waitForEvent().
+ * The class generally attempts to track the correctness of its state transitions, but
+ * please note that calling waitForEvent() right after the construction will fail with OpenCL but succeed with CUDA.
+ */
+class GpuEventSynchronizer
+{
+public:
+    //! A constructor
+    GpuEventSynchronizer() = default;
+    //! A destructor
+    ~GpuEventSynchronizer() = default;
+    //! Remove copy assignment, because we can not copy the underlying event object.
+    GpuEventSynchronizer& operator=(const GpuEventSynchronizer&) = delete;
+    //! Remove copy constructor, because we can not copy the underlying event object.
+    GpuEventSynchronizer(const GpuEventSynchronizer&) = delete;
+    //! Remove move assignment, because we don't allow moving the underlying event object.
+    GpuEventSynchronizer& operator=(GpuEventSynchronizer&&) = delete;
+    //! Remove move constructor, because we don't allow moving the underlying event object.
+    GpuEventSynchronizer(GpuEventSynchronizer&&) = delete;
+
+    /*! \brief Marks the synchronization point in the \p stream.
+     * Should be called first and then followed by \ref waitForEvent().
+     */
+    inline void markEvent(const DeviceStream& deviceStream)
+    {
+#if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA
+        if (event_.isMarked())
+        {
+            GMX_THROW(gmx::InternalError("Trying to mark event before first consuming it"));
+        }
+#endif
+        event_.mark(deviceStream);
+    }
+    /*! \brief Synchronizes the host thread on the marked event. */
+    inline void waitForEvent()
+    {
+#if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA
+        if (!event_.isMarked())
+        {
+            GMX_THROW(gmx::InternalError(
+                    "Trying to wait for event before marking it or after fully consuming it"));
+        }
 #endif
+        event_.wait();
+        reset();
+    }
+    /*! \brief Checks the completion of the underlying event and resets the object if it was. */
+    inline bool isReady()
+    {
+#if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA
+        if (!event_.isMarked())
+        {
+            GMX_THROW(gmx::InternalError("Trying to check the status of event before marking it"));
+        }
+#endif
+        bool isReady = event_.isReady();
+        if (isReady)
+        {
+            reset();
+        }
+        return isReady;
+    }
+    /*! \brief Enqueues a wait for the recorded event in stream \p stream
+     *
+     *  After enqueue, the associated event is released, so this method should
+     *  be only called once per \ref markEvent() call (not enforced in CUDA yet).
+     */
+    inline void enqueueWaitEvent(const DeviceStream& deviceStream)
+    {
+#if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA
+        if (!event_.isMarked())
+        {
+            GMX_THROW(
+                    gmx::InternalError("Trying to enqueue wait for event before marking it or "
+                                       "after fully consuming it"));
+        }
+#endif
+        event_.enqueueWait(deviceStream);
+        reset();
+    }
+
+    //! Resets the event to unmarked state, releasing the underlying event object if needed.
+    inline void reset() { event_.reset(); }
 
-#endif // GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_H
+private:
+    DeviceEvent event_;
+};
+
+#endif
diff --git a/src/gromacs/gpu_utils/tests/gpueventsynchronizer.cpp b/src/gromacs/gpu_utils/tests/gpueventsynchronizer.cpp
index 7f28867762..dc1412c170 100644
--- a/src/gromacs/gpu_utils/tests/gpueventsynchronizer.cpp
+++ b/src/gromacs/gpu_utils/tests/gpueventsynchronizer.cpp
@@ -41,6 +41,8 @@
  */
 #include "gmxpre.h"
 
+#include "config.h"
+
 #include "gromacs/gpu_utils/gpueventsynchronizer.h"
 
 #include <gtest/gtest.h>
@@ -100,6 +102,26 @@ TEST(GpuEventSynchronizerTest, BasicFunctionality)
             gpuEventSynchronizer.markEvent(streamB);
             gpuEventSynchronizer.waitForEvent();
         }
+
+#    if !GMX_GPU_CUDA // CUDA has very lax rules for event consumption. See Issues #2527 and #3988.
+        {
+            SCOPED_TRACE("Wait before marking");
+            GpuEventSynchronizer gpuEventSynchronizer;
+            EXPECT_THROW(gpuEventSynchronizer.waitForEvent(), gmx::InternalError);
+        }
+        {
+            SCOPED_TRACE("enqueueWait before marking");
+            GpuEventSynchronizer gpuEventSynchronizer;
+            EXPECT_THROW(gpuEventSynchronizer.enqueueWaitEvent(streamA), gmx::InternalError);
+        }
+        {
+            SCOPED_TRACE("Wait twice after marking");
+            GpuEventSynchronizer gpuEventSynchronizer;
+            gpuEventSynchronizer.markEvent(streamA);
+            gpuEventSynchronizer.waitForEvent();
+            EXPECT_THROW(gpuEventSynchronizer.waitForEvent(), gmx::InternalError);
+        }
+#    endif
     }
 }
 #endif // GMX_GPU
-- 
2.22.0