No new functionality or added checks.
Preparatory work for adding advanced event accounting logic to GpuEventSynchronizer.
DeviceEvent class contains only backend-specific functions and minimal sanity checks.
The more advanced logic (the one we're trying to fix in #3988) is kept in
GpuEventSynchronizer, and currently left unchanged (1:1 for OpenCL and SYCL,
very relaxed rules for CUDA).
Refs #2527, #3988.
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ * \brief Implements a DeviceEvent class for CUDA.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \author Andrey Alekseenko <al42and@gmail.com>
+ * \inlibraryapi
+ */
+#ifndef GMX_GPU_UTILS_DEVICE_EVENT_CUH
+#define GMX_GPU_UTILS_DEVICE_EVENT_CUH
+
+#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/gpu_utils/gputraits.cuh"
+#include "gromacs/utility/gmxassert.h"
+
+#ifndef DOXYGEN
+
+class DeviceEvent
+{
+public:
+ DeviceEvent()
+ {
+ cudaError_t stat = cudaEventCreateWithFlags(&event_, cudaEventDisableTiming);
+ if (stat != cudaSuccess)
+ {
+ GMX_THROW(gmx::InternalError("cudaEventCreate failed: " + gmx::getDeviceErrorString(stat)));
+ }
+ }
+ ~DeviceEvent() { cudaEventDestroy(event_); }
+ // Disable copy, move, and assignment. Move can be allowed, but not needed yet.
+ DeviceEvent& operator=(const DeviceEvent&) = delete;
+ DeviceEvent(const DeviceEvent&) = delete;
+ DeviceEvent& operator=(DeviceEvent&&) = delete;
+ DeviceEvent(DeviceEvent&&) = delete;
+
+ /*! \brief Marks the synchronization point in the \p stream.
+ * Should be followed by waitForEvent().
+ */
+ inline void mark(const DeviceStream& deviceStream)
+ {
+ cudaError_t stat = cudaEventRecord(event_, deviceStream.stream());
+ if (stat != cudaSuccess)
+ {
+ GMX_THROW(gmx::InternalError("cudaEventRecord failed: " + gmx::getDeviceErrorString(stat)));
+ }
+ }
+ //! Synchronizes the host thread on the marked event.
+ inline void wait()
+ {
+ cudaError_t gmx_used_in_debug stat = cudaEventSynchronize(event_);
+ if (stat != cudaSuccess)
+ {
+ GMX_THROW(gmx::InternalError("cudaEventSynchronize failed: " + gmx::getDeviceErrorString(stat)));
+ }
+ }
+ //! Checks the completion of the underlying event.
+ inline bool isReady()
+ {
+ cudaError_t stat = cudaEventQuery(event_);
+ if (stat != cudaSuccess && stat != cudaErrorNotReady)
+ {
+ GMX_THROW(gmx::InternalError("cudaEventQuery failed: " + gmx::getDeviceErrorString(stat)));
+ }
+ return (stat == cudaSuccess);
+ }
+ //! Enqueues a wait for the recorded event in stream \p stream
+ inline void enqueueWait(const DeviceStream& deviceStream)
+ {
+ cudaError_t stat = cudaStreamWaitEvent(deviceStream.stream(), event_, 0);
+ if (stat != cudaSuccess)
+ {
+ GMX_THROW(gmx::InternalError("cudaStreamWaitEvent failed: " + gmx::getDeviceErrorString(stat)));
+ }
+ }
+ //! Reset the event (not needed in CUDA)
+ inline void reset() {}
+
+private:
+ cudaEvent_t event_;
+};
+
+#endif
+
+#endif
\ No newline at end of file
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ * \brief Declares DeviceEvent for all build configuraitons
+ *
+ * This header may be included from any build configuration and
+ * defers valid GPU declarations to headers valid only in such
+ * build configurations.
+ *
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \inlibraryapi
+ */
+#ifndef GMX_GPU_UTILS_DEVICE_EVENT_H
+#define GMX_GPU_UTILS_DEVICE_EVENT_H
+
+#include "config.h"
+
+#include "gromacs/utility/exceptions.h"
+
+#if !GMX_GPU || defined(DOXYGEN)
+
+class DeviceStream;
+
+// [[noreturn]] attributes must be added to the methods, so it's
+// easier to silence the warning here and avoid them appearing in
+// the Doxygen
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wmissing-noreturn"
+
+class DeviceEvent
+{
+public:
+ DeviceEvent() = default;
+ // Disable copy, move, and assignment. Move can be allowed, but not needed yet.
+ DeviceEvent& operator=(const DeviceEvent&) = delete;
+ DeviceEvent(const DeviceEvent&) = delete;
+ DeviceEvent& operator=(DeviceEvent&&) = delete;
+ DeviceEvent(DeviceEvent&&) = delete;
+
+ /*! \brief Marks the synchronization point in the \p stream.
+ * Should be followed by waitForEvent().
+ */
+ inline void mark(const DeviceStream& /*deviceStream*/) // NOLINT readability-convert-member-functions-to-static
+ {
+ GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build"));
+ }
+ //! Synchronizes the host thread on the marked event.
+ inline void wait() // NOLINT readability-convert-member-functions-to-static
+ {
+ GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build"));
+ }
+ //! Checks the completion of the underlying event.
+ inline bool isReady() // NOLINT readability-convert-member-functions-to-static
+ {
+ GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build"));
+ }
+ //! Enqueues a wait for the recorded event in stream \p stream
+ // NOLINTNEXTLINE readability-convert-member-functions-to-static
+ inline void enqueueWait(const DeviceStream& /*deviceStream*/)
+ {
+ GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build"));
+ }
+ //! Checks whether this object encapsulates an underlying event.
+ inline bool isMarked() // NOLINT readability-convert-member-functions-to-static
+ {
+ GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build"));
+ }
+
+ //! Reset the event (not needed in CUDA)
+ // NOLINTNEXTLINE readability-convert-member-functions-to-static
+ inline void reset() // NOLINT readability-convert-member-functions-to-static
+ {
+ GMX_THROW(gmx::NotImplementedError("Not implemented for non-GPU build"));
+ }
+};
+
+# pragma clang diagnostic pop
+
+#elif GMX_GPU_CUDA
+# include "device_event.cuh"
+#elif GMX_GPU_OPENCL
+# include "device_event_ocl.h"
+#elif GMX_GPU_SYCL
+# include "device_event_sycl.h"
+#endif
+
+#endif // GMX_GPU_UTILS_DEVICE_EVENT_H
* the research papers on the package. Check out http://www.gromacs.org.
*/
/*! \libinternal \file
- * \brief Implements a GpuEventSynchronizer class for OpenCL.
+ * \brief Implements a DeviceEvent class for OpenCL.
*
* \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \author Andrey Alekseenko <al42and@gmail.com>
* \inlibraryapi
*/
-#ifndef GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_OCL_H
-#define GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_OCL_H
+#ifndef GMX_GPU_UTILS_DEVICE_EVENT_OCL_H
+#define GMX_GPU_UTILS_DEVICE_EVENT_OCL_H
-#ifndef DOXYGEN
+#include "gromacs/gpu_utils/gputraits_ocl.h"
+#include "gromacs/gpu_utils/oclutils.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
-# include "gromacs/gpu_utils/gputraits_ocl.h"
-# include "gromacs/gpu_utils/oclutils.h"
-# include "gromacs/utility/exceptions.h"
-# include "gromacs/utility/gmxassert.h"
+#ifndef DOXYGEN
-/*! \libinternal \brief
- * A class which allows for CPU thread to mark and wait for certain GPU stream execution point.
- * The event can be put into the stream with markEvent() and then later waited on with waitForEvent().
- * This can be repeated as necessary, but the current implementation does not allow waiting on
- * completed event more than once, expecting only exact pairs of markEvent(stream); waitForEvent().
- * The class generally attempts to track the correctness of its state transitions, but
- * please note that calling waitForEvent() right after the construction will fail with OpenCL but succeed with CUDA.
- *
- * Another possible mode of operation can be implemented if needed:
- * multiple calls to waitForEvent() after a single markEvent(). For this, clReleaseEvent() call
- * from waitForEvent() should instead happen conditionally at the beginning of markEvent(), replacing
- * the GMX_ASSERT(). This was tested to work both with CUDA and NVidia OpenCL, but not with AMD/Intel OpenCl.
- */
-class GpuEventSynchronizer
+class DeviceEvent
{
public:
//! A constructor
- GpuEventSynchronizer() : event_(nullptr) {}
+ DeviceEvent() : event_(sc_nullEvent) {}
+ DeviceEvent(cl_event event) : event_(event) {}
//! A destructor
- ~GpuEventSynchronizer()
+ ~DeviceEvent()
{
- // This additional code only prevents cl_event leak in an unlikely situation of destructor
- // being called after markEvent() but before waitForEvent() / enqueueWaitEvent().
- if (event_)
+ if (isMarked())
{
+ // Can not throw in destructor, so not checking for any error
clReleaseEvent(event_);
}
}
- //! No copying
- GpuEventSynchronizer(const GpuEventSynchronizer&) = delete;
- //! No assignment
- GpuEventSynchronizer& operator=(GpuEventSynchronizer&&) = delete;
- //! Moving is disabled but can be considered in the future if needed
- GpuEventSynchronizer(GpuEventSynchronizer&&) = delete;
+ // Disable copy, move, and assignment. Move can be allowed, but not needed yet.
+ DeviceEvent& operator=(const DeviceEvent&) = delete;
+ DeviceEvent(const DeviceEvent&) = delete;
+ DeviceEvent& operator=(DeviceEvent&&) = delete;
+ DeviceEvent(DeviceEvent&&) = delete;
/*! \brief Marks the synchronization point in the \p stream.
- * Should be called first and then followed by waitForEvent().
+ * Should be called first and then followed by wait().
*/
- inline void markEvent(const DeviceStream& deviceStream)
+ inline void mark(const DeviceStream& deviceStream)
{
- GMX_ASSERT(nullptr == event_, "Do not call markEvent more than once!");
+ reset();
cl_int clError = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &event_);
if (CL_SUCCESS != clError)
{
+ ocl_get_error_string(clError)));
}
}
+
/*! \brief Synchronizes the host thread on the marked event. */
- inline void waitForEvent()
+ inline void wait()
{
+ GMX_RELEASE_ASSERT(isMarked(), "Can not wait for an unmarked event");
cl_int clError = clWaitForEvents(1, &event_);
if (CL_SUCCESS != clError)
{
GMX_THROW(gmx::InternalError("Failed to synchronize on the GPU event: "
+ ocl_get_error_string(clError)));
}
+ }
- reset();
+ /*! \brief Enqueues a wait for the recorded event in stream \p stream. */
+ inline void enqueueWait(const DeviceStream& deviceStream)
+ {
+ GMX_RELEASE_ASSERT(isMarked(), "Can not enqueue an unmarked event");
+ cl_int clError = clEnqueueBarrierWithWaitList(deviceStream.stream(), 1, &event_, nullptr);
+ if (CL_SUCCESS != clError)
+ {
+ GMX_THROW(gmx::InternalError("Failed to enqueue device barrier for the GPU event: "
+ + ocl_get_error_string(clError)));
+ }
}
- /*! \brief Checks the completion of the underlying event and resets the object if it was. */
+
+ //! Checks the completion of the underlying event.
inline bool isReady()
{
+ GMX_RELEASE_ASSERT(isMarked(), "Can not check the status of unmarked event");
cl_int result;
cl_int clError = clGetEventInfo(
event_, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &result, nullptr);
{
GMX_THROW(gmx::InternalError("Failed to retrieve event info: " + ocl_get_error_string(clError)));
}
- bool hasTriggered = (result == CL_COMPLETE);
- if (hasTriggered)
- {
- reset();
- }
- return hasTriggered;
+ return (result == CL_COMPLETE);
}
- /*! \brief Enqueues a wait for the recorded event in stream \p stream
- *
- * After enqueue, the associated event is released, so this method should
- * be only called once per markEvent() call.
- */
- inline void enqueueWaitEvent(const DeviceStream& deviceStream)
- {
- cl_int clError = clEnqueueBarrierWithWaitList(deviceStream.stream(), 1, &event_, nullptr);
- if (CL_SUCCESS != clError)
- {
- GMX_THROW(gmx::InternalError("Failed to enqueue device barrier for the GPU event: "
- + ocl_get_error_string(clError)));
- }
- reset();
- }
+ //! Checks whether this object encapsulates an underlying event.
+ inline bool isMarked() { return event_ != sc_nullEvent; }
//! Reset (release) the event to unmarked state.
inline void reset()
{
- cl_int clError = clReleaseEvent(event_);
- if (CL_SUCCESS != clError)
+ if (isMarked())
{
- GMX_THROW(gmx::InternalError("Failed to release the GPU event: "
- + ocl_get_error_string(clError)));
+ cl_int clError = clReleaseEvent(event_);
+ if (CL_SUCCESS != clError)
+ {
+ GMX_THROW(gmx::InternalError("Failed to release the GPU event: "
+ + ocl_get_error_string(clError)));
+ }
}
- event_ = nullptr;
+ event_ = sc_nullEvent;
}
private:
cl_event event_;
+
+ //! Magic value to indicate uninitialized state.
+ static constexpr cl_event sc_nullEvent = nullptr;
};
#endif
* \author Andrey Alekseenko <al42and@gmail.com>
* \inlibraryapi
*/
-#ifndef GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_SYCL_H
-#define GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_SYCL_H
+#ifndef GMX_GPU_UTILS_DEVICE_EVENT_SYCL_H
+#define GMX_GPU_UTILS_DEVICE_EVENT_SYCL_H
-#include <optional>
+#include <vector>
#include "gromacs/gpu_utils/device_stream.h"
#include "gromacs/gpu_utils/gmxsycl.h"
#include "gromacs/utility/gmxassert.h"
#ifndef DOXYGEN
-/*! \libinternal \brief
- * A class which allows for CPU thread to mark and wait for certain GPU stream execution point.
- * The event can be put into the stream with markEvent() and then later waited on with waitForEvent().
- * This can be repeated as necessary, but the current implementation does not allow waiting on
- * completed event more than once, expecting only exact pairs of markEvent(stream); waitForEvent().
- * The class generally attempts to track the correctness of its state transitions, but
- * please note that calling waitForEvent() right after the construction will fail with OpenCL
- * and SYCL but succeed with CUDA.
- *
- * Another possible mode of operation can be implemented if needed:
- * multiple calls to waitForEvent() after a single markEvent(). For this, event.reset() call
- * from waitForEvent() should instead happen conditionally at the beginning of markEvent(), replacing
- * the GMX_ASSERT(). This was tested to work both with CUDA, NVidia OpenCL, and Intel SYCL,
- * but not with AMD/Intel OpenCl.
- *
- * \warning This class is offered for uniformity with other GPU implementations, but expect it to
- * be deprecated in the future.
- *
- */
-class GpuEventSynchronizer
+
+class DeviceEvent
{
public:
//! A constructor.
- GpuEventSynchronizer()
+ DeviceEvent()
{
doNotSynchronizeBetweenStreams_ = (std::getenv("GMX_GPU_SYCL_NO_SYNCHRONIZE") != nullptr);
events_.reserve(1);
}
//! A constructor from an existing event.
- GpuEventSynchronizer(const cl::sycl::event& event) : events_{ event } {}
+ DeviceEvent(const cl::sycl::event& event) : events_{ event } {}
//! A destructor.
- ~GpuEventSynchronizer() = default;
- //! No copying
- GpuEventSynchronizer(const GpuEventSynchronizer&) = delete;
- //! No assignment
- GpuEventSynchronizer& operator=(GpuEventSynchronizer&&) = delete;
- //! Moving is disabled but can be considered in the future if needed
- GpuEventSynchronizer(GpuEventSynchronizer&&) = delete;
+ ~DeviceEvent() = default;
+ // Disable copy, move, and assignment. They all can be allowed, but not needed yet.
+ DeviceEvent& operator=(const DeviceEvent&) = delete;
+ DeviceEvent(const DeviceEvent&) = delete;
+ DeviceEvent& operator=(DeviceEvent&&) = delete;
+ DeviceEvent(DeviceEvent&&) = delete;
/*! \brief Marks the synchronization point in the \p deviceStream.
- * Should be called first and then followed by waitForEvent() or enqueueWaitEvent().
+ * Should be called first and then followed by wait() or enqueueWait().
*/
- inline void markEvent(const DeviceStream& deviceStream)
+ inline void mark(const DeviceStream& deviceStream)
{
- GMX_ASSERT(!isMarked(), "Do not call markEvent more than once!");
# if GMX_SYCL_HIPSYCL
// Relies on HIPSYCL_EXT_QUEUE_WAIT_LIST extension
events_ = deviceStream.stream().get_wait_list();
events_ = { deviceStream.stream().submit_barrier() };
# endif
}
- /*! \brief Synchronizes the host thread on the marked event.
- * As in the OpenCL implementation, the event is released.
- */
- inline void waitForEvent()
+
+ //! Synchronizes the host thread on the marked event.
+ inline void wait()
{
- GMX_ASSERT(isMarked(), "Don't call waitForEvent before marking the event!");
# if GMX_SYCL_DPCPP
- GMX_ASSERT(events_.size() == 1, "One event expected in DPCPP, but we have several!");
+ // Note: this is not to prevent use-before-marking, but for checking the DPC++ vs hipSYCL consistency
+ GMX_ASSERT(events_.size() <= 1, "One event expected in DPC++, but we have several!");
# endif
for (auto& event : events_)
{
event.wait_and_throw();
}
- reset();
- }
- /*! \brief Checks the completion of the underlying event and resets the object if it was. */
- inline bool isReady()
- {
- bool allReady = std::all_of(events_.begin(), events_.end(), [](cl::sycl::event& event) {
- auto info = event.get_info<cl::sycl::info::event::command_execution_status>();
- bool isComplete = (info == cl::sycl::info::event_command_status::complete);
- return isComplete;
- });
- if (allReady)
- {
- reset();
- }
- return allReady;
}
- /*! \brief Enqueues a wait for the recorded event in stream \p deviceStream.
- * As in the OpenCL implementation, the event is released.
- */
- inline void enqueueWaitEvent(const DeviceStream& deviceStream)
+
+ inline void enqueueWait(const DeviceStream& deviceStream)
{
if (!doNotSynchronizeBetweenStreams_)
{
// Submit an empty kernel that depends on all the events recorded.
deviceStream.stream().single_task(events_, [=]() {});
# else
+ GMX_ASSERT(events_.size() <= 1, "One event expected in DPC++, but we have several!");
// Relies on SYCL_INTEL_enqueue_barrier extensions
- GMX_ASSERT(events_.size() == 1, "Only one event expected in DPCPP!");
deviceStream.stream().submit_barrier(events_);
# endif
}
- reset();
}
+
+ //! Checks the completion of the underlying event.
+ inline bool isReady()
+ {
+ bool allReady = std::all_of(events_.begin(), events_.end(), [](cl::sycl::event& event) {
+ auto info = event.get_info<cl::sycl::info::event::command_execution_status>();
+ bool isComplete = (info == cl::sycl::info::event_command_status::complete);
+ return isComplete;
+ });
+ return allReady;
+ }
+
+ //! Checks whether this object encapsulates an underlying event.
+ inline bool isMarked() { return !events_.empty(); }
+
//! Reset the event to unmarked state.
inline void reset() { events_.clear(); }
- //! Check if the event is marked. Needed for some workarounds for #3988
- inline bool isMarked() const { return !events_.empty(); }
private:
std::vector<cl::sycl::event> events_;
- /*! \brief Dev. setting to no-op enqueueWaitEvent
+ /*! \brief Dev. setting to no-op enqueueWait
*
* In SYCL, dependencies between the GPU tasks are managed by the runtime, so manual
* synchronization between GPU streams should be redundant, but we keep it on by default.
bool doNotSynchronizeBetweenStreams_;
};
-#endif // !defined DOXYGEN
+#endif // DOXYGEN
-#endif // GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_SYCL_H
+#endif // GMX_GPU_UTILS_DEVICE_EVENT_SYCL_H
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-/*! \libinternal \file
- * \brief Implements a GpuEventSynchronizer class for CUDA.
- *
- * \author Aleksei Iupinov <a.yupinov@gmail.com>
- * \inlibraryapi
- */
-#ifndef GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_CUH
-#define GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_CUH
-
-#include "gromacs/gpu_utils/cudautils.cuh"
-#include "gromacs/gpu_utils/device_stream.h"
-#include "gromacs/gpu_utils/gputraits.cuh"
-#include "gromacs/utility/gmxassert.h"
-
-#ifndef DOXYGEN
-
-/*! \libinternal \brief
- * A class which allows for CPU thread to mark and wait for certain GPU stream execution point.
- * The event can be put into the stream with markEvent() and then later waited on with waitForEvent().
- * This can be repeated as necessary, but the current implementation does not allow waiting on
- * completed event more than once, expecting only exact pairs of markEvent(stream); waitForEvent().
- * The class generally attempts to track the correctness of its state transitions, but
- * please note that calling waitForEvent() right after the construction will succeed with CUDA but fail with OpenCL.
- *
- * Another possible mode of operation can be implemented if needed:
- * multiple calls to waitForEvent() after a single markEvent().
- * For this, only some small alterations to gpueventsynchronizer_ocl.h need to be made.
- * This was tested to work both with CUDA and NVidia OpenCL, but not with AMD/Intel OpenCL.
- */
-class GpuEventSynchronizer
-{
-public:
- GpuEventSynchronizer()
- {
- cudaError_t gmx_used_in_debug stat = cudaEventCreateWithFlags(&event_, cudaEventDisableTiming);
- GMX_RELEASE_ASSERT(stat == cudaSuccess,
- ("cudaEventCreate failed. " + gmx::getDeviceErrorString(stat)).c_str());
- }
- ~GpuEventSynchronizer()
- {
- cudaError_t gmx_used_in_debug stat = cudaEventDestroy(event_);
- GMX_RELEASE_ASSERT(stat == cudaSuccess,
- ("cudaEventDestroy failed. " + gmx::getDeviceErrorString(stat)).c_str());
- }
- //! No copying
- GpuEventSynchronizer(const GpuEventSynchronizer&) = delete;
- //! No assignment
- GpuEventSynchronizer& operator=(GpuEventSynchronizer&&) = delete;
- //! Moving is disabled but can be considered in the future if needed
- GpuEventSynchronizer(GpuEventSynchronizer&&) = delete;
-
- /*! \brief Marks the synchronization point in the \p stream.
- * Should be followed by waitForEvent().
- */
- inline void markEvent(const DeviceStream& deviceStream)
- {
- cudaError_t gmx_used_in_debug stat = cudaEventRecord(event_, deviceStream.stream());
- GMX_ASSERT(stat == cudaSuccess,
- ("cudaEventRecord failed. " + gmx::getDeviceErrorString(stat)).c_str());
- }
- /*! \brief Synchronizes the host thread on the marked event. */
- inline void waitForEvent()
- {
- cudaError_t gmx_used_in_debug stat = cudaEventSynchronize(event_);
- GMX_ASSERT(stat == cudaSuccess,
- ("cudaEventSynchronize failed. " + gmx::getDeviceErrorString(stat)).c_str());
- }
- /*! \brief Checks the completion of the underlying event and resets the object if it was. */
- inline bool isReady()
- {
- cudaError_t gmx_used_in_debug stat = cudaEventQuery(event_);
- GMX_ASSERT((stat == cudaSuccess) || (stat == cudaErrorNotReady),
- ("cudaEventQuery failed. " + gmx::getDeviceErrorString(stat)).c_str());
- return (stat == cudaSuccess);
- }
- /*! \brief Enqueues a wait for the recorded event in stream \p stream */
- inline void enqueueWaitEvent(const DeviceStream& deviceStream)
- {
- cudaError_t gmx_used_in_debug stat = cudaStreamWaitEvent(deviceStream.stream(), event_, 0);
- GMX_ASSERT(stat == cudaSuccess,
- ("cudaStreamWaitEvent failed. " + gmx::getDeviceErrorString(stat)).c_str());
- }
- //! Reset the event (not needed in CUDA)
- inline void reset() {}
-
-private:
- cudaEvent_t event_;
-};
-
-#endif
-
-#endif
* the research papers on the package. Check out http://www.gromacs.org.
*/
/*! \libinternal \file
- * \brief Implements a GpuEventSynchronizer class for CUDA.
+ * \brief Implements a GpuEventSynchronizer class.
*
+ * \author Andrey Alekseenko <al42and@gmail.com>
+ * \author Artem Zhmurov <zhmurov@gmail.com>
* \author Aleksei Iupinov <a.yupinov@gmail.com>
- * \inlibraryapi
+ * \inlibraryapi
*/
#ifndef GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_H
#define GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_H
#include "config.h"
-#if GMX_GPU_CUDA
-# include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
-#elif GMX_GPU_OPENCL
-# include "gromacs/gpu_utils/gpueventsynchronizer_ocl.h"
-#elif GMX_GPU_SYCL
-# include "gromacs/gpu_utils/gpueventsynchronizer_sycl.h"
+#include "gromacs/utility/classhelpers.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+
+#include "device_event.h"
+
+/*! \libinternal \brief
+ * A class which allows for CPU thread to mark and wait for certain GPU stream execution point.
+ * The event can be put into the stream with \ref markEvent() and then later waited on with \ref waitForEvent().
+ * This can be repeated as necessary, but the current implementation does not allow waiting on
+ * completed event more than once, expecting only exact pairs of markEvent(stream); waitForEvent().
+ * The class generally attempts to track the correctness of its state transitions, but
+ * please note that calling waitForEvent() right after the construction will fail with OpenCL but succeed with CUDA.
+ */
+class GpuEventSynchronizer
+{
+public:
+ //! A constructor
+ GpuEventSynchronizer() = default;
+ //! A destructor
+ ~GpuEventSynchronizer() = default;
+ //! Remove copy assignment, because we can not copy the underlying event object.
+ GpuEventSynchronizer& operator=(const GpuEventSynchronizer&) = delete;
+ //! Remove copy constructor, because we can not copy the underlying event object.
+ GpuEventSynchronizer(const GpuEventSynchronizer&) = delete;
+ //! Remove move assignment, because we don't allow moving the underlying event object.
+ GpuEventSynchronizer& operator=(GpuEventSynchronizer&&) = delete;
+ //! Remove move constructor, because we don't allow moving the underlying event object.
+ GpuEventSynchronizer(GpuEventSynchronizer&&) = delete;
+
+ /*! \brief Marks the synchronization point in the \p stream.
+ * Should be called first and then followed by \ref waitForEvent().
+ */
+ inline void markEvent(const DeviceStream& deviceStream)
+ {
+#if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA
+ if (event_.isMarked())
+ {
+ GMX_THROW(gmx::InternalError("Trying to mark event before first consuming it"));
+ }
+#endif
+ event_.mark(deviceStream);
+ }
+ /*! \brief Synchronizes the host thread on the marked event. */
+ inline void waitForEvent()
+ {
+#if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA
+ if (!event_.isMarked())
+ {
+ GMX_THROW(gmx::InternalError(
+ "Trying to wait for event before marking it or after fully consuming it"));
+ }
#endif
+ event_.wait();
+ reset();
+ }
+ /*! \brief Checks the completion of the underlying event and resets the object if it was. */
+ inline bool isReady()
+ {
+#if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA
+ if (!event_.isMarked())
+ {
+ GMX_THROW(gmx::InternalError("Trying to check the status of event before marking it"));
+ }
+#endif
+ bool isReady = event_.isReady();
+ if (isReady)
+ {
+ reset();
+ }
+ return isReady;
+ }
+ /*! \brief Enqueues a wait for the recorded event in stream \p stream
+ *
+ * After enqueue, the associated event is released, so this method should
+ * be only called once per \ref markEvent() call (not enforced in CUDA yet).
+ */
+ inline void enqueueWaitEvent(const DeviceStream& deviceStream)
+ {
+#if !GMX_GPU_CUDA // For now, we have relaxed conditions for CUDA
+ if (!event_.isMarked())
+ {
+ GMX_THROW(
+ gmx::InternalError("Trying to enqueue wait for event before marking it or "
+ "after fully consuming it"));
+ }
+#endif
+ event_.enqueueWait(deviceStream);
+ reset();
+ }
+
+ //! Resets the event to unmarked state, releasing the underlying event object if needed.
+ inline void reset() { event_.reset(); }
-#endif // GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_H
+private:
+ DeviceEvent event_;
+};
+
+#endif
*/
#include "gmxpre.h"
+#include "config.h"
+
#include "gromacs/gpu_utils/gpueventsynchronizer.h"
#include <gtest/gtest.h>
gpuEventSynchronizer.markEvent(streamB);
gpuEventSynchronizer.waitForEvent();
}
+
+# if !GMX_GPU_CUDA // CUDA has very lax rules for event consumption. See Issues #2527 and #3988.
+ {
+ SCOPED_TRACE("Wait before marking");
+ GpuEventSynchronizer gpuEventSynchronizer;
+ EXPECT_THROW(gpuEventSynchronizer.waitForEvent(), gmx::InternalError);
+ }
+ {
+ SCOPED_TRACE("enqueueWait before marking");
+ GpuEventSynchronizer gpuEventSynchronizer;
+ EXPECT_THROW(gpuEventSynchronizer.enqueueWaitEvent(streamA), gmx::InternalError);
+ }
+ {
+ SCOPED_TRACE("Wait twice after marking");
+ GpuEventSynchronizer gpuEventSynchronizer;
+ gpuEventSynchronizer.markEvent(streamA);
+ gpuEventSynchronizer.waitForEvent();
+ EXPECT_THROW(gpuEventSynchronizer.waitForEvent(), gmx::InternalError);
+ }
+# endif
}
}
#endif // GMX_GPU