From 7180ba07a2fe6ae2e66879ea80ed92bec802aeaf Mon Sep 17 00:00:00 2001
From: Aleksei Iupinov
Date: Fri, 16 Jun 2017 18:04:51 +0200
Subject: [PATCH] Add GpuRegionTimer class

The GpuRegionTimer class is implemented in CUDA/OpenCL to help with timing
regions of GPU code. For CUDA, it contains a pair of CUDA start/stop events;
for OpenCL, it contains a fixed-size array of OpenCL events, which are passed
manually into the OpenCL calls within the region.
Using this class, the cu/cl_timers_t structures and the GPU timing code are
made a bit more uniform.

Change-Id: I8fcd7568653c5bcf3496a2bee227ae03f14a756a
---
 src/gromacs/gpu_utils/gpuregiontimer.cuh      | 103 +++++++++
 src/gromacs/gpu_utils/gpuregiontimer.h        | 216 ++++++++++++++++++
 src/gromacs/gpu_utils/gpuregiontimer_ocl.h    | 129 +++++++++++
 src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu    |  54 ++---
 .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu  |  94 +-------
 .../mdlib/nbnxn_cuda/nbnxn_cuda_types.h       |  30 +--
 src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp     |  90 +++++---
 .../mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp   |  38 ++-
 src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h |  33 +--
 9 files changed, 584 insertions(+), 203 deletions(-)
 create mode 100644 src/gromacs/gpu_utils/gpuregiontimer.cuh
 create mode 100644 src/gromacs/gpu_utils/gpuregiontimer.h
 create mode 100644 src/gromacs/gpu_utils/gpuregiontimer_ocl.h

diff --git a/src/gromacs/gpu_utils/gpuregiontimer.cuh b/src/gromacs/gpu_utils/gpuregiontimer.cuh
new file mode 100644
index 0000000000..b709c1206e
--- /dev/null
+++ b/src/gromacs/gpu_utils/gpuregiontimer.cuh
@@ -0,0 +1,103 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ * \brief Implements the GPU region timer for CUDA.
+ *
+ * \author Aleksei Iupinov
+ *
+ * \inlibraryapi
+ */
+
+#ifndef GMX_GPU_UTILS_GPUREGIONTIMER_CUH
+#define GMX_GPU_UTILS_GPUREGIONTIMER_CUH
+
+#include "gromacs/gpu_utils/cudautils.cuh"
+
+#include "gpuregiontimer.h"
+
+template <> struct GpuTraits<GpuFramework::CUDA>
+{
+    using CommandStream = cudaStream_t;
+    using CommandEvent  = void;
+};
+
+//! Short-hand for external use
+using GpuRegionTimer = GpuRegionTimerWrapper<GpuFramework::CUDA>;
+
+template <> class GpuRegionTimerImpl<GpuFramework::CUDA>
+{
+    //! Short-hand
+    using CommandStream = typename GpuTraits<GpuFramework::CUDA>::CommandStream;
+    //! The underlying timing event pair - the beginning and the end of the timespan
+    cudaEvent_t eventStart_, eventStop_;

+    public:

+        GpuRegionTimerImpl()
+        {
+            const int eventFlags = cudaEventDefault;
+            CU_RET_ERR(cudaEventCreate(&eventStart_, eventFlags), "GPU timing creation failure");
+            CU_RET_ERR(cudaEventCreate(&eventStop_, eventFlags), "GPU timing creation failure");
+        }

+        ~GpuRegionTimerImpl()
+        {
+            CU_RET_ERR(cudaEventDestroy(eventStart_), "GPU timing destruction failure");
+            CU_RET_ERR(cudaEventDestroy(eventStop_), "GPU timing destruction failure");
+        }

+        inline void openTimingRegion(CommandStream s)
+        {
+            CU_RET_ERR(cudaEventRecord(eventStart_, s), "GPU timing recording failure");
+        }

+        inline void closeTimingRegion(CommandStream s)
+        {
+            CU_RET_ERR(cudaEventRecord(eventStop_, s), "GPU timing recording failure");
+        }

+        inline double getLastRangeTime()
+        {
+            float milliseconds = 0.0;
+            CU_RET_ERR(cudaEventElapsedTime(&milliseconds, eventStart_, eventStop_), "GPU timing update failure");
+            reset();
+            return milliseconds;
+        }

+        inline void reset(){}
+};

+#endif
diff --git a/src/gromacs/gpu_utils/gpuregiontimer.h b/src/gromacs/gpu_utils/gpuregiontimer.h
new file mode 100644
index 0000000000..0207f96143
--- /dev/null
+++ b/src/gromacs/gpu_utils/gpuregiontimer.h
@@ -0,0 +1,216 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ * \brief Defines the GPU region timer implementation/wrapper classes.
+ * The implementations live in gpuregiontimer.cuh for CUDA and gpuregiontimer_ocl.h for OpenCL.
+ *
+ * \author Aleksei Iupinov
+ */
+
+#ifndef GMX_GPU_UTILS_GPUREGIONTIMER_H
+#define GMX_GPU_UTILS_GPUREGIONTIMER_H
+
+#include <string>
+
+#include "gromacs/utility/gmxassert.h"
+
+//! Debug GPU timers in debug builds only
+#if defined(NDEBUG)
+static const bool c_debugTimerState = false;
+#else
+static const bool c_debugTimerState = true;
+#endif
+
+/*! \libinternal \brief
+ * Enumeration of possible GPU build-paths.
+ * \todo Move somewhere general?
+ */
+enum class GpuFramework
+{
+    CUDA,
+    OpenCL
+};
+
+/*! \libinternal \brief
+ * GPU build-path traits such as types.
+ * \todo Move somewhere general?
+ */
+template <GpuFramework framework> struct GpuTraits
+{
+    using CommandStream = void; //!< GPU command stream
+    using CommandEvent  = void; //!< Single command call timing event - used in OpenCL
+};
+
+/*! \libinternal \brief
+ * This is a GPU region timing implementation interface.
+ * It should provide methods for measuring the last timespan.
+ */
+template <GpuFramework framework> class GpuRegionTimerImpl
+{
+    //! Short-hands
+    using CommandStream = typename GpuTraits<framework>::CommandStream;
+    using CommandEvent  = typename GpuTraits<framework>::CommandEvent;

+    public:
+        GpuRegionTimerImpl()  = default;
+        ~GpuRegionTimerImpl() = default;

+        /*! \brief Will be called before the region start. */
+        inline void openTimingRegion(CommandStream) = 0;
+        /*! \brief Will be called after the region end. */
+        inline void closeTimingRegion(CommandStream) = 0;
+        /*! \brief Resets any internal state if needed */
+        inline void reset() = 0;
+        /*! \brief Returns the last measured region timespan (in milliseconds) and calls reset() */
+        inline double getLastRangeTime() = 0;
+        /*! \brief Returns a new raw timing event
+         * for passing into individual GPU API calls
+         * within the region if the API requires it (e.g. on OpenCL). */
+        inline CommandEvent *fetchNextEvent() = 0;
+};
+
+/*! \libinternal \brief
+ * This is a GPU region timing wrapper class.
+ * It allows for host-side tracking of the execution timespans in GPU code
+ * (measuring kernel or transfer durations).
+ * It also partially tracks the correctness of the timer state transitions,
+ * as far as the current implementation allows (see TODO in getLastRangeTime() for a disabled check).
+ * Internally it uses GpuRegionTimerImpl for measuring regions.
+ */
+template <GpuFramework framework> class GpuRegionTimerWrapper
+{
+    //! Short-hands
+    using CommandStream = typename GpuTraits<framework>::CommandStream;
+    using CommandEvent  = typename GpuTraits<framework>::CommandEvent;
+    //! The timer state used for debug-only assertions
+    enum class TimerState
+    {
+        Idle,
+        Recording,
+        Stopped
+    } debugState_;

+    //! The underlying region timer implementation
+    GpuRegionTimerImpl<framework> impl_;

+    public:
+        GpuRegionTimerWrapper() : impl_()
+        {
+            reset();
+        }
+        ~GpuRegionTimerWrapper() = default;
+        /*! \brief
+         * To be called before the region start.
+         *
+         * \param[in] s   The GPU command stream where the event being measured takes place.
+         */
+        void openTimingRegion(CommandStream s)
+        {
+            if (c_debugTimerState)
+            {
+                std::string error = "GPU timer should be idle, but is " + std::string((debugState_ == TimerState::Stopped) ? "stopped" : "recording") + ".";
+                GMX_ASSERT(debugState_ == TimerState::Idle, error.c_str());
+                debugState_ = TimerState::Recording;
+            }
+            impl_.openTimingRegion(s);
+        }
+        /*! \brief
+         * To be called after the region end.
+ * + * \param[in] s The GPU command stream where the event being measured takes place. + */ + void closeTimingRegion(CommandStream s) + { + if (c_debugTimerState) + { + std::string error = "GPU timer should be recording, but is " + std::string((debugState_ == TimerState::Idle) ? "idle" : "stopped") + "."; + GMX_ASSERT(debugState_ == TimerState::Recording, error.c_str()); + debugState_ = TimerState::Stopped; + } + impl_.closeTimingRegion(s); + } + /*! \brief + * Returns the last timespan, and resets the internal timer state. + * To be called after closeTimingRegion() and the command stream of the event having been synchronized. + * \returns The last timespan (in milliseconds). + */ + double getLastRangeTime() + { + if (c_debugTimerState) + { + /* The assertion below is commented because it is currently triggered on a special case: + * the early return before the local non-bonded kernel launch if there is nothing to do. + * This can be reproduced in OpenCL build by running + * mdrun-mpi-test -ntmpi 2 --gtest_filter=*Empty* + * Similarly, the GpuRegionTimerImpl suffers from non-nullptr + * cl_event conditionals which ideally should only be asserts. + * TODO: improve internal task scheduling, re-enable the assert, turn conditionals into asserts + */ + /* + std::string error = "GPU timer should be stopped, but is " + std::string((debugState_ == TimerState::Idle) ? "idle" : "recording") + "."; + GMX_ASSERT(debugState_ == TimerState::Stopped, error.c_str()); + */ + debugState_ = TimerState::Idle; + } + double milliseconds = impl_.getLastRangeTime(); + return milliseconds; + } + /*! \brief Resets the implementation and total time/call count to zeroes. */ + void reset() + { + if (c_debugTimerState) + { + debugState_ = TimerState::Idle; + } + impl_.reset(); + } + /*! \brief + * Gets a pointer to a new timing event for passing into individual GPU API calls + * within the region if they require it (e.g. on OpenCL). + * \returns The pointer to the underlying single command timing event. + */ + CommandEvent *fetchNextEvent() + { + if (c_debugTimerState) + { + std::string error = "GPU timer should be recording, but is " + std::string((debugState_ == TimerState::Idle) ? "idle" : "stopped") + "."; + GMX_ASSERT(debugState_ == TimerState::Recording, error.c_str()); + } + return impl_.fetchNextEvent(); + } +}; + +#endif diff --git a/src/gromacs/gpu_utils/gpuregiontimer_ocl.h b/src/gromacs/gpu_utils/gpuregiontimer_ocl.h new file mode 100644 index 0000000000..e0b4b3f87f --- /dev/null +++ b/src/gromacs/gpu_utils/gpuregiontimer_ocl.h @@ -0,0 +1,129 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ * \brief Implements the GPU region timer for OpenCL.
+ *
+ * \author Aleksei Iupinov
+ *
+ * \inlibraryapi
+ */
+
+#ifndef GMX_GPU_UTILS_GPUREGIONTIMER_OCL_H
+#define GMX_GPU_UTILS_GPUREGIONTIMER_OCL_H
+
+#include <array>
+
+#include "gromacs/gpu_utils/oclutils.h"
+
+#include "gpuregiontimer.h"
+
+template <> struct GpuTraits<GpuFramework::OpenCL>
+{
+    using CommandStream = cl_command_queue;
+    using CommandEvent  = cl_event;
+};
+
+//! Short-hand for external use
+using GpuRegionTimer = GpuRegionTimerWrapper<GpuFramework::OpenCL>;
+
+// cppcheck-suppress noConstructor
+template <> class GpuRegionTimerImpl<GpuFramework::OpenCL>
+{
+    //! Short-hands
+    using CommandStream = typename GpuTraits<GpuFramework::OpenCL>::CommandStream;
+    using CommandEvent  = typename GpuTraits<GpuFramework::OpenCL>::CommandEvent;

+    /*! \brief The underlying individual timing events array.
+     * The maximum size is chosen arbitrarily to work with current code, and can be changed.
+     * There is simply no need for run-time resizing, and it's unlikely we'll ever need more than 10.
+     */
+    std::array<cl_event, 10> events_ = {{nullptr}};
+    //! Index of the active event
+    size_t currentEvent_ = 0;

+    public:
+        GpuRegionTimerImpl()  = default;
+        ~GpuRegionTimerImpl() = default;
+        inline void openTimingRegion(CommandStream){}
+        inline void closeTimingRegion(CommandStream){}

+        inline double getLastRangeTime()
+        {
+            double milliseconds = 0.0;
+            for (size_t i = 0; i < currentEvent_; i++)
+            {
+                if (events_[i]) // This conditional is ugly, but is required to make some tests (e.g. empty domain) pass
+                {
+                    cl_ulong          start_ns, end_ns;
+                    cl_int gmx_unused cl_error;

+                    cl_error = clGetEventProfilingInfo(events_[i], CL_PROFILING_COMMAND_START,
+                                                       sizeof(cl_ulong), &start_ns, nullptr);
+                    GMX_ASSERT(CL_SUCCESS == cl_error, "GPU timing update failure");
+                    cl_error = clGetEventProfilingInfo(events_[i], CL_PROFILING_COMMAND_END,
+                                                       sizeof(cl_ulong), &end_ns, nullptr);
+                    GMX_ASSERT(CL_SUCCESS == cl_error, "GPU timing update failure");
+                    milliseconds += (end_ns - start_ns) / 1000000.0;
+                }
+            }
+            reset();
+            return milliseconds;
+        }

+        inline void reset()
+        {
+            for (size_t i = 0; i < currentEvent_; i++)
+            {
+                if (events_[i]) // This conditional is ugly, but is required to make some tests (e.g. empty domain) pass
+                {
+                    GMX_ASSERT(CL_SUCCESS == clReleaseEvent(events_[i]), "OpenCL event release failure");
+                }
+            }
+            currentEvent_ = 0;
+            // As long as we're doing nullptr checks, we might want to be extra cautious.
+ events_.fill(nullptr); + } + + inline CommandEvent *fetchNextEvent() + { + GMX_ASSERT(currentEvent_ < events_.size(), "Increase c_maxEventNumber_ if needed"); + cl_event *result = &events_[currentEvent_]; + currentEvent_++; + return result; + } +}; + +#endif diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu index 86109f7d0c..ed6d41064d 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -384,8 +384,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, /* beginning of timed HtoD section */ if (bDoTime) { - stat = cudaEventRecord(t->start_nb_h2d[iloc], stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + t->nb_h2d[iloc].openTimingRegion(stream); } /* HtoD x, q */ @@ -394,8 +393,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, if (bDoTime) { - stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + t->nb_h2d[iloc].closeTimingRegion(stream); } /* When we get here all misc operations issues in the local stream as well as @@ -433,8 +431,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, /* beginning of timed nonbonded calculation section */ if (bDoTime) { - stat = cudaEventRecord(t->start_nb_k[iloc], stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + t->nb_k[iloc].openTimingRegion(stream); } /* get the pointer to the kernel flavor we need to use */ @@ -490,8 +487,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, if (bDoTime) { - stat = cudaEventRecord(t->stop_nb_k[iloc], stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + t->nb_k[iloc].closeTimingRegion(stream); } #if (defined(WIN32) || defined( _WIN32 )) @@ -517,8 +513,6 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, int iloc, int numParts) { - cudaError_t stat; - cu_atomdata_t *adat = nb->atdat; cu_nbparam_t *nbp = nb->nbparam; cu_plist_t *plist = nb->plist[iloc]; @@ -569,18 +563,16 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, return; } - cudaEvent_t startEvent, stopEvent; + GpuRegionTimer *timer = nullptr; if (bDoTime) { - startEvent = (plist->haveFreshList ? t->start_prune_k[iloc] : t->start_rollingPrune_k[iloc]); - stopEvent = (plist->haveFreshList ? t->stop_prune_k[iloc] : t->stop_rollingPrune_k[iloc]); + timer = &(plist->haveFreshList ? 
t->prune_k[iloc] : t->rollingPrune_k[iloc]); } /* beginning of timed prune calculation section */ if (bDoTime) { - stat = cudaEventRecord(startEvent, stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + timer->openTimingRegion(stream); } /* Kernel launch config: @@ -654,8 +646,7 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, if (bDoTime) { - stat = cudaEventRecord(stopEvent, stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + timer->closeTimingRegion(stream); } #if (defined(WIN32) || defined( _WIN32 )) @@ -719,8 +710,7 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, /* beginning of timed D2H section */ if (bDoTime) { - stat = cudaEventRecord(t->start_nb_d2h[iloc], stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + t->nb_d2h[iloc].openTimingRegion(stream); } /* With DD the local D2H transfer can only start after the non-local @@ -767,8 +757,7 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, if (bDoTime) { - stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + t->nb_d2h[iloc].closeTimingRegion(stream); } } @@ -785,7 +774,7 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, * \param[inout] timings GPU task timing data * \param[in] iloc interaction locality */ -static void countPruneKernelTime(const cu_timers_t *timers, +static void countPruneKernelTime(cu_timers_t *timers, gmx_wallclock_gpu_t *timings, const int iloc) { @@ -798,14 +787,12 @@ static void countPruneKernelTime(const cu_timers_t *timers, if (timers->didPrune[iloc]) { timings->pruneTime.c++; - timings->pruneTime.t += cu_event_elapsed(timers->start_prune_k[iloc], - timers->stop_prune_k[iloc]); + timings->pruneTime.t += timers->prune_k[iloc].getLastRangeTime(); } if (timers->didRollingPrune[iloc]) { timings->dynamicPruneTime.c++; - timings->dynamicPruneTime.t += cu_event_elapsed(timers->start_rollingPrune_k[iloc], - timers->stop_rollingPrune_k[iloc]); + timings->dynamicPruneTime.t += timers->rollingPrune_k[iloc].getLastRangeTime(); } } @@ -868,14 +855,11 @@ void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_cuda_t *nb, } /* kernel timings */ - timings->ktime[plist->haveFreshList ? 1 : 0][bCalcEner ? 1 : 0].t += - cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]); + timings->ktime[plist->haveFreshList ? 1 : 0][bCalcEner ? 1 : 0].t += timers->nb_k[iloc].getLastRangeTime(); /* X/q H2D and F D2H timings */ - timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc], - timers->stop_nb_h2d[iloc]); - timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc], - timers->stop_nb_d2h[iloc]); + timings->nb_h2d_t += timers->nb_h2d[iloc].getLastRangeTime(); + timings->nb_d2h_t += timers->nb_d2h[iloc].getLastRangeTime(); /* Count the pruning kernel times for both cases:1st pass (at search step) and rolling pruning (if called at the previous step). 
@@ -892,12 +876,10 @@ void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_cuda_t *nb, if (LOCAL_A(aloc)) { timings->pl_h2d_c++; - timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat, - timers->stop_atdat); + timings->pl_h2d_t += timers->atdat.getLastRangeTime(); } - timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc], - timers->stop_pl_h2d[iloc]); + timings->pl_h2d_t += timers->pl_h2d[iloc].getLastRangeTime(); /* Clear the timing flag for the next step */ timers->didPairlistH2D[iloc] = false; diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index 991e558631..b0ef37ffea 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -67,8 +67,6 @@ #include "nbnxn_cuda.h" #include "nbnxn_cuda_types.h" -static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */ - /* This is a heuristically determined parameter for the Fermi, Kepler * and Maxwell architectures for the minimum size of ci lists by multiplying * this constant with the # of multiprocessors on the current device. @@ -389,47 +387,9 @@ static void init_plist(cu_plist_t *pl) /*! Initializes the timer data structure. */ static void init_timers(cu_timers_t *t, bool bUseTwoStreams) { - cudaError_t stat; - int eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync : cudaEventDefault ); - - stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed"); - stat = cudaEventCreateWithFlags(&(t->stop_atdat), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed"); - /* The non-local counters/stream (second in the array) are needed only with DD. */ for (int i = 0; i <= (bUseTwoStreams ? 
1 : 0); i++) { - stat = cudaEventCreateWithFlags(&(t->start_nb_k[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed"); - stat = cudaEventCreateWithFlags(&(t->stop_nb_k[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed"); - - stat = cudaEventCreateWithFlags(&(t->start_prune_k[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on start_prune_k failed"); - stat = cudaEventCreateWithFlags(&(t->stop_prune_k[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on stop_prune_k failed"); - - stat = cudaEventCreateWithFlags(&(t->start_rollingPrune_k[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on start_rollingPrune_k failed"); - stat = cudaEventCreateWithFlags(&(t->stop_rollingPrune_k[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on stop_rollingPrune_k failed"); - - stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed"); - stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed"); - - stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed"); - stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed"); - - stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed"); - stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed"); - t->didPairlistH2D[i] = false; t->didPrune[i] = false; t->didRollingPrune[i] = false; @@ -500,7 +460,7 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, nb->bUseTwoStreams = bLocalAndNonlocal; - snew(nb->timers, 1); + nb->timers = new cu_timers_t(); snew(nb->timings, 1); /* init nbst */ @@ -578,7 +538,6 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb, } char sbuf[STRLEN]; - cudaError_t stat; bool bDoTime = nb->bDoTime; cudaStream_t stream = nb->stream[iloc]; cu_plist_t *d_plist = nb->plist[iloc]; @@ -599,8 +558,7 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb, if (bDoTime) { - stat = cudaEventRecord(nb->timers->start_pl_h2d[iloc], stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + nb->timers->pl_h2d[iloc].openTimingRegion(stream); nb->timers->didPairlistH2D[iloc] = true; } @@ -627,8 +585,7 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb, if (bDoTime) { - stat = cudaEventRecord(nb->timers->stop_pl_h2d[iloc], stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + nb->timers->pl_h2d[iloc].closeTimingRegion(stream); } /* the next use of thist list we be the first one, so we need to prune */ @@ -704,8 +661,7 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb, if (bDoTime) { /* time async copy */ - stat = cudaEventRecord(timers->start_atdat, ls); - CU_RET_ERR(stat, "cudaEventRecord failed"); + timers->atdat.openTimingRegion(ls); } /* need to reallocate if we have to copy more atoms than the amount of space @@ -764,8 +720,7 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb, if (bDoTime) { - stat = cudaEventRecord(timers->stop_atdat, ls); - CU_RET_ERR(stat, "cudaEventRecord failed"); + timers->atdat.closeTimingRegion(ls); } } @@ -785,7 +740,6 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) cu_atomdata_t *atdat; cu_nbparam_t *nbparam; cu_plist_t *plist, *plist_nl; - cu_timers_t *timers; if (nb == NULL) { @@ -796,7 +750,6 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) nbparam = 
nb->nbparam; plist = nb->plist[eintLocal]; plist_nl = nb->plist[eintNonlocal]; - timers = nb->timers; nbnxn_cuda_free_nbparam_table(nbparam, nb->dev_info); @@ -805,48 +758,14 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) stat = cudaEventDestroy(nb->misc_ops_and_local_H2D_done); CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done"); + delete nb->timers; if (nb->bDoTime) { - stat = cudaEventDestroy(timers->start_atdat); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat"); - stat = cudaEventDestroy(timers->stop_atdat); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat"); - /* The non-local counters/stream (second in the array) are needed only with DD. */ for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++) { - stat = cudaEventDestroy(timers->start_nb_k[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k"); - stat = cudaEventDestroy(timers->stop_nb_k[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k"); - - stat = cudaEventDestroy(timers->start_prune_k[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_prune_k"); - stat = cudaEventDestroy(timers->stop_prune_k[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_prune_k"); - - stat = cudaEventDestroy(timers->start_rollingPrune_k[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_rollingPrune_k"); - stat = cudaEventDestroy(timers->stop_rollingPrune_k[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_rollingPrune_k"); - - stat = cudaEventDestroy(timers->start_pl_h2d[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d"); - stat = cudaEventDestroy(timers->stop_pl_h2d[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d"); - stat = cudaStreamDestroy(nb->stream[i]); CU_RET_ERR(stat, "cudaStreamDestroy failed on stream"); - - stat = cudaEventDestroy(timers->start_nb_h2d[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d"); - stat = cudaEventDestroy(timers->stop_nb_h2d[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d"); - - stat = cudaEventDestroy(timers->start_nb_d2h[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h"); - stat = cudaEventDestroy(timers->stop_nb_d2h[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h"); } } @@ -897,7 +816,6 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) { sfree(plist_nl); } - sfree(timers); sfree(nb->timings); sfree(nb); diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h index bbd283d075..ceb3a3dafd 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h @@ -48,6 +48,7 @@ #include "gromacs/gpu_utils/cuda_arch_utils.cuh" #include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/gpuregiontimer.cuh" #include "gromacs/mdlib/nbnxn_consts.h" #include "gromacs/mdlib/nbnxn_pairlist.h" #include "gromacs/mdtypes/interaction_const.h" @@ -228,30 +229,23 @@ struct cu_plist }; /** \internal - * \brief CUDA events used for timing GPU kernels and H2D/D2H transfers. + * \brief CUDA timers used for timing GPU kernels and H2D/D2H transfers. * * The two-sized arrays hold the local and non-local values and should always * be indexed with eintLocal/eintNonlocal. 
*/ struct cu_timers { - cudaEvent_t start_atdat; /**< start event for atom data transfer (every PS step) */ - cudaEvent_t stop_atdat; /**< stop event for atom data transfer (every PS step) */ - cudaEvent_t start_nb_h2d[2]; /**< start events for x/q H2D transfers (l/nl, every step) */ - cudaEvent_t stop_nb_h2d[2]; /**< stop events for x/q H2D transfers (l/nl, every step) */ - cudaEvent_t start_nb_d2h[2]; /**< start events for f D2H transfer (l/nl, every step) */ - cudaEvent_t stop_nb_d2h[2]; /**< stop events for f D2H transfer (l/nl, every step) */ - cudaEvent_t start_pl_h2d[2]; /**< start events for pair-list H2D transfers (l/nl, every PS step) */ - cudaEvent_t stop_pl_h2d[2]; /**< start events for pair-list H2D transfers (l/nl, every PS step) */ - bool didPairlistH2D[2]; /**< true when a pair-list transfer has been done at this step */ - cudaEvent_t start_nb_k[2]; /**< start event for non-bonded kernels (l/nl, every step) */ - cudaEvent_t stop_nb_k[2]; /**< stop event non-bonded kernels (l/nl, every step) */ - cudaEvent_t start_prune_k[2]; /**< start event for the 1st pass list pruning kernel (l/nl, every PS step) */ - cudaEvent_t stop_prune_k[2]; /**< stop event for the 1st pass list pruning kernel (l/nl, every PS step) */ - bool didPrune[2]; /**< true when we timed pruning and the timings need to be accounted for */ - cudaEvent_t start_rollingPrune_k[2]; /**< start event for rolling pruning kernels (l/nl, frequency depends on chunk size) */ - cudaEvent_t stop_rollingPrune_k[2]; /**< stop event for rolling pruning kernels (l/nl, frequency depends on chunk size) */ - bool didRollingPrune[2]; /**< true when we timed rolling pruning (at the previous step) and the timings need to be accounted for */ + GpuRegionTimer atdat; /**< timer for atom data transfer (every PS step) */ + GpuRegionTimer nb_h2d[2]; /**< timer for x/q H2D transfers (l/nl, every step) */ + GpuRegionTimer nb_d2h[2]; /**< timer for f D2H transfer (l/nl, every step) */ + GpuRegionTimer pl_h2d[2]; /**< timer for pair-list H2D transfers (l/nl, every PS step) */ + bool didPairlistH2D[2]; /**< true when a pair-list transfer has been done at this step */ + GpuRegionTimer nb_k[2]; /**< timer for non-bonded kernels (l/nl, every step) */ + GpuRegionTimer prune_k[2]; /**< timer for the 1st pass list pruning kernel (l/nl, every PS step) */ + bool didPrune[2]; /**< true when we timed pruning and the timings need to be accounted for */ + GpuRegionTimer rollingPrune_k[2]; /**< timer for rolling pruning kernels (l/nl, frequency depends on chunk size) */ + bool didRollingPrune[2]; /**< true when we timed rolling pruning (at the previous step) and the timings need to be accounted for */ }; /** \internal diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp index 3b37b32a81..813e0bdcfb 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp @@ -376,8 +376,10 @@ static void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event) * calling clFinish or clWaitForEvents. * The function returns 0.0 if the input event, *ocl_event, is 0. * Don't use this function when more than one wait will be issued for the event. + * \todo This function, as well as some CUDA counterparts, is superseded by GpuRegionTimer. + * Delete. 
*/ -static double ocl_event_elapsed_ms(cl_event *ocl_event) +static inline double ocl_event_elapsed_ms(cl_event *ocl_event) { cl_int gmx_unused cl_error; cl_ulong start_ns, end_ns; @@ -484,10 +486,19 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, } /* beginning of timed HtoD section */ + if (bDoTime) + { + t->nb_h2d[iloc].openTimingRegion(stream); + } /* HtoD x, q */ ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4, - adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL); + adat_len * sizeof(float) * 4, stream, bDoTime ? t->nb_h2d[iloc].fetchNextEvent() : nullptr); + + if (bDoTime) + { + t->nb_h2d[iloc].closeTimingRegion(stream); + } /* When we get here all misc operations issues in the local stream as well as the local xq H2D are done, @@ -534,6 +545,10 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, } /* beginning of timed nonbonded calculation section */ + if (bDoTime) + { + t->nb_k[iloc].openTimingRegion(stream); + } /* get the pointer to the kernel flavor we need to use */ nb_kernel = select_nbnxn_kernel(nb, @@ -625,9 +640,14 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, { printf("OpenCL error: %s\n", ocl_get_error_string(cl_error).c_str()); } - cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? &(t->nb_k[iloc]) : NULL); + cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr); assert(cl_error == CL_SUCCESS); + if (bDoTime) + { + t->nb_k[iloc].closeTimingRegion(stream); + } + #ifdef DEBUG_OCL { static int run_step = 1; @@ -763,6 +783,18 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, return; } + GpuRegionTimer *timer = nullptr; + if (bDoTime) + { + timer = &(plist->haveFreshList ? t->prune_k[iloc] : t->rollingPrune_k[iloc]); + } + + /* beginning of timed prune calculation section */ + if (bDoTime) + { + timer->openTimingRegion(stream); + } + /* Kernel launch config: * - The thread block dimensions match the size of i-clusters, j-clusters, * and j-cluster concurrency, in x, y, and z, respectively. @@ -812,15 +844,9 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, cl_error |= clSetKernelArg(pruneKernel, arg_no++, shmem, nullptr); assert(cl_error == CL_SUCCESS); - cl_event *pruneEventPtr = nullptr; - if (bDoTime) - { - pruneEventPtr = plist->haveFreshList ? &t->prune_k[iloc] : &t->rollingPrune_k[iloc]; - } - cl_error = clEnqueueNDRangeKernel(stream, pruneKernel, 3, nullptr, global_work_size, local_work_size, - 0, nullptr, pruneEventPtr); + 0, nullptr, bDoTime ? timer->fetchNextEvent() : nullptr); GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str()); if (plist->haveFreshList) @@ -834,6 +860,11 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, /* Mark that rolling pruning has been done */ nb->timers->didRollingPrune[iloc] = true; } + + if (bDoTime) + { + timer->closeTimingRegion(stream); + } } /*! \brief @@ -904,6 +935,10 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, } /* beginning of timed D2H section */ + if (bDoTime) + { + t->nb_d2h[iloc].openTimingRegion(stream); + } /* With DD the local D2H transfer can only start after the non-local has been launched. 
*/ @@ -914,7 +949,7 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, /* DtoH f */ ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float), - (adat_len)* adat->f_elem_size, stream, bDoTime ? &(t->nb_d2h_f[iloc]) : NULL); + (adat_len)* adat->f_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr); /* kick off work */ cl_error = clFlush(stream); @@ -942,19 +977,24 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, if (bCalcFshift) { ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0, - SHIFTS * adat->fshift_elem_size, stream, bDoTime ? &(t->nb_d2h_fshift[iloc]) : NULL); + SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr); } /* DtoH energies */ if (bCalcEner) { ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, - sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_lj[iloc]) : NULL); + sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr); ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0, - sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_el[iloc]) : NULL); + sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr); } } + + if (bDoTime) + { + t->nb_d2h[iloc].closeTimingRegion(stream); + } } /*! \brief Count pruning kernel time if either kernel has been triggered @@ -983,13 +1023,13 @@ static void countPruneKernelTime(cl_timers_t *timers, if (timers->didPrune[iloc]) { timings->pruneTime.c++; - timings->pruneTime.t += ocl_event_elapsed_ms(&timers->prune_k[iloc]); + timings->pruneTime.t += timers->prune_k[iloc].getLastRangeTime(); } if (timers->didRollingPrune[iloc]) { timings->dynamicPruneTime.c++; - timings->dynamicPruneTime.t += ocl_event_elapsed_ms(&timers->rollingPrune_k[iloc]); + timings->dynamicPruneTime.t += timers->rollingPrune_k[iloc].getLastRangeTime(); } } @@ -1057,16 +1097,11 @@ void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_ocl_t *nb, } /* kernel timings */ - - timings->ktime[plist->haveFreshList ? 1 : 0][bCalcEner ? 1 : 0].t += - ocl_event_elapsed_ms(timers->nb_k + iloc); + timings->ktime[plist->haveFreshList ? 1 : 0][bCalcEner ? 1 : 0].t += timers->nb_k[iloc].getLastRangeTime(); /* X/q H2D and F D2H timings */ - timings->nb_h2d_t += ocl_event_elapsed_ms(timers->nb_h2d + iloc); - timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_f + iloc); - timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_fshift + iloc); - timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_el + iloc); - timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_lj + iloc); + timings->nb_h2d_t += timers->nb_h2d[iloc].getLastRangeTime(); + timings->nb_d2h_t += timers->nb_d2h[iloc].getLastRangeTime(); /* Count the pruning kernel times for both cases:1st pass (at search step) and rolling pruning (if called at the previous step). 
@@ -1083,13 +1118,10 @@ void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_ocl_t *nb, if (LOCAL_A(aloc)) { timings->pl_h2d_c++; - timings->pl_h2d_t += ocl_event_elapsed_ms(&(timers->atdat)); + timings->pl_h2d_t += timers->atdat.getLastRangeTime(); } - timings->pl_h2d_t += - ocl_event_elapsed_ms(timers->pl_h2d_sci + iloc) + - ocl_event_elapsed_ms(timers->pl_h2d_cj4 + iloc) + - ocl_event_elapsed_ms(timers->pl_h2d_excl + iloc); + timings->pl_h2d_t += timers->pl_h2d[iloc].getLastRangeTime(); /* Clear the timing flag for the next step */ timers->didPairlistH2D[iloc] = false; diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp index 399ea68b82..6dd8e4f669 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp @@ -746,7 +746,7 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb, nb->bUseTwoStreams = bLocalAndNonlocal; - snew(nb->timers, 1); + nb->timers = new cl_timers_t(); snew(nb->timings, 1); /* set device info, just point it to the right GPU among the detected ones */ @@ -907,6 +907,7 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, } char sbuf[STRLEN]; + bool bDoTime = nb->bDoTime; cl_command_queue stream = nb->stream[iloc]; cl_plist_t *d_plist = nb->plist[iloc]; @@ -924,8 +925,9 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, } } - if (nb->bDoTime) + if (bDoTime) { + nb->timers->pl_h2d[iloc].openTimingRegion(stream); nb->timers->didPairlistH2D[iloc] = true; } @@ -933,26 +935,31 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, &d_plist->nsci, &d_plist->sci_nalloc, h_plist->nsci, nb->dev_rundata->context, - stream, true, &(nb->timers->pl_h2d_sci[iloc])); + stream, true, bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr); ocl_realloc_buffered(&d_plist->cj4, h_plist->cj4, sizeof(nbnxn_cj4_t), &d_plist->ncj4, &d_plist->cj4_nalloc, h_plist->ncj4, nb->dev_rundata->context, - stream, true, &(nb->timers->pl_h2d_cj4[iloc])); + stream, true, bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr); - /* this call only allocates space on the device (no data is transferred) */ + /* this call only allocates space on the device (no data is transferred) - no timing as well! */ ocl_realloc_buffered(&d_plist->imask, NULL, sizeof(unsigned int), &d_plist->nimask, &d_plist->imask_nalloc, h_plist->ncj4*c_nbnxnGpuClusterpairSplit, nb->dev_rundata->context, - stream, true, &(nb->timers->pl_h2d_imask[iloc])); + stream, true); ocl_realloc_buffered(&d_plist->excl, h_plist->excl, sizeof(nbnxn_excl_t), &d_plist->nexcl, &d_plist->excl_nalloc, h_plist->nexcl, nb->dev_rundata->context, - stream, true, &(nb->timers->pl_h2d_excl[iloc])); + stream, true, bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr); + + if (bDoTime) + { + nb->timers->pl_h2d[iloc].closeTimingRegion(stream); + } /* need to prune the pair list during the next step */ d_plist->haveFreshList = true; @@ -989,6 +996,12 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t *nb, natoms = nbat->natoms; realloced = false; + if (bDoTime) + { + /* time async copy */ + timers->atdat.openTimingRegion(ls); + } + /* need to reallocate if we have to copy more atoms than the amount of space available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */ if (natoms > d_atdat->nalloc) @@ -1046,15 +1059,20 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t *nb, if (useLjCombRule(nb->nbparam->vdwtype)) { ocl_copy_H2D_async(d_atdat->lj_comb, nbat->lj_comb, 0, - natoms*sizeof(cl_float2), ls, bDoTime ? 
&(timers->atdat) : NULL); + natoms*sizeof(cl_float2), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr); } else { ocl_copy_H2D_async(d_atdat->atom_types, nbat->type, 0, - natoms*sizeof(int), ls, bDoTime ? &(timers->atdat) : NULL); + natoms*sizeof(int), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr); } + if (bDoTime) + { + timers->atdat.closeTimingRegion(ls); + } + /* kick off the tasks enqueued above to ensure concurrency with the search */ cl_error = clFlush(ls); assert(CL_SUCCESS == cl_error); @@ -1211,7 +1229,7 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb) sfree(nb->dev_rundata); /* Free timers and timings */ - sfree(nb->timers); + delete nb->timers; sfree(nb->timings); sfree(nb); diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h index 81b2df83b0..74e98db9e3 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h @@ -55,6 +55,7 @@ # include #endif +#include "gromacs/gpu_utils/gpuregiontimer_ocl.h" #include "gromacs/gpu_utils/oclutils.h" #include "gromacs/mdlib/nbnxn_pairlist.h" #include "gromacs/mdtypes/interaction_const.h" @@ -297,7 +298,6 @@ typedef struct cl_plist int rollingPruningPart; /**< the next part to which the roling pruning needs to be applied */ }cl_plist_t; - /*! \internal * \brief OpenCL events used for timing GPU kernels and H2D/D2H transfers. * @@ -306,27 +306,16 @@ typedef struct cl_plist */ typedef struct cl_timers { - cl_event atdat; /**< event for atom data transfer (every PS step) */ - - cl_event nb_h2d[2]; /**< events for x/q H2D transfers (l/nl, every step) */ - - cl_event nb_d2h_f[2]; /**< events for f D2H transfer (l/nl, every step) */ - cl_event nb_d2h_fshift[2]; /**< events for fshift D2H transfer (l/nl, every step) */ - cl_event nb_d2h_e_el[2]; /**< events for e_el D2H transfer (l/nl, every step) */ - cl_event nb_d2h_e_lj[2]; /**< events for e_lj D2H transfer (l/nl, every step) */ - - cl_event pl_h2d_sci[2]; /**< events for pair-list sci H2D transfers (l/nl, every PS step) */ - cl_event pl_h2d_cj4[2]; /**< events for pair-list cj4 H2D transfers (l/nl, every PS step) */ - cl_event pl_h2d_excl[2]; /**< events for pair-list excl H2D transfers (l/nl, every PS step)*/ - cl_event pl_h2d_imask[2]; /**< events for pair-list imask H2D transfers (l/nl, every PS step)*/ - - cl_event nb_k[2]; /**< event for non-bonded kernels (l/nl, every step) */ - - bool didPairlistH2D[2]; /**< tells if we timed a pair-list transfer */ - cl_event prune_k[2]; /**< event for pruning kernel (every prune step) */ - bool didPrune[2]; /**< tells uf we timed pruning this (or previous step for rolling) and that the timings need to be accounted for */ - cl_event rollingPrune_k[2]; /**< event for rolling pruning kernel (every prune step) */ - bool didRollingPrune[2]; /**< tells uf we timed pruning this (or previous step for rolling) and that the timings need to be accounted for */ + GpuRegionTimer atdat; /**< timer for atom data transfer (every PS step) */ + GpuRegionTimer nb_h2d[2]; /**< timer for x/q H2D transfers (l/nl, every step) */ + GpuRegionTimer nb_d2h[2]; /**< timer for f D2H transfer (l/nl, every step) */ + GpuRegionTimer pl_h2d[2]; /**< timer for pair-list H2D transfers (l/nl, every PS step) */ + bool didPairlistH2D[2]; /**< true when a pair-list transfer has been done at this step */ + GpuRegionTimer nb_k[2]; /**< timer for non-bonded kernels (l/nl, every step) */ + GpuRegionTimer prune_k[2]; /**< timer for the 1st pass list pruning kernel (l/nl, every PS 
step) */ + bool didPrune[2]; /**< true when we timed pruning and the timings need to be accounted for */ + GpuRegionTimer rollingPrune_k[2]; /**< timer for rolling pruning kernels (l/nl, frequency depends on chunk size) */ + bool didRollingPrune[2]; /**< true when we timed rolling pruning (at the previous step) and the timings need to be accounted for */ } cl_timers_t; /*! \internal -- 2.22.0
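
Usage note, for reference only (not part of the patch): the wrapper is meant to bracket asynchronously issued GPU work in a given stream, and getLastRangeTime() is read back only after that stream has been synchronized. A minimal CUDA-side sketch follows; the caller function, buffer names, and sizes are illustrative assumptions, not code from this change.

    #include "gromacs/gpu_utils/gpuregiontimer.cuh"

    // Hypothetical caller: times an asynchronous H2D copy in an existing stream.
    // Error checking of the CUDA runtime calls is omitted for brevity.
    void timedHostToDeviceCopy(float *d_dst, const float *h_src, size_t bytes, cudaStream_t stream)
    {
        GpuRegionTimer timer;                  // CUDA specialization: a start/stop cudaEvent_t pair
        timer.openTimingRegion(stream);        // records the start event in the stream
        cudaMemcpyAsync(d_dst, h_src, bytes, cudaMemcpyHostToDevice, stream);
        timer.closeTimingRegion(stream);       // records the stop event in the stream
        cudaStreamSynchronize(stream);         // both events must have completed before reading the time
        double ms = timer.getLastRangeTime();  // elapsed milliseconds; also resets the timer state
        (void)ms;                              // e.g. accumulate into gmx_wallclock_gpu_t in real callers
    }

On the OpenCL path the same region would additionally pass timer.fetchNextEvent() into each clEnqueue* call issued inside the region, since OpenCL profiling information is attached to per-command events rather than to the queue.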