--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ * \brief Implements the GPU region timer for CUDA.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ *
+ * \inlibraryapi
+ */
+
+#ifndef GMX_GPU_UTILS_GPUREGIONTIMER_CUH
+#define GMX_GPU_UTILS_GPUREGIONTIMER_CUH
+
+#include "gromacs/gpu_utils/cudautils.cuh"
+
+#include "gpuregiontimer.h"
+
+template <> struct GpuTraits<GpuFramework::CUDA>
+{
+ using CommandStream = cudaStream_t;
+ using CommandEvent = void;
+};
+
+//! Short-hand for external use
+using GpuRegionTimer = GpuRegionTimerWrapper<GpuFramework::CUDA>;
+
+template <> class GpuRegionTimerImpl<GpuFramework::CUDA>
+{
+ //! Short-hand
+ using CommandStream = typename GpuTraits<GpuFramework::CUDA>::CommandStream;
+ //! The underlying timing event pair - the beginning and the end of the timespan
+ cudaEvent_t eventStart_, eventStop_;
+
+ public:
+
+ GpuRegionTimerImpl()
+ {
+ const int eventFlags = cudaEventDefault;
+ CU_RET_ERR(cudaEventCreate(&eventStart_, eventFlags), "GPU timing creation failure");
+ CU_RET_ERR(cudaEventCreate(&eventStop_, eventFlags), "GPU timing creation failure");
+ }
+
+ ~GpuRegionTimerImpl()
+ {
+ CU_RET_ERR(cudaEventDestroy(eventStart_), "GPU timing destruction failure");
+ CU_RET_ERR(cudaEventDestroy(eventStop_), "GPU timing destruction failure");
+ }
+
+ inline void openTimingRegion(CommandStream s)
+ {
+ CU_RET_ERR(cudaEventRecord(eventStart_, s), "GPU timing recording failure");
+ }
+
+ inline void closeTimingRegion(CommandStream s)
+ {
+ CU_RET_ERR(cudaEventRecord(eventStop_, s), "GPU timing recording failure");
+ }
+
+ inline double getLastRangeTime()
+ {
+ float milliseconds = 0.0;
+ CU_RET_ERR(cudaEventElapsedTime(&milliseconds, eventStart_, eventStop_), "GPU timing update failure");
+ reset();
+ return milliseconds;
+ }
+
+ inline void reset(){}
+};
+
+#endif
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ * \brief Defines the GPU region timer implementation/wrapper classes.
+ * The implementations live in gpuregiontimer.cuh for CUDA and gpuregiontimer_ocl.h for OpenCL.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ */
+
+#ifndef GMX_GPU_UTILS_GPUREGIONTIMER_H
+#define GMX_GPU_UTILS_GPUREGIONTIMER_H
+
+#include <string>
+
+#include "gromacs/utility/gmxassert.h"
+
+//! Debug GPU timers in debug builds only
+#if defined(NDEBUG)
+static const bool c_debugTimerState = false;
+#else
+static const bool c_debugTimerState = true;
+#endif
+
+/*! \libinternal \brief
+ * Enumeration of possible GPU build-paths.
+ * \todo Move somewhere general?
+ */
+enum class GpuFramework
+{
+ CUDA,
+ OpenCL
+};
+
+/*! \libinternal \brief
+ * GPU build-path traits such as types.
+ * \todo Move somewhere general?
+ */
+template <GpuFramework> struct GpuTraits
+{
+ using CommandStream = void; //!< GPU command stream
+ using CommandEvent = void; //!< Single command call timing event - used in OpenCL
+};
+
+/*! \libinternal \brief
+ * This is a GPU region timing implementation interface.
+ * It should provide methods for measuring the last timespan.
+ */
+template <GpuFramework framework> class GpuRegionTimerImpl
+{
+ //! Short-hands
+ using CommandStream = typename GpuTraits<framework>::CommandStream;
+ using CommandEvent = typename GpuTraits<framework>::CommandEvent;
+
+ public:
+ GpuRegionTimerImpl() = default;
+ ~GpuRegionTimerImpl() = default;
+
+ /*! \brief Will be called before the region start. */
+ inline void openTimingRegion(CommandStream) = 0;
+ /*! \brief Will be called after the region end. */
+ inline void closeTimingRegion(CommandStream) = 0;
+ /*! \brief Resets any internal state if needed */
+ inline void reset() = 0;
+ /*! \brief Returns the last measured region timespan (in milliseconds) and calls reset() */
+ inline double getLastRangeTime() = 0;
+ /*! \brief Returns a new raw timing event
+ * for passing into individual GPU API calls
+ * within the region if the API requires it (e.g. on OpenCL). */
+ inline CommandEvent *fetchNextEvent() = 0;
+};
+
+/*! \libinternal \brief
+ * This is a GPU region timing wrapper class.
+ * It allows for host-side tracking of the execution timespans in GPU code
+ * (measuring kernel or transfers duration).
+ * It also partially tracks the correctness of the timer state transitions,
+ * as far as current implementation allows (see TODO in getLastRangeTime() for a disabled check).
+ * Internally it uses GpuRegionTimerImpl for measuring regions.
+ */
+template <GpuFramework framework> class GpuRegionTimerWrapper
+{
+ //! Short-hands
+ using CommandStream = typename GpuTraits<framework>::CommandStream;
+ using CommandEvent = typename GpuTraits<framework>::CommandEvent;
+ //! The timer state used for debug-only assertions
+ enum class TimerState
+ {
+ Idle,
+ Recording,
+ Stopped
+ } debugState_;
+
+ //! The underlying region timer implementation
+ GpuRegionTimerImpl<framework> impl_;
+
+ public:
+ GpuRegionTimerWrapper() : impl_()
+ {
+ reset();
+ }
+ ~GpuRegionTimerWrapper() = default;
+ /*! \brief
+ * To be called before the region start.
+ *
+ * \param[in] s The GPU command stream where the event being measured takes place.
+ */
+ void openTimingRegion(CommandStream s)
+ {
+ if (c_debugTimerState)
+ {
+ std::string error = "GPU timer should be idle, but is " + std::string((debugState_ == TimerState::Stopped) ? "stopped" : "recording") + ".";
+ GMX_ASSERT(debugState_ == TimerState::Idle, error.c_str());
+ debugState_ = TimerState::Recording;
+ }
+ impl_.openTimingRegion(s);
+ }
+ /*! \brief
+ * To be called after the region end.
+ *
+ * \param[in] s The GPU command stream where the event being measured takes place.
+ */
+ void closeTimingRegion(CommandStream s)
+ {
+ if (c_debugTimerState)
+ {
+ std::string error = "GPU timer should be recording, but is " + std::string((debugState_ == TimerState::Idle) ? "idle" : "stopped") + ".";
+ GMX_ASSERT(debugState_ == TimerState::Recording, error.c_str());
+ debugState_ = TimerState::Stopped;
+ }
+ impl_.closeTimingRegion(s);
+ }
+ /*! \brief
+ * Returns the last timespan, and resets the internal timer state.
+ * To be called after closeTimingRegion() and the command stream of the event having been synchronized.
+ * \returns The last timespan (in milliseconds).
+ */
+ double getLastRangeTime()
+ {
+ if (c_debugTimerState)
+ {
+ /* The assertion below is commented because it is currently triggered on a special case:
+ * the early return before the local non-bonded kernel launch if there is nothing to do.
+ * This can be reproduced in OpenCL build by running
+ * mdrun-mpi-test -ntmpi 2 --gtest_filter=*Empty*
+ * Similarly, the GpuRegionTimerImpl suffers from non-nullptr
+ * cl_event conditionals which ideally should only be asserts.
+ * TODO: improve internal task scheduling, re-enable the assert, turn conditionals into asserts
+ */
+ /*
+ std::string error = "GPU timer should be stopped, but is " + std::string((debugState_ == TimerState::Idle) ? "idle" : "recording") + ".";
+ GMX_ASSERT(debugState_ == TimerState::Stopped, error.c_str());
+ */
+ debugState_ = TimerState::Idle;
+ }
+ double milliseconds = impl_.getLastRangeTime();
+ return milliseconds;
+ }
+ /*! \brief Resets the implementation and total time/call count to zeroes. */
+ void reset()
+ {
+ if (c_debugTimerState)
+ {
+ debugState_ = TimerState::Idle;
+ }
+ impl_.reset();
+ }
+ /*! \brief
+ * Gets a pointer to a new timing event for passing into individual GPU API calls
+ * within the region if they require it (e.g. on OpenCL).
+ * \returns The pointer to the underlying single command timing event.
+ */
+ CommandEvent *fetchNextEvent()
+ {
+ if (c_debugTimerState)
+ {
+ std::string error = "GPU timer should be recording, but is " + std::string((debugState_ == TimerState::Idle) ? "idle" : "stopped") + ".";
+ GMX_ASSERT(debugState_ == TimerState::Recording, error.c_str());
+ }
+ return impl_.fetchNextEvent();
+ }
+};
+
+#endif
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ * \brief Implements the GPU region timer for OpenCL.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ *
+ * \inlibraryapi
+ */
+
+#ifndef GMX_GPU_UTILS_GPUREGIONTIMER_OCL_H
+#define GMX_GPU_UTILS_GPUREGIONTIMER_OCL_H
+
+#include <array>
+
+#include "gromacs/gpu_utils/oclutils.h"
+
+#include "gpuregiontimer.h"
+
+template <> struct GpuTraits<GpuFramework::OpenCL>
+{
+ using CommandStream = cl_command_queue;
+ using CommandEvent = cl_event;
+};
+
+//! Short-hand for external use
+using GpuRegionTimer = GpuRegionTimerWrapper<GpuFramework::OpenCL>;
+
+// cppcheck-suppress noConstructor
+template <> class GpuRegionTimerImpl<GpuFramework::OpenCL>
+{
+ //! Short-hands
+ using CommandStream = typename GpuTraits<GpuFramework::OpenCL>::CommandStream;
+ using CommandEvent = typename GpuTraits<GpuFramework::OpenCL>::CommandEvent;
+
+ /*! \brief The underlying individual timing events array.
+ * The maximum size is chosen arbitrarily to work with current code, and can be changed.
+ * There is simply no need for run-time resizing, and it's unlikely we'll ever need more than 10.
+ */
+ std::array<cl_event, 10> events_ = {{nullptr}};
+ //! Index of the active event
+ size_t currentEvent_ = 0;
+
+ public:
+ GpuRegionTimerImpl() = default;
+ ~GpuRegionTimerImpl() = default;
+ inline void openTimingRegion(CommandStream){}
+ inline void closeTimingRegion(CommandStream){}
+
+ inline double getLastRangeTime()
+ {
+ double milliseconds = 0.0;
+ for (size_t i = 0; i < currentEvent_; i++)
+ {
+ if (events_[i]) // This conditional is ugly, but is required to make some tests (e.g. empty domain) pass
+ {
+ cl_ulong start_ns, end_ns;
+ cl_int gmx_unused cl_error;
+
+ cl_error = clGetEventProfilingInfo(events_[i], CL_PROFILING_COMMAND_START,
+ sizeof(cl_ulong), &start_ns, nullptr);
+ GMX_ASSERT(CL_SUCCESS == cl_error, "GPU timing update failure");
+ cl_error = clGetEventProfilingInfo(events_[i], CL_PROFILING_COMMAND_END,
+ sizeof(cl_ulong), &end_ns, nullptr);
+ GMX_ASSERT(CL_SUCCESS == cl_error, "GPU timing update failure");
+ milliseconds += (end_ns - start_ns) / 1000000.0;
+ }
+ }
+ reset();
+ return milliseconds;
+ }
+
+ inline void reset()
+ {
+ for (size_t i = 0; i < currentEvent_; i++)
+ {
+ if (events_[i]) // This conditional is ugly, but is required to make some tests (e.g. empty domain) pass
+ {
+ GMX_ASSERT(CL_SUCCESS == clReleaseEvent(events_[i]), "OpenCL event release failure");
+ }
+ }
+ currentEvent_ = 0;
+ // As long as we're doing nullptr checks, we might want to be extra cautious.
+ events_.fill(nullptr);
+ }
+
+ inline CommandEvent *fetchNextEvent()
+ {
+ GMX_ASSERT(currentEvent_ < events_.size(), "Increase c_maxEventNumber_ if needed");
+ cl_event *result = &events_[currentEvent_];
+ currentEvent_++;
+ return result;
+ }
+};
+
+#endif
/* beginning of timed HtoD section */
if (bDoTime)
{
- stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
- CU_RET_ERR(stat, "cudaEventRecord failed");
+ t->nb_h2d[iloc].openTimingRegion(stream);
}
/* HtoD x, q */
if (bDoTime)
{
- stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
- CU_RET_ERR(stat, "cudaEventRecord failed");
+ t->nb_h2d[iloc].closeTimingRegion(stream);
}
/* When we get here all misc operations issues in the local stream as well as
/* beginning of timed nonbonded calculation section */
if (bDoTime)
{
- stat = cudaEventRecord(t->start_nb_k[iloc], stream);
- CU_RET_ERR(stat, "cudaEventRecord failed");
+ t->nb_k[iloc].openTimingRegion(stream);
}
/* get the pointer to the kernel flavor we need to use */
if (bDoTime)
{
- stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
- CU_RET_ERR(stat, "cudaEventRecord failed");
+ t->nb_k[iloc].closeTimingRegion(stream);
}
#if (defined(WIN32) || defined( _WIN32 ))
int iloc,
int numParts)
{
- cudaError_t stat;
-
cu_atomdata_t *adat = nb->atdat;
cu_nbparam_t *nbp = nb->nbparam;
cu_plist_t *plist = nb->plist[iloc];
return;
}
- cudaEvent_t startEvent, stopEvent;
+ GpuRegionTimer *timer = nullptr;
if (bDoTime)
{
- startEvent = (plist->haveFreshList ? t->start_prune_k[iloc] : t->start_rollingPrune_k[iloc]);
- stopEvent = (plist->haveFreshList ? t->stop_prune_k[iloc] : t->stop_rollingPrune_k[iloc]);
+ timer = &(plist->haveFreshList ? t->prune_k[iloc] : t->rollingPrune_k[iloc]);
}
/* beginning of timed prune calculation section */
if (bDoTime)
{
- stat = cudaEventRecord(startEvent, stream);
- CU_RET_ERR(stat, "cudaEventRecord failed");
+ timer->openTimingRegion(stream);
}
/* Kernel launch config:
if (bDoTime)
{
- stat = cudaEventRecord(stopEvent, stream);
- CU_RET_ERR(stat, "cudaEventRecord failed");
+ timer->closeTimingRegion(stream);
}
#if (defined(WIN32) || defined( _WIN32 ))
/* beginning of timed D2H section */
if (bDoTime)
{
- stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
- CU_RET_ERR(stat, "cudaEventRecord failed");
+ t->nb_d2h[iloc].openTimingRegion(stream);
}
/* With DD the local D2H transfer can only start after the non-local
if (bDoTime)
{
- stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
- CU_RET_ERR(stat, "cudaEventRecord failed");
+ t->nb_d2h[iloc].closeTimingRegion(stream);
}
}
* \param[inout] timings GPU task timing data
* \param[in] iloc interaction locality
*/
-static void countPruneKernelTime(const cu_timers_t *timers,
+static void countPruneKernelTime(cu_timers_t *timers,
gmx_wallclock_gpu_t *timings,
const int iloc)
{
if (timers->didPrune[iloc])
{
timings->pruneTime.c++;
- timings->pruneTime.t += cu_event_elapsed(timers->start_prune_k[iloc],
- timers->stop_prune_k[iloc]);
+ timings->pruneTime.t += timers->prune_k[iloc].getLastRangeTime();
}
if (timers->didRollingPrune[iloc])
{
timings->dynamicPruneTime.c++;
- timings->dynamicPruneTime.t += cu_event_elapsed(timers->start_rollingPrune_k[iloc],
- timers->stop_rollingPrune_k[iloc]);
+ timings->dynamicPruneTime.t += timers->rollingPrune_k[iloc].getLastRangeTime();
}
}
}
/* kernel timings */
- timings->ktime[plist->haveFreshList ? 1 : 0][bCalcEner ? 1 : 0].t +=
- cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);
+ timings->ktime[plist->haveFreshList ? 1 : 0][bCalcEner ? 1 : 0].t += timers->nb_k[iloc].getLastRangeTime();
/* X/q H2D and F D2H timings */
- timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
- timers->stop_nb_h2d[iloc]);
- timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
- timers->stop_nb_d2h[iloc]);
+ timings->nb_h2d_t += timers->nb_h2d[iloc].getLastRangeTime();
+ timings->nb_d2h_t += timers->nb_d2h[iloc].getLastRangeTime();
/* Count the pruning kernel times for both cases:1st pass (at search step)
and rolling pruning (if called at the previous step).
if (LOCAL_A(aloc))
{
timings->pl_h2d_c++;
- timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
- timers->stop_atdat);
+ timings->pl_h2d_t += timers->atdat.getLastRangeTime();
}
- timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
- timers->stop_pl_h2d[iloc]);
+ timings->pl_h2d_t += timers->pl_h2d[iloc].getLastRangeTime();
/* Clear the timing flag for the next step */
timers->didPairlistH2D[iloc] = false;
#include "nbnxn_cuda.h"
#include "nbnxn_cuda_types.h"
-static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */
-
/* This is a heuristically determined parameter for the Fermi, Kepler
* and Maxwell architectures for the minimum size of ci lists by multiplying
* this constant with the # of multiprocessors on the current device.
/*! Initializes the timer data structure. */
static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
{
- cudaError_t stat;
- int eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync : cudaEventDefault );
-
- stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags);
- CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
- stat = cudaEventCreateWithFlags(&(t->stop_atdat), eventflags);
- CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed");
-
/* The non-local counters/stream (second in the array) are needed only with DD. */
for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
{
- stat = cudaEventCreateWithFlags(&(t->start_nb_k[i]), eventflags);
- CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed");
- stat = cudaEventCreateWithFlags(&(t->stop_nb_k[i]), eventflags);
- CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed");
-
- stat = cudaEventCreateWithFlags(&(t->start_prune_k[i]), eventflags);
- CU_RET_ERR(stat, "cudaEventCreate on start_prune_k failed");
- stat = cudaEventCreateWithFlags(&(t->stop_prune_k[i]), eventflags);
- CU_RET_ERR(stat, "cudaEventCreate on stop_prune_k failed");
-
- stat = cudaEventCreateWithFlags(&(t->start_rollingPrune_k[i]), eventflags);
- CU_RET_ERR(stat, "cudaEventCreate on start_rollingPrune_k failed");
- stat = cudaEventCreateWithFlags(&(t->stop_rollingPrune_k[i]), eventflags);
- CU_RET_ERR(stat, "cudaEventCreate on stop_rollingPrune_k failed");
-
- stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[i]), eventflags);
- CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed");
- stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[i]), eventflags);
- CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed");
-
- stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[i]), eventflags);
- CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed");
- stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[i]), eventflags);
- CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed");
-
- stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[i]), eventflags);
- CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed");
- stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[i]), eventflags);
- CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed");
-
t->didPairlistH2D[i] = false;
t->didPrune[i] = false;
t->didRollingPrune[i] = false;
nb->bUseTwoStreams = bLocalAndNonlocal;
- snew(nb->timers, 1);
+ nb->timers = new cu_timers_t();
snew(nb->timings, 1);
/* init nbst */
}
char sbuf[STRLEN];
- cudaError_t stat;
bool bDoTime = nb->bDoTime;
cudaStream_t stream = nb->stream[iloc];
cu_plist_t *d_plist = nb->plist[iloc];
if (bDoTime)
{
- stat = cudaEventRecord(nb->timers->start_pl_h2d[iloc], stream);
- CU_RET_ERR(stat, "cudaEventRecord failed");
+ nb->timers->pl_h2d[iloc].openTimingRegion(stream);
nb->timers->didPairlistH2D[iloc] = true;
}
if (bDoTime)
{
- stat = cudaEventRecord(nb->timers->stop_pl_h2d[iloc], stream);
- CU_RET_ERR(stat, "cudaEventRecord failed");
+ nb->timers->pl_h2d[iloc].closeTimingRegion(stream);
}
/* the next use of thist list we be the first one, so we need to prune */
if (bDoTime)
{
/* time async copy */
- stat = cudaEventRecord(timers->start_atdat, ls);
- CU_RET_ERR(stat, "cudaEventRecord failed");
+ timers->atdat.openTimingRegion(ls);
}
/* need to reallocate if we have to copy more atoms than the amount of space
if (bDoTime)
{
- stat = cudaEventRecord(timers->stop_atdat, ls);
- CU_RET_ERR(stat, "cudaEventRecord failed");
+ timers->atdat.closeTimingRegion(ls);
}
}
cu_atomdata_t *atdat;
cu_nbparam_t *nbparam;
cu_plist_t *plist, *plist_nl;
- cu_timers_t *timers;
if (nb == NULL)
{
nbparam = nb->nbparam;
plist = nb->plist[eintLocal];
plist_nl = nb->plist[eintNonlocal];
- timers = nb->timers;
nbnxn_cuda_free_nbparam_table(nbparam, nb->dev_info);
stat = cudaEventDestroy(nb->misc_ops_and_local_H2D_done);
CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done");
+ delete nb->timers;
if (nb->bDoTime)
{
- stat = cudaEventDestroy(timers->start_atdat);
- CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat");
- stat = cudaEventDestroy(timers->stop_atdat);
- CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat");
-
/* The non-local counters/stream (second in the array) are needed only with DD. */
for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++)
{
- stat = cudaEventDestroy(timers->start_nb_k[i]);
- CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k");
- stat = cudaEventDestroy(timers->stop_nb_k[i]);
- CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k");
-
- stat = cudaEventDestroy(timers->start_prune_k[i]);
- CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_prune_k");
- stat = cudaEventDestroy(timers->stop_prune_k[i]);
- CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_prune_k");
-
- stat = cudaEventDestroy(timers->start_rollingPrune_k[i]);
- CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_rollingPrune_k");
- stat = cudaEventDestroy(timers->stop_rollingPrune_k[i]);
- CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_rollingPrune_k");
-
- stat = cudaEventDestroy(timers->start_pl_h2d[i]);
- CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d");
- stat = cudaEventDestroy(timers->stop_pl_h2d[i]);
- CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d");
-
stat = cudaStreamDestroy(nb->stream[i]);
CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
-
- stat = cudaEventDestroy(timers->start_nb_h2d[i]);
- CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d");
- stat = cudaEventDestroy(timers->stop_nb_h2d[i]);
- CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d");
-
- stat = cudaEventDestroy(timers->start_nb_d2h[i]);
- CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h");
- stat = cudaEventDestroy(timers->stop_nb_d2h[i]);
- CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h");
}
}
{
sfree(plist_nl);
}
- sfree(timers);
sfree(nb->timings);
sfree(nb);
#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/gpuregiontimer.cuh"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/mdlib/nbnxn_pairlist.h"
#include "gromacs/mdtypes/interaction_const.h"
};
/** \internal
- * \brief CUDA events used for timing GPU kernels and H2D/D2H transfers.
+ * \brief CUDA timers used for timing GPU kernels and H2D/D2H transfers.
*
* The two-sized arrays hold the local and non-local values and should always
* be indexed with eintLocal/eintNonlocal.
*/
struct cu_timers
{
- cudaEvent_t start_atdat; /**< start event for atom data transfer (every PS step) */
- cudaEvent_t stop_atdat; /**< stop event for atom data transfer (every PS step) */
- cudaEvent_t start_nb_h2d[2]; /**< start events for x/q H2D transfers (l/nl, every step) */
- cudaEvent_t stop_nb_h2d[2]; /**< stop events for x/q H2D transfers (l/nl, every step) */
- cudaEvent_t start_nb_d2h[2]; /**< start events for f D2H transfer (l/nl, every step) */
- cudaEvent_t stop_nb_d2h[2]; /**< stop events for f D2H transfer (l/nl, every step) */
- cudaEvent_t start_pl_h2d[2]; /**< start events for pair-list H2D transfers (l/nl, every PS step) */
- cudaEvent_t stop_pl_h2d[2]; /**< start events for pair-list H2D transfers (l/nl, every PS step) */
- bool didPairlistH2D[2]; /**< true when a pair-list transfer has been done at this step */
- cudaEvent_t start_nb_k[2]; /**< start event for non-bonded kernels (l/nl, every step) */
- cudaEvent_t stop_nb_k[2]; /**< stop event non-bonded kernels (l/nl, every step) */
- cudaEvent_t start_prune_k[2]; /**< start event for the 1st pass list pruning kernel (l/nl, every PS step) */
- cudaEvent_t stop_prune_k[2]; /**< stop event for the 1st pass list pruning kernel (l/nl, every PS step) */
- bool didPrune[2]; /**< true when we timed pruning and the timings need to be accounted for */
- cudaEvent_t start_rollingPrune_k[2]; /**< start event for rolling pruning kernels (l/nl, frequency depends on chunk size) */
- cudaEvent_t stop_rollingPrune_k[2]; /**< stop event for rolling pruning kernels (l/nl, frequency depends on chunk size) */
- bool didRollingPrune[2]; /**< true when we timed rolling pruning (at the previous step) and the timings need to be accounted for */
+ GpuRegionTimer atdat; /**< timer for atom data transfer (every PS step) */
+ GpuRegionTimer nb_h2d[2]; /**< timer for x/q H2D transfers (l/nl, every step) */
+ GpuRegionTimer nb_d2h[2]; /**< timer for f D2H transfer (l/nl, every step) */
+ GpuRegionTimer pl_h2d[2]; /**< timer for pair-list H2D transfers (l/nl, every PS step) */
+ bool didPairlistH2D[2]; /**< true when a pair-list transfer has been done at this step */
+ GpuRegionTimer nb_k[2]; /**< timer for non-bonded kernels (l/nl, every step) */
+ GpuRegionTimer prune_k[2]; /**< timer for the 1st pass list pruning kernel (l/nl, every PS step) */
+ bool didPrune[2]; /**< true when we timed pruning and the timings need to be accounted for */
+ GpuRegionTimer rollingPrune_k[2]; /**< timer for rolling pruning kernels (l/nl, frequency depends on chunk size) */
+ bool didRollingPrune[2]; /**< true when we timed rolling pruning (at the previous step) and the timings need to be accounted for */
};
/** \internal
* calling clFinish or clWaitForEvents.
* The function returns 0.0 if the input event, *ocl_event, is 0.
* Don't use this function when more than one wait will be issued for the event.
+ * \todo This function, as well as some CUDA counterparts, is superseded by GpuRegionTimer.
+ * Delete.
*/
-static double ocl_event_elapsed_ms(cl_event *ocl_event)
+static inline double ocl_event_elapsed_ms(cl_event *ocl_event)
{
cl_int gmx_unused cl_error;
cl_ulong start_ns, end_ns;
}
/* beginning of timed HtoD section */
+ if (bDoTime)
+ {
+ t->nb_h2d[iloc].openTimingRegion(stream);
+ }
/* HtoD x, q */
ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
- adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL);
+ adat_len * sizeof(float) * 4, stream, bDoTime ? t->nb_h2d[iloc].fetchNextEvent() : nullptr);
+
+ if (bDoTime)
+ {
+ t->nb_h2d[iloc].closeTimingRegion(stream);
+ }
/* When we get here all misc operations issues in the local stream as well as
the local xq H2D are done,
}
/* beginning of timed nonbonded calculation section */
+ if (bDoTime)
+ {
+ t->nb_k[iloc].openTimingRegion(stream);
+ }
/* get the pointer to the kernel flavor we need to use */
nb_kernel = select_nbnxn_kernel(nb,
{
printf("OpenCL error: %s\n", ocl_get_error_string(cl_error).c_str());
}
- cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? &(t->nb_k[iloc]) : NULL);
+ cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr);
assert(cl_error == CL_SUCCESS);
+ if (bDoTime)
+ {
+ t->nb_k[iloc].closeTimingRegion(stream);
+ }
+
#ifdef DEBUG_OCL
{
static int run_step = 1;
return;
}
+ GpuRegionTimer *timer = nullptr;
+ if (bDoTime)
+ {
+ timer = &(plist->haveFreshList ? t->prune_k[iloc] : t->rollingPrune_k[iloc]);
+ }
+
+ /* beginning of timed prune calculation section */
+ if (bDoTime)
+ {
+ timer->openTimingRegion(stream);
+ }
+
/* Kernel launch config:
* - The thread block dimensions match the size of i-clusters, j-clusters,
* and j-cluster concurrency, in x, y, and z, respectively.
cl_error |= clSetKernelArg(pruneKernel, arg_no++, shmem, nullptr);
assert(cl_error == CL_SUCCESS);
- cl_event *pruneEventPtr = nullptr;
- if (bDoTime)
- {
- pruneEventPtr = plist->haveFreshList ? &t->prune_k[iloc] : &t->rollingPrune_k[iloc];
- }
-
cl_error = clEnqueueNDRangeKernel(stream, pruneKernel, 3,
nullptr, global_work_size, local_work_size,
- 0, nullptr, pruneEventPtr);
+ 0, nullptr, bDoTime ? timer->fetchNextEvent() : nullptr);
GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());
if (plist->haveFreshList)
/* Mark that rolling pruning has been done */
nb->timers->didRollingPrune[iloc] = true;
}
+
+ if (bDoTime)
+ {
+ timer->closeTimingRegion(stream);
+ }
}
/*! \brief
}
/* beginning of timed D2H section */
+ if (bDoTime)
+ {
+ t->nb_d2h[iloc].openTimingRegion(stream);
+ }
/* With DD the local D2H transfer can only start after the non-local
has been launched. */
/* DtoH f */
ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
- (adat_len)* adat->f_elem_size, stream, bDoTime ? &(t->nb_d2h_f[iloc]) : NULL);
+ (adat_len)* adat->f_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
/* kick off work */
cl_error = clFlush(stream);
if (bCalcFshift)
{
ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
- SHIFTS * adat->fshift_elem_size, stream, bDoTime ? &(t->nb_d2h_fshift[iloc]) : NULL);
+ SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
}
/* DtoH energies */
if (bCalcEner)
{
ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
- sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_lj[iloc]) : NULL);
+ sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
- sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_el[iloc]) : NULL);
+ sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
}
}
+
+ if (bDoTime)
+ {
+ t->nb_d2h[iloc].closeTimingRegion(stream);
+ }
}
/*! \brief Count pruning kernel time if either kernel has been triggered
if (timers->didPrune[iloc])
{
timings->pruneTime.c++;
- timings->pruneTime.t += ocl_event_elapsed_ms(&timers->prune_k[iloc]);
+ timings->pruneTime.t += timers->prune_k[iloc].getLastRangeTime();
}
if (timers->didRollingPrune[iloc])
{
timings->dynamicPruneTime.c++;
- timings->dynamicPruneTime.t += ocl_event_elapsed_ms(&timers->rollingPrune_k[iloc]);
+ timings->dynamicPruneTime.t += timers->rollingPrune_k[iloc].getLastRangeTime();
}
}
}
/* kernel timings */
-
- timings->ktime[plist->haveFreshList ? 1 : 0][bCalcEner ? 1 : 0].t +=
- ocl_event_elapsed_ms(timers->nb_k + iloc);
+ timings->ktime[plist->haveFreshList ? 1 : 0][bCalcEner ? 1 : 0].t += timers->nb_k[iloc].getLastRangeTime();
/* X/q H2D and F D2H timings */
- timings->nb_h2d_t += ocl_event_elapsed_ms(timers->nb_h2d + iloc);
- timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_f + iloc);
- timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_fshift + iloc);
- timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_el + iloc);
- timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_lj + iloc);
+ timings->nb_h2d_t += timers->nb_h2d[iloc].getLastRangeTime();
+ timings->nb_d2h_t += timers->nb_d2h[iloc].getLastRangeTime();
/* Count the pruning kernel times for both cases:1st pass (at search step)
and rolling pruning (if called at the previous step).
if (LOCAL_A(aloc))
{
timings->pl_h2d_c++;
- timings->pl_h2d_t += ocl_event_elapsed_ms(&(timers->atdat));
+ timings->pl_h2d_t += timers->atdat.getLastRangeTime();
}
- timings->pl_h2d_t +=
- ocl_event_elapsed_ms(timers->pl_h2d_sci + iloc) +
- ocl_event_elapsed_ms(timers->pl_h2d_cj4 + iloc) +
- ocl_event_elapsed_ms(timers->pl_h2d_excl + iloc);
+ timings->pl_h2d_t += timers->pl_h2d[iloc].getLastRangeTime();
/* Clear the timing flag for the next step */
timers->didPairlistH2D[iloc] = false;
nb->bUseTwoStreams = bLocalAndNonlocal;
- snew(nb->timers, 1);
+ nb->timers = new cl_timers_t();
snew(nb->timings, 1);
/* set device info, just point it to the right GPU among the detected ones */
}
char sbuf[STRLEN];
+ bool bDoTime = nb->bDoTime;
cl_command_queue stream = nb->stream[iloc];
cl_plist_t *d_plist = nb->plist[iloc];
}
}
- if (nb->bDoTime)
+ if (bDoTime)
{
+ nb->timers->pl_h2d[iloc].openTimingRegion(stream);
nb->timers->didPairlistH2D[iloc] = true;
}
&d_plist->nsci, &d_plist->sci_nalloc,
h_plist->nsci,
nb->dev_rundata->context,
- stream, true, &(nb->timers->pl_h2d_sci[iloc]));
+ stream, true, bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
ocl_realloc_buffered(&d_plist->cj4, h_plist->cj4, sizeof(nbnxn_cj4_t),
&d_plist->ncj4, &d_plist->cj4_nalloc,
h_plist->ncj4,
nb->dev_rundata->context,
- stream, true, &(nb->timers->pl_h2d_cj4[iloc]));
+ stream, true, bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
- /* this call only allocates space on the device (no data is transferred) */
+ /* this call only allocates space on the device (no data is transferred) - no timing as well! */
ocl_realloc_buffered(&d_plist->imask, NULL, sizeof(unsigned int),
&d_plist->nimask, &d_plist->imask_nalloc,
h_plist->ncj4*c_nbnxnGpuClusterpairSplit,
nb->dev_rundata->context,
- stream, true, &(nb->timers->pl_h2d_imask[iloc]));
+ stream, true);
ocl_realloc_buffered(&d_plist->excl, h_plist->excl, sizeof(nbnxn_excl_t),
&d_plist->nexcl, &d_plist->excl_nalloc,
h_plist->nexcl,
nb->dev_rundata->context,
- stream, true, &(nb->timers->pl_h2d_excl[iloc]));
+ stream, true, bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+
+ if (bDoTime)
+ {
+ nb->timers->pl_h2d[iloc].closeTimingRegion(stream);
+ }
/* need to prune the pair list during the next step */
d_plist->haveFreshList = true;
natoms = nbat->natoms;
realloced = false;
+ if (bDoTime)
+ {
+ /* time async copy */
+ timers->atdat.openTimingRegion(ls);
+ }
+
/* need to reallocate if we have to copy more atoms than the amount of space
available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
if (natoms > d_atdat->nalloc)
if (useLjCombRule(nb->nbparam->vdwtype))
{
ocl_copy_H2D_async(d_atdat->lj_comb, nbat->lj_comb, 0,
- natoms*sizeof(cl_float2), ls, bDoTime ? &(timers->atdat) : NULL);
+ natoms*sizeof(cl_float2), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
}
else
{
ocl_copy_H2D_async(d_atdat->atom_types, nbat->type, 0,
- natoms*sizeof(int), ls, bDoTime ? &(timers->atdat) : NULL);
+ natoms*sizeof(int), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
}
+ if (bDoTime)
+ {
+ timers->atdat.closeTimingRegion(ls);
+ }
+
/* kick off the tasks enqueued above to ensure concurrency with the search */
cl_error = clFlush(ls);
assert(CL_SUCCESS == cl_error);
sfree(nb->dev_rundata);
/* Free timers and timings */
- sfree(nb->timers);
+ delete nb->timers;
sfree(nb->timings);
sfree(nb);
# include <CL/opencl.h>
#endif
+#include "gromacs/gpu_utils/gpuregiontimer_ocl.h"
#include "gromacs/gpu_utils/oclutils.h"
#include "gromacs/mdlib/nbnxn_pairlist.h"
#include "gromacs/mdtypes/interaction_const.h"
int rollingPruningPart; /**< the next part to which the roling pruning needs to be applied */
}cl_plist_t;
-
/*! \internal
* \brief OpenCL events used for timing GPU kernels and H2D/D2H transfers.
*
*/
typedef struct cl_timers
{
- cl_event atdat; /**< event for atom data transfer (every PS step) */
-
- cl_event nb_h2d[2]; /**< events for x/q H2D transfers (l/nl, every step) */
-
- cl_event nb_d2h_f[2]; /**< events for f D2H transfer (l/nl, every step) */
- cl_event nb_d2h_fshift[2]; /**< events for fshift D2H transfer (l/nl, every step) */
- cl_event nb_d2h_e_el[2]; /**< events for e_el D2H transfer (l/nl, every step) */
- cl_event nb_d2h_e_lj[2]; /**< events for e_lj D2H transfer (l/nl, every step) */
-
- cl_event pl_h2d_sci[2]; /**< events for pair-list sci H2D transfers (l/nl, every PS step) */
- cl_event pl_h2d_cj4[2]; /**< events for pair-list cj4 H2D transfers (l/nl, every PS step) */
- cl_event pl_h2d_excl[2]; /**< events for pair-list excl H2D transfers (l/nl, every PS step)*/
- cl_event pl_h2d_imask[2]; /**< events for pair-list imask H2D transfers (l/nl, every PS step)*/
-
- cl_event nb_k[2]; /**< event for non-bonded kernels (l/nl, every step) */
-
- bool didPairlistH2D[2]; /**< tells if we timed a pair-list transfer */
- cl_event prune_k[2]; /**< event for pruning kernel (every prune step) */
- bool didPrune[2]; /**< tells uf we timed pruning this (or previous step for rolling) and that the timings need to be accounted for */
- cl_event rollingPrune_k[2]; /**< event for rolling pruning kernel (every prune step) */
- bool didRollingPrune[2]; /**< tells uf we timed pruning this (or previous step for rolling) and that the timings need to be accounted for */
+ GpuRegionTimer atdat; /**< timer for atom data transfer (every PS step) */
+ GpuRegionTimer nb_h2d[2]; /**< timer for x/q H2D transfers (l/nl, every step) */
+ GpuRegionTimer nb_d2h[2]; /**< timer for f D2H transfer (l/nl, every step) */
+ GpuRegionTimer pl_h2d[2]; /**< timer for pair-list H2D transfers (l/nl, every PS step) */
+ bool didPairlistH2D[2]; /**< true when a pair-list transfer has been done at this step */
+ GpuRegionTimer nb_k[2]; /**< timer for non-bonded kernels (l/nl, every step) */
+ GpuRegionTimer prune_k[2]; /**< timer for the 1st pass list pruning kernel (l/nl, every PS step) */
+ bool didPrune[2]; /**< true when we timed pruning and the timings need to be accounted for */
+ GpuRegionTimer rollingPrune_k[2]; /**< timer for rolling pruning kernels (l/nl, frequency depends on chunk size) */
+ bool didRollingPrune[2]; /**< true when we timed rolling pruning (at the previous step) and the timings need to be accounted for */
} cl_timers_t;
/*! \internal