From 7180ba07a2fe6ae2e66879ea80ed92bec802aeaf Mon Sep 17 00:00:00 2001
From: Aleksei Iupinov
Date: Fri, 16 Jun 2017 18:04:51 +0200
Subject: [PATCH] Add GpuRegionTimer class

The GpuRegionTimer class is implemented in CUDA/OpenCL to help with timing
regions of GPU code. For CUDA, it contains a pair of CUDA start/stop events;
for OpenCL, it contains a fixed-size array of OpenCL events, which are passed
manually into the OpenCL calls within the region.
Using this class, the cu/cl_timers_t structures and the GPU timing code are
made a bit more uniform.

Change-Id: I8fcd7568653c5bcf3496a2bee227ae03f14a756a
---
 src/gromacs/gpu_utils/gpuregiontimer.cuh      | 103 +++++++++
 src/gromacs/gpu_utils/gpuregiontimer.h        | 216 ++++++++++++++++++
 src/gromacs/gpu_utils/gpuregiontimer_ocl.h    | 129 +++++++++++
 src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu    |  54 ++---
 .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu  |  94 +-------
 .../mdlib/nbnxn_cuda/nbnxn_cuda_types.h       |  30 +--
 src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp     |  90 +++++---
 .../mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp   |  38 ++-
 src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h |  33 +--
 9 files changed, 584 insertions(+), 203 deletions(-)
 create mode 100644 src/gromacs/gpu_utils/gpuregiontimer.cuh
 create mode 100644 src/gromacs/gpu_utils/gpuregiontimer.h
 create mode 100644 src/gromacs/gpu_utils/gpuregiontimer_ocl.h

diff --git a/src/gromacs/gpu_utils/gpuregiontimer.cuh b/src/gromacs/gpu_utils/gpuregiontimer.cuh
new file mode 100644
index 0000000000..b709c1206e
--- /dev/null
+++ b/src/gromacs/gpu_utils/gpuregiontimer.cuh
@@ -0,0 +1,103 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ * \brief Implements the GPU region timer for CUDA.
+ *
+ * \author Aleksei Iupinov
+ *
+ * \inlibraryapi
+ */
+
+#ifndef GMX_GPU_UTILS_GPUREGIONTIMER_CUH
+#define GMX_GPU_UTILS_GPUREGIONTIMER_CUH
+
+#include "gromacs/gpu_utils/cudautils.cuh"
+
+#include "gpuregiontimer.h"
+
+template <> struct GpuTraits<GpuFramework::CUDA>
+{
+    using CommandStream = cudaStream_t;
+    using CommandEvent  = void;
+};
+
+//! Short-hand for external use
+using GpuRegionTimer = GpuRegionTimerWrapper<GpuFramework::CUDA>;
+
+template <> class GpuRegionTimerImpl<GpuFramework::CUDA>
+{
+    //! Short-hand
+    using CommandStream = typename GpuTraits<GpuFramework::CUDA>::CommandStream;
+    //! The underlying timing event pair - the beginning and the end of the timespan
+    cudaEvent_t eventStart_, eventStop_;

+    public:

+        GpuRegionTimerImpl()
+        {
+            const int eventFlags = cudaEventDefault;
+            CU_RET_ERR(cudaEventCreate(&eventStart_, eventFlags), "GPU timing creation failure");
+            CU_RET_ERR(cudaEventCreate(&eventStop_, eventFlags), "GPU timing creation failure");
+        }

+        ~GpuRegionTimerImpl()
+        {
+            CU_RET_ERR(cudaEventDestroy(eventStart_), "GPU timing destruction failure");
+            CU_RET_ERR(cudaEventDestroy(eventStop_), "GPU timing destruction failure");
+        }

+        inline void openTimingRegion(CommandStream s)
+        {
+            CU_RET_ERR(cudaEventRecord(eventStart_, s), "GPU timing recording failure");
+        }

+        inline void closeTimingRegion(CommandStream s)
+        {
+            CU_RET_ERR(cudaEventRecord(eventStop_, s), "GPU timing recording failure");
+        }

+        inline double getLastRangeTime()
+        {
+            float milliseconds = 0.0;
+            CU_RET_ERR(cudaEventElapsedTime(&milliseconds, eventStart_, eventStop_), "GPU timing update failure");
+            reset();
+            return milliseconds;
+        }

+        inline void reset(){}
+};

+#endif
diff --git a/src/gromacs/gpu_utils/gpuregiontimer.h b/src/gromacs/gpu_utils/gpuregiontimer.h
new file mode 100644
index 0000000000..0207f96143
--- /dev/null
+++ b/src/gromacs/gpu_utils/gpuregiontimer.h
@@ -0,0 +1,216 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ * \brief Defines the GPU region timer implementation/wrapper classes.
+ * The implementations live in gpuregiontimer.cuh for CUDA and gpuregiontimer_ocl.h for OpenCL.
+ *
+ * \author Aleksei Iupinov
+ */
+
+#ifndef GMX_GPU_UTILS_GPUREGIONTIMER_H
+#define GMX_GPU_UTILS_GPUREGIONTIMER_H
+
+#include <string>
+
+#include "gromacs/utility/gmxassert.h"
+
+//! Debug GPU timers in debug builds only
+#if defined(NDEBUG)
+static const bool c_debugTimerState = false;
+#else
+static const bool c_debugTimerState = true;
+#endif
+
+/*! \libinternal \brief
+ * Enumeration of possible GPU build-paths.
+ * \todo Move somewhere general?
+ */
+enum class GpuFramework
+{
+    CUDA,
+    OpenCL
+};
+
+/*! \libinternal \brief
+ * GPU build-path traits such as types.
+ * \todo Move somewhere general?
+ */
+template <GpuFramework framework> struct GpuTraits
+{
+    using CommandStream = void; //!< GPU command stream
+    using CommandEvent  = void; //!< Single command call timing event - used in OpenCL
+};
+
+/*! \libinternal \brief
+ * This is a GPU region timing implementation interface.
+ * It should provide methods for measuring the last timespan.
+ */
+template <GpuFramework framework> class GpuRegionTimerImpl
+{
+    //! Short-hands
+    using CommandStream = typename GpuTraits<framework>::CommandStream;
+    using CommandEvent  = typename GpuTraits<framework>::CommandEvent;

+    public:
+        GpuRegionTimerImpl()  = default;
+        ~GpuRegionTimerImpl() = default;

+        /*! \brief Will be called before the region start. */
+        inline void openTimingRegion(CommandStream) = 0;
+        /*! \brief Will be called after the region end. */
+        inline void closeTimingRegion(CommandStream) = 0;
+        /*! \brief Resets any internal state if needed */
+        inline void reset() = 0;
+        /*! \brief Returns the last measured region timespan (in milliseconds) and calls reset() */
+        inline double getLastRangeTime() = 0;
+        /*! \brief Returns a new raw timing event
+         * for passing into individual GPU API calls
+         * within the region if the API requires it (e.g. on OpenCL). */
+        inline CommandEvent *fetchNextEvent() = 0;
+};
+
+/*! \libinternal \brief
+ * This is a GPU region timing wrapper class.
+ * It allows for host-side tracking of the execution timespans in GPU code
+ * (measuring kernel or transfer durations).
+ * It also partially tracks the correctness of the timer state transitions,
+ * as far as the current implementation allows (see TODO in getLastRangeTime() for a disabled check).
+ * Internally it uses GpuRegionTimerImpl for measuring regions.
+ */
+template <GpuFramework framework> class GpuRegionTimerWrapper
+{
+    //! Short-hands
+    using CommandStream = typename GpuTraits<framework>::CommandStream;
+    using CommandEvent  = typename GpuTraits<framework>::CommandEvent;
+    //! The timer state used for debug-only assertions
+    enum class TimerState
+    {
+        Idle,
+        Recording,
+        Stopped
+    } debugState_;

+    //! The underlying region timer implementation
+    GpuRegionTimerImpl<framework> impl_;

+    public:
+        GpuRegionTimerWrapper() : impl_()
+        {
+            reset();
+        }
+        ~GpuRegionTimerWrapper() = default;
+        /*! \brief
+         * To be called before the region start.
+         *
+         * \param[in] s   The GPU command stream where the event being measured takes place.
+         */
+        void openTimingRegion(CommandStream s)
+        {
+            if (c_debugTimerState)
+            {
+                std::string error = "GPU timer should be idle, but is " + std::string((debugState_ == TimerState::Stopped) ? "stopped" : "recording") + ".";
+                GMX_ASSERT(debugState_ == TimerState::Idle, error.c_str());
+                debugState_ = TimerState::Recording;
+            }
+            impl_.openTimingRegion(s);
+        }
+        /*! \brief
+         * To be called after the region end.
+ * + * \param[in] s The GPU command stream where the event being measured takes place. + */ + void closeTimingRegion(CommandStream s) + { + if (c_debugTimerState) + { + std::string error = "GPU timer should be recording, but is " + std::string((debugState_ == TimerState::Idle) ? "idle" : "stopped") + "."; + GMX_ASSERT(debugState_ == TimerState::Recording, error.c_str()); + debugState_ = TimerState::Stopped; + } + impl_.closeTimingRegion(s); + } + /*! \brief + * Returns the last timespan, and resets the internal timer state. + * To be called after closeTimingRegion() and the command stream of the event having been synchronized. + * \returns The last timespan (in milliseconds). + */ + double getLastRangeTime() + { + if (c_debugTimerState) + { + /* The assertion below is commented because it is currently triggered on a special case: + * the early return before the local non-bonded kernel launch if there is nothing to do. + * This can be reproduced in OpenCL build by running + * mdrun-mpi-test -ntmpi 2 --gtest_filter=*Empty* + * Similarly, the GpuRegionTimerImpl suffers from non-nullptr + * cl_event conditionals which ideally should only be asserts. + * TODO: improve internal task scheduling, re-enable the assert, turn conditionals into asserts + */ + /* + std::string error = "GPU timer should be stopped, but is " + std::string((debugState_ == TimerState::Idle) ? "idle" : "recording") + "."; + GMX_ASSERT(debugState_ == TimerState::Stopped, error.c_str()); + */ + debugState_ = TimerState::Idle; + } + double milliseconds = impl_.getLastRangeTime(); + return milliseconds; + } + /*! \brief Resets the implementation and total time/call count to zeroes. */ + void reset() + { + if (c_debugTimerState) + { + debugState_ = TimerState::Idle; + } + impl_.reset(); + } + /*! \brief + * Gets a pointer to a new timing event for passing into individual GPU API calls + * within the region if they require it (e.g. on OpenCL). + * \returns The pointer to the underlying single command timing event. + */ + CommandEvent *fetchNextEvent() + { + if (c_debugTimerState) + { + std::string error = "GPU timer should be recording, but is " + std::string((debugState_ == TimerState::Idle) ? "idle" : "stopped") + "."; + GMX_ASSERT(debugState_ == TimerState::Recording, error.c_str()); + } + return impl_.fetchNextEvent(); + } +}; + +#endif diff --git a/src/gromacs/gpu_utils/gpuregiontimer_ocl.h b/src/gromacs/gpu_utils/gpuregiontimer_ocl.h new file mode 100644 index 0000000000..e0b4b3f87f --- /dev/null +++ b/src/gromacs/gpu_utils/gpuregiontimer_ocl.h @@ -0,0 +1,129 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ * \brief Implements the GPU region timer for OpenCL.
+ *
+ * \author Aleksei Iupinov
+ *
+ * \inlibraryapi
+ */
+
+#ifndef GMX_GPU_UTILS_GPUREGIONTIMER_OCL_H
+#define GMX_GPU_UTILS_GPUREGIONTIMER_OCL_H
+
+#include <array>
+
+#include "gromacs/gpu_utils/oclutils.h"
+
+#include "gpuregiontimer.h"
+
+template <> struct GpuTraits<GpuFramework::OpenCL>
+{
+    using CommandStream = cl_command_queue;
+    using CommandEvent  = cl_event;
+};
+
+//! Short-hand for external use
+using GpuRegionTimer = GpuRegionTimerWrapper<GpuFramework::OpenCL>;
+
+// cppcheck-suppress noConstructor
+template <> class GpuRegionTimerImpl<GpuFramework::OpenCL>
+{
+    //! Short-hands
+    using CommandStream = typename GpuTraits<GpuFramework::OpenCL>::CommandStream;
+    using CommandEvent  = typename GpuTraits<GpuFramework::OpenCL>::CommandEvent;

+    /*! \brief The underlying individual timing events array.
+     * The maximum size is chosen arbitrarily to work with current code, and can be changed.
+     * There is simply no need for run-time resizing, and it's unlikely we'll ever need more than 10.
+     */
+    std::array<cl_event, 10> events_ = {{nullptr}};
+    //! Index of the active event
+    size_t currentEvent_ = 0;

+    public:
+        GpuRegionTimerImpl()  = default;
+        ~GpuRegionTimerImpl() = default;
+        inline void openTimingRegion(CommandStream){}
+        inline void closeTimingRegion(CommandStream){}

+        inline double getLastRangeTime()
+        {
+            double milliseconds = 0.0;
+            for (size_t i = 0; i < currentEvent_; i++)
+            {
+                if (events_[i]) // This conditional is ugly, but is required to make some tests (e.g. empty domain) pass
+                {
+                    cl_ulong          start_ns, end_ns;
+                    cl_int gmx_unused cl_error;

+                    cl_error = clGetEventProfilingInfo(events_[i], CL_PROFILING_COMMAND_START,
+                                                       sizeof(cl_ulong), &start_ns, nullptr);
+                    GMX_ASSERT(CL_SUCCESS == cl_error, "GPU timing update failure");
+                    cl_error = clGetEventProfilingInfo(events_[i], CL_PROFILING_COMMAND_END,
+                                                       sizeof(cl_ulong), &end_ns, nullptr);
+                    GMX_ASSERT(CL_SUCCESS == cl_error, "GPU timing update failure");
+                    milliseconds += (end_ns - start_ns) / 1000000.0;
+                }
+            }
+            reset();
+            return milliseconds;
+        }

+        inline void reset()
+        {
+            for (size_t i = 0; i < currentEvent_; i++)
+            {
+                if (events_[i]) // This conditional is ugly, but is required to make some tests (e.g. empty domain) pass
+                {
+                    GMX_ASSERT(CL_SUCCESS == clReleaseEvent(events_[i]), "OpenCL event release failure");
+                }
+            }
+            currentEvent_ = 0;
+            // As long as we're doing nullptr checks, we might want to be extra cautious.
+ events_.fill(nullptr); + } + + inline CommandEvent *fetchNextEvent() + { + GMX_ASSERT(currentEvent_ < events_.size(), "Increase c_maxEventNumber_ if needed"); + cl_event *result = &events_[currentEvent_]; + currentEvent_++; + return result; + } +}; + +#endif diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu index 86109f7d0c..ed6d41064d 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -384,8 +384,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, /* beginning of timed HtoD section */ if (bDoTime) { - stat = cudaEventRecord(t->start_nb_h2d[iloc], stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + t->nb_h2d[iloc].openTimingRegion(stream); } /* HtoD x, q */ @@ -394,8 +393,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, if (bDoTime) { - stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + t->nb_h2d[iloc].closeTimingRegion(stream); } /* When we get here all misc operations issues in the local stream as well as @@ -433,8 +431,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, /* beginning of timed nonbonded calculation section */ if (bDoTime) { - stat = cudaEventRecord(t->start_nb_k[iloc], stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + t->nb_k[iloc].openTimingRegion(stream); } /* get the pointer to the kernel flavor we need to use */ @@ -490,8 +487,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, if (bDoTime) { - stat = cudaEventRecord(t->stop_nb_k[iloc], stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + t->nb_k[iloc].closeTimingRegion(stream); } #if (defined(WIN32) || defined( _WIN32 )) @@ -517,8 +513,6 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, int iloc, int numParts) { - cudaError_t stat; - cu_atomdata_t *adat = nb->atdat; cu_nbparam_t *nbp = nb->nbparam; cu_plist_t *plist = nb->plist[iloc]; @@ -569,18 +563,16 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, return; } - cudaEvent_t startEvent, stopEvent; + GpuRegionTimer *timer = nullptr; if (bDoTime) { - startEvent = (plist->haveFreshList ? t->start_prune_k[iloc] : t->start_rollingPrune_k[iloc]); - stopEvent = (plist->haveFreshList ? t->stop_prune_k[iloc] : t->stop_rollingPrune_k[iloc]); + timer = &(plist->haveFreshList ? 
t->prune_k[iloc] : t->rollingPrune_k[iloc]); } /* beginning of timed prune calculation section */ if (bDoTime) { - stat = cudaEventRecord(startEvent, stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + timer->openTimingRegion(stream); } /* Kernel launch config: @@ -654,8 +646,7 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, if (bDoTime) { - stat = cudaEventRecord(stopEvent, stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + timer->closeTimingRegion(stream); } #if (defined(WIN32) || defined( _WIN32 )) @@ -719,8 +710,7 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, /* beginning of timed D2H section */ if (bDoTime) { - stat = cudaEventRecord(t->start_nb_d2h[iloc], stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + t->nb_d2h[iloc].openTimingRegion(stream); } /* With DD the local D2H transfer can only start after the non-local @@ -767,8 +757,7 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, if (bDoTime) { - stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + t->nb_d2h[iloc].closeTimingRegion(stream); } } @@ -785,7 +774,7 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, * \param[inout] timings GPU task timing data * \param[in] iloc interaction locality */ -static void countPruneKernelTime(const cu_timers_t *timers, +static void countPruneKernelTime(cu_timers_t *timers, gmx_wallclock_gpu_t *timings, const int iloc) { @@ -798,14 +787,12 @@ static void countPruneKernelTime(const cu_timers_t *timers, if (timers->didPrune[iloc]) { timings->pruneTime.c++; - timings->pruneTime.t += cu_event_elapsed(timers->start_prune_k[iloc], - timers->stop_prune_k[iloc]); + timings->pruneTime.t += timers->prune_k[iloc].getLastRangeTime(); } if (timers->didRollingPrune[iloc]) { timings->dynamicPruneTime.c++; - timings->dynamicPruneTime.t += cu_event_elapsed(timers->start_rollingPrune_k[iloc], - timers->stop_rollingPrune_k[iloc]); + timings->dynamicPruneTime.t += timers->rollingPrune_k[iloc].getLastRangeTime(); } } @@ -868,14 +855,11 @@ void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_cuda_t *nb, } /* kernel timings */ - timings->ktime[plist->haveFreshList ? 1 : 0][bCalcEner ? 1 : 0].t += - cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]); + timings->ktime[plist->haveFreshList ? 1 : 0][bCalcEner ? 1 : 0].t += timers->nb_k[iloc].getLastRangeTime(); /* X/q H2D and F D2H timings */ - timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc], - timers->stop_nb_h2d[iloc]); - timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc], - timers->stop_nb_d2h[iloc]); + timings->nb_h2d_t += timers->nb_h2d[iloc].getLastRangeTime(); + timings->nb_d2h_t += timers->nb_d2h[iloc].getLastRangeTime(); /* Count the pruning kernel times for both cases:1st pass (at search step) and rolling pruning (if called at the previous step). 
@@ -892,12 +876,10 @@ void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_cuda_t *nb, if (LOCAL_A(aloc)) { timings->pl_h2d_c++; - timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat, - timers->stop_atdat); + timings->pl_h2d_t += timers->atdat.getLastRangeTime(); } - timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc], - timers->stop_pl_h2d[iloc]); + timings->pl_h2d_t += timers->pl_h2d[iloc].getLastRangeTime(); /* Clear the timing flag for the next step */ timers->didPairlistH2D[iloc] = false; diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index 991e558631..b0ef37ffea 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -67,8 +67,6 @@ #include "nbnxn_cuda.h" #include "nbnxn_cuda_types.h" -static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */ - /* This is a heuristically determined parameter for the Fermi, Kepler * and Maxwell architectures for the minimum size of ci lists by multiplying * this constant with the # of multiprocessors on the current device. @@ -389,47 +387,9 @@ static void init_plist(cu_plist_t *pl) /*! Initializes the timer data structure. */ static void init_timers(cu_timers_t *t, bool bUseTwoStreams) { - cudaError_t stat; - int eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync : cudaEventDefault ); - - stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed"); - stat = cudaEventCreateWithFlags(&(t->stop_atdat), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed"); - /* The non-local counters/stream (second in the array) are needed only with DD. */ for (int i = 0; i <= (bUseTwoStreams ? 
1 : 0); i++) { - stat = cudaEventCreateWithFlags(&(t->start_nb_k[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed"); - stat = cudaEventCreateWithFlags(&(t->stop_nb_k[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed"); - - stat = cudaEventCreateWithFlags(&(t->start_prune_k[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on start_prune_k failed"); - stat = cudaEventCreateWithFlags(&(t->stop_prune_k[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on stop_prune_k failed"); - - stat = cudaEventCreateWithFlags(&(t->start_rollingPrune_k[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on start_rollingPrune_k failed"); - stat = cudaEventCreateWithFlags(&(t->stop_rollingPrune_k[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on stop_rollingPrune_k failed"); - - stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed"); - stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed"); - - stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed"); - stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed"); - - stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed"); - stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[i]), eventflags); - CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed"); - t->didPairlistH2D[i] = false; t->didPrune[i] = false; t->didRollingPrune[i] = false; @@ -500,7 +460,7 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, nb->bUseTwoStreams = bLocalAndNonlocal; - snew(nb->timers, 1); + nb->timers = new cu_timers_t(); snew(nb->timings, 1); /* init nbst */ @@ -578,7 +538,6 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb, } char sbuf[STRLEN]; - cudaError_t stat; bool bDoTime = nb->bDoTime; cudaStream_t stream = nb->stream[iloc]; cu_plist_t *d_plist = nb->plist[iloc]; @@ -599,8 +558,7 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb, if (bDoTime) { - stat = cudaEventRecord(nb->timers->start_pl_h2d[iloc], stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + nb->timers->pl_h2d[iloc].openTimingRegion(stream); nb->timers->didPairlistH2D[iloc] = true; } @@ -627,8 +585,7 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb, if (bDoTime) { - stat = cudaEventRecord(nb->timers->stop_pl_h2d[iloc], stream); - CU_RET_ERR(stat, "cudaEventRecord failed"); + nb->timers->pl_h2d[iloc].closeTimingRegion(stream); } /* the next use of thist list we be the first one, so we need to prune */ @@ -704,8 +661,7 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb, if (bDoTime) { /* time async copy */ - stat = cudaEventRecord(timers->start_atdat, ls); - CU_RET_ERR(stat, "cudaEventRecord failed"); + timers->atdat.openTimingRegion(ls); } /* need to reallocate if we have to copy more atoms than the amount of space @@ -764,8 +720,7 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb, if (bDoTime) { - stat = cudaEventRecord(timers->stop_atdat, ls); - CU_RET_ERR(stat, "cudaEventRecord failed"); + timers->atdat.closeTimingRegion(ls); } } @@ -785,7 +740,6 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) cu_atomdata_t *atdat; cu_nbparam_t *nbparam; cu_plist_t *plist, *plist_nl; - cu_timers_t *timers; if (nb == NULL) { @@ -796,7 +750,6 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) nbparam = 
nb->nbparam; plist = nb->plist[eintLocal]; plist_nl = nb->plist[eintNonlocal]; - timers = nb->timers; nbnxn_cuda_free_nbparam_table(nbparam, nb->dev_info); @@ -805,48 +758,14 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) stat = cudaEventDestroy(nb->misc_ops_and_local_H2D_done); CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done"); + delete nb->timers; if (nb->bDoTime) { - stat = cudaEventDestroy(timers->start_atdat); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat"); - stat = cudaEventDestroy(timers->stop_atdat); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat"); - /* The non-local counters/stream (second in the array) are needed only with DD. */ for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++) { - stat = cudaEventDestroy(timers->start_nb_k[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k"); - stat = cudaEventDestroy(timers->stop_nb_k[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k"); - - stat = cudaEventDestroy(timers->start_prune_k[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_prune_k"); - stat = cudaEventDestroy(timers->stop_prune_k[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_prune_k"); - - stat = cudaEventDestroy(timers->start_rollingPrune_k[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_rollingPrune_k"); - stat = cudaEventDestroy(timers->stop_rollingPrune_k[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_rollingPrune_k"); - - stat = cudaEventDestroy(timers->start_pl_h2d[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d"); - stat = cudaEventDestroy(timers->stop_pl_h2d[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d"); - stat = cudaStreamDestroy(nb->stream[i]); CU_RET_ERR(stat, "cudaStreamDestroy failed on stream"); - - stat = cudaEventDestroy(timers->start_nb_h2d[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d"); - stat = cudaEventDestroy(timers->stop_nb_h2d[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d"); - - stat = cudaEventDestroy(timers->start_nb_d2h[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h"); - stat = cudaEventDestroy(timers->stop_nb_d2h[i]); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h"); } } @@ -897,7 +816,6 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) { sfree(plist_nl); } - sfree(timers); sfree(nb->timings); sfree(nb); diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h index bbd283d075..ceb3a3dafd 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h @@ -48,6 +48,7 @@ #include "gromacs/gpu_utils/cuda_arch_utils.cuh" #include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/gpuregiontimer.cuh" #include "gromacs/mdlib/nbnxn_consts.h" #include "gromacs/mdlib/nbnxn_pairlist.h" #include "gromacs/mdtypes/interaction_const.h" @@ -228,30 +229,23 @@ struct cu_plist }; /** \internal - * \brief CUDA events used for timing GPU kernels and H2D/D2H transfers. + * \brief CUDA timers used for timing GPU kernels and H2D/D2H transfers. * * The two-sized arrays hold the local and non-local values and should always * be indexed with eintLocal/eintNonlocal. 
*/ struct cu_timers { - cudaEvent_t start_atdat; /**< start event for atom data transfer (every PS step) */ - cudaEvent_t stop_atdat; /**< stop event for atom data transfer (every PS step) */ - cudaEvent_t start_nb_h2d[2]; /**< start events for x/q H2D transfers (l/nl, every step) */ - cudaEvent_t stop_nb_h2d[2]; /**< stop events for x/q H2D transfers (l/nl, every step) */ - cudaEvent_t start_nb_d2h[2]; /**< start events for f D2H transfer (l/nl, every step) */ - cudaEvent_t stop_nb_d2h[2]; /**< stop events for f D2H transfer (l/nl, every step) */ - cudaEvent_t start_pl_h2d[2]; /**< start events for pair-list H2D transfers (l/nl, every PS step) */ - cudaEvent_t stop_pl_h2d[2]; /**< start events for pair-list H2D transfers (l/nl, every PS step) */ - bool didPairlistH2D[2]; /**< true when a pair-list transfer has been done at this step */ - cudaEvent_t start_nb_k[2]; /**< start event for non-bonded kernels (l/nl, every step) */ - cudaEvent_t stop_nb_k[2]; /**< stop event non-bonded kernels (l/nl, every step) */ - cudaEvent_t start_prune_k[2]; /**< start event for the 1st pass list pruning kernel (l/nl, every PS step) */ - cudaEvent_t stop_prune_k[2]; /**< stop event for the 1st pass list pruning kernel (l/nl, every PS step) */ - bool didPrune[2]; /**< true when we timed pruning and the timings need to be accounted for */ - cudaEvent_t start_rollingPrune_k[2]; /**< start event for rolling pruning kernels (l/nl, frequency depends on chunk size) */ - cudaEvent_t stop_rollingPrune_k[2]; /**< stop event for rolling pruning kernels (l/nl, frequency depends on chunk size) */ - bool didRollingPrune[2]; /**< true when we timed rolling pruning (at the previous step) and the timings need to be accounted for */ + GpuRegionTimer atdat; /**< timer for atom data transfer (every PS step) */ + GpuRegionTimer nb_h2d[2]; /**< timer for x/q H2D transfers (l/nl, every step) */ + GpuRegionTimer nb_d2h[2]; /**< timer for f D2H transfer (l/nl, every step) */ + GpuRegionTimer pl_h2d[2]; /**< timer for pair-list H2D transfers (l/nl, every PS step) */ + bool didPairlistH2D[2]; /**< true when a pair-list transfer has been done at this step */ + GpuRegionTimer nb_k[2]; /**< timer for non-bonded kernels (l/nl, every step) */ + GpuRegionTimer prune_k[2]; /**< timer for the 1st pass list pruning kernel (l/nl, every PS step) */ + bool didPrune[2]; /**< true when we timed pruning and the timings need to be accounted for */ + GpuRegionTimer rollingPrune_k[2]; /**< timer for rolling pruning kernels (l/nl, frequency depends on chunk size) */ + bool didRollingPrune[2]; /**< true when we timed rolling pruning (at the previous step) and the timings need to be accounted for */ }; /** \internal diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp index 3b37b32a81..813e0bdcfb 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp @@ -376,8 +376,10 @@ static void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event) * calling clFinish or clWaitForEvents. * The function returns 0.0 if the input event, *ocl_event, is 0. * Don't use this function when more than one wait will be issued for the event. + * \todo This function, as well as some CUDA counterparts, is superseded by GpuRegionTimer. + * Delete. 
*/ -static double ocl_event_elapsed_ms(cl_event *ocl_event) +static inline double ocl_event_elapsed_ms(cl_event *ocl_event) { cl_int gmx_unused cl_error; cl_ulong start_ns, end_ns; @@ -484,10 +486,19 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, } /* beginning of timed HtoD section */ + if (bDoTime) + { + t->nb_h2d[iloc].openTimingRegion(stream); + } /* HtoD x, q */ ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4, - adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL); + adat_len * sizeof(float) * 4, stream, bDoTime ? t->nb_h2d[iloc].fetchNextEvent() : nullptr); + + if (bDoTime) + { + t->nb_h2d[iloc].closeTimingRegion(stream); + } /* When we get here all misc operations issues in the local stream as well as the local xq H2D are done, @@ -534,6 +545,10 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, } /* beginning of timed nonbonded calculation section */ + if (bDoTime) + { + t->nb_k[iloc].openTimingRegion(stream); + } /* get the pointer to the kernel flavor we need to use */ nb_kernel = select_nbnxn_kernel(nb, @@ -625,9 +640,14 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, { printf("OpenCL error: %s\n", ocl_get_error_string(cl_error).c_str()); } - cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? &(t->nb_k[iloc]) : NULL); + cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr); assert(cl_error == CL_SUCCESS); + if (bDoTime) + { + t->nb_k[iloc].closeTimingRegion(stream); + } + #ifdef DEBUG_OCL { static int run_step = 1; @@ -763,6 +783,18 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, return; } + GpuRegionTimer *timer = nullptr; + if (bDoTime) + { + timer = &(plist->haveFreshList ? t->prune_k[iloc] : t->rollingPrune_k[iloc]); + } + + /* beginning of timed prune calculation section */ + if (bDoTime) + { + timer->openTimingRegion(stream); + } + /* Kernel launch config: * - The thread block dimensions match the size of i-clusters, j-clusters, * and j-cluster concurrency, in x, y, and z, respectively. @@ -812,15 +844,9 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, cl_error |= clSetKernelArg(pruneKernel, arg_no++, shmem, nullptr); assert(cl_error == CL_SUCCESS); - cl_event *pruneEventPtr = nullptr; - if (bDoTime) - { - pruneEventPtr = plist->haveFreshList ? &t->prune_k[iloc] : &t->rollingPrune_k[iloc]; - } - cl_error = clEnqueueNDRangeKernel(stream, pruneKernel, 3, nullptr, global_work_size, local_work_size, - 0, nullptr, pruneEventPtr); + 0, nullptr, bDoTime ? timer->fetchNextEvent() : nullptr); GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str()); if (plist->haveFreshList) @@ -834,6 +860,11 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, /* Mark that rolling pruning has been done */ nb->timers->didRollingPrune[iloc] = true; } + + if (bDoTime) + { + timer->closeTimingRegion(stream); + } } /*! \brief @@ -904,6 +935,10 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, } /* beginning of timed D2H section */ + if (bDoTime) + { + t->nb_d2h[iloc].openTimingRegion(stream); + } /* With DD the local D2H transfer can only start after the non-local has been launched. 
*/ @@ -914,7 +949,7 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, /* DtoH f */ ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float), - (adat_len)* adat->f_elem_size, stream, bDoTime ? &(t->nb_d2h_f[iloc]) : NULL); + (adat_len)* adat->f_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr); /* kick off work */ cl_error = clFlush(stream); @@ -942,19 +977,24 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, if (bCalcFshift) { ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0, - SHIFTS * adat->fshift_elem_size, stream, bDoTime ? &(t->nb_d2h_fshift[iloc]) : NULL); + SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr); } /* DtoH energies */ if (bCalcEner) { ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, - sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_lj[iloc]) : NULL); + sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr); ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0, - sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_el[iloc]) : NULL); + sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr); } } + + if (bDoTime) + { + t->nb_d2h[iloc].closeTimingRegion(stream); + } } /*! \brief Count pruning kernel time if either kernel has been triggered @@ -983,13 +1023,13 @@ static void countPruneKernelTime(cl_timers_t *timers, if (timers->didPrune[iloc]) { timings->pruneTime.c++; - timings->pruneTime.t += ocl_event_elapsed_ms(&timers->prune_k[iloc]); + timings->pruneTime.t += timers->prune_k[iloc].getLastRangeTime(); } if (timers->didRollingPrune[iloc]) { timings->dynamicPruneTime.c++; - timings->dynamicPruneTime.t += ocl_event_elapsed_ms(&timers->rollingPrune_k[iloc]); + timings->dynamicPruneTime.t += timers->rollingPrune_k[iloc].getLastRangeTime(); } } @@ -1057,16 +1097,11 @@ void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_ocl_t *nb, } /* kernel timings */ - - timings->ktime[plist->haveFreshList ? 1 : 0][bCalcEner ? 1 : 0].t += - ocl_event_elapsed_ms(timers->nb_k + iloc); + timings->ktime[plist->haveFreshList ? 1 : 0][bCalcEner ? 1 : 0].t += timers->nb_k[iloc].getLastRangeTime(); /* X/q H2D and F D2H timings */ - timings->nb_h2d_t += ocl_event_elapsed_ms(timers->nb_h2d + iloc); - timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_f + iloc); - timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_fshift + iloc); - timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_el + iloc); - timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_lj + iloc); + timings->nb_h2d_t += timers->nb_h2d[iloc].getLastRangeTime(); + timings->nb_d2h_t += timers->nb_d2h[iloc].getLastRangeTime(); /* Count the pruning kernel times for both cases:1st pass (at search step) and rolling pruning (if called at the previous step). 
@@ -1083,13 +1118,10 @@ void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_ocl_t *nb, if (LOCAL_A(aloc)) { timings->pl_h2d_c++; - timings->pl_h2d_t += ocl_event_elapsed_ms(&(timers->atdat)); + timings->pl_h2d_t += timers->atdat.getLastRangeTime(); } - timings->pl_h2d_t += - ocl_event_elapsed_ms(timers->pl_h2d_sci + iloc) + - ocl_event_elapsed_ms(timers->pl_h2d_cj4 + iloc) + - ocl_event_elapsed_ms(timers->pl_h2d_excl + iloc); + timings->pl_h2d_t += timers->pl_h2d[iloc].getLastRangeTime(); /* Clear the timing flag for the next step */ timers->didPairlistH2D[iloc] = false; diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp index 399ea68b82..6dd8e4f669 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp @@ -746,7 +746,7 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb, nb->bUseTwoStreams = bLocalAndNonlocal; - snew(nb->timers, 1); + nb->timers = new cl_timers_t(); snew(nb->timings, 1); /* set device info, just point it to the right GPU among the detected ones */ @@ -907,6 +907,7 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, } char sbuf[STRLEN]; + bool bDoTime = nb->bDoTime; cl_command_queue stream = nb->stream[iloc]; cl_plist_t *d_plist = nb->plist[iloc]; @@ -924,8 +925,9 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, } } - if (nb->bDoTime) + if (bDoTime) { + nb->timers->pl_h2d[iloc].openTimingRegion(stream); nb->timers->didPairlistH2D[iloc] = true; } @@ -933,26 +935,31 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, &d_plist->nsci, &d_plist->sci_nalloc, h_plist->nsci, nb->dev_rundata->context, - stream, true, &(nb->timers->pl_h2d_sci[iloc])); + stream, true, bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr); ocl_realloc_buffered(&d_plist->cj4, h_plist->cj4, sizeof(nbnxn_cj4_t), &d_plist->ncj4, &d_plist->cj4_nalloc, h_plist->ncj4, nb->dev_rundata->context, - stream, true, &(nb->timers->pl_h2d_cj4[iloc])); + stream, true, bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr); - /* this call only allocates space on the device (no data is transferred) */ + /* this call only allocates space on the device (no data is transferred) - no timing as well! */ ocl_realloc_buffered(&d_plist->imask, NULL, sizeof(unsigned int), &d_plist->nimask, &d_plist->imask_nalloc, h_plist->ncj4*c_nbnxnGpuClusterpairSplit, nb->dev_rundata->context, - stream, true, &(nb->timers->pl_h2d_imask[iloc])); + stream, true); ocl_realloc_buffered(&d_plist->excl, h_plist->excl, sizeof(nbnxn_excl_t), &d_plist->nexcl, &d_plist->excl_nalloc, h_plist->nexcl, nb->dev_rundata->context, - stream, true, &(nb->timers->pl_h2d_excl[iloc])); + stream, true, bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr); + + if (bDoTime) + { + nb->timers->pl_h2d[iloc].closeTimingRegion(stream); + } /* need to prune the pair list during the next step */ d_plist->haveFreshList = true; @@ -989,6 +996,12 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t *nb, natoms = nbat->natoms; realloced = false; + if (bDoTime) + { + /* time async copy */ + timers->atdat.openTimingRegion(ls); + } + /* need to reallocate if we have to copy more atoms than the amount of space available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */ if (natoms > d_atdat->nalloc) @@ -1046,15 +1059,20 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t *nb, if (useLjCombRule(nb->nbparam->vdwtype)) { ocl_copy_H2D_async(d_atdat->lj_comb, nbat->lj_comb, 0, - natoms*sizeof(cl_float2), ls, bDoTime ? 
&(timers->atdat) : NULL); + natoms*sizeof(cl_float2), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr); } else { ocl_copy_H2D_async(d_atdat->atom_types, nbat->type, 0, - natoms*sizeof(int), ls, bDoTime ? &(timers->atdat) : NULL); + natoms*sizeof(int), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr); } + if (bDoTime) + { + timers->atdat.closeTimingRegion(ls); + } + /* kick off the tasks enqueued above to ensure concurrency with the search */ cl_error = clFlush(ls); assert(CL_SUCCESS == cl_error); @@ -1211,7 +1229,7 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb) sfree(nb->dev_rundata); /* Free timers and timings */ - sfree(nb->timers); + delete nb->timers; sfree(nb->timings); sfree(nb); diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h index 81b2df83b0..74e98db9e3 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h @@ -55,6 +55,7 @@ # include #endif +#include "gromacs/gpu_utils/gpuregiontimer_ocl.h" #include "gromacs/gpu_utils/oclutils.h" #include "gromacs/mdlib/nbnxn_pairlist.h" #include "gromacs/mdtypes/interaction_const.h" @@ -297,7 +298,6 @@ typedef struct cl_plist int rollingPruningPart; /**< the next part to which the roling pruning needs to be applied */ }cl_plist_t; - /*! \internal * \brief OpenCL events used for timing GPU kernels and H2D/D2H transfers. * @@ -306,27 +306,16 @@ typedef struct cl_plist */ typedef struct cl_timers { - cl_event atdat; /**< event for atom data transfer (every PS step) */ - - cl_event nb_h2d[2]; /**< events for x/q H2D transfers (l/nl, every step) */ - - cl_event nb_d2h_f[2]; /**< events for f D2H transfer (l/nl, every step) */ - cl_event nb_d2h_fshift[2]; /**< events for fshift D2H transfer (l/nl, every step) */ - cl_event nb_d2h_e_el[2]; /**< events for e_el D2H transfer (l/nl, every step) */ - cl_event nb_d2h_e_lj[2]; /**< events for e_lj D2H transfer (l/nl, every step) */ - - cl_event pl_h2d_sci[2]; /**< events for pair-list sci H2D transfers (l/nl, every PS step) */ - cl_event pl_h2d_cj4[2]; /**< events for pair-list cj4 H2D transfers (l/nl, every PS step) */ - cl_event pl_h2d_excl[2]; /**< events for pair-list excl H2D transfers (l/nl, every PS step)*/ - cl_event pl_h2d_imask[2]; /**< events for pair-list imask H2D transfers (l/nl, every PS step)*/ - - cl_event nb_k[2]; /**< event for non-bonded kernels (l/nl, every step) */ - - bool didPairlistH2D[2]; /**< tells if we timed a pair-list transfer */ - cl_event prune_k[2]; /**< event for pruning kernel (every prune step) */ - bool didPrune[2]; /**< tells uf we timed pruning this (or previous step for rolling) and that the timings need to be accounted for */ - cl_event rollingPrune_k[2]; /**< event for rolling pruning kernel (every prune step) */ - bool didRollingPrune[2]; /**< tells uf we timed pruning this (or previous step for rolling) and that the timings need to be accounted for */ + GpuRegionTimer atdat; /**< timer for atom data transfer (every PS step) */ + GpuRegionTimer nb_h2d[2]; /**< timer for x/q H2D transfers (l/nl, every step) */ + GpuRegionTimer nb_d2h[2]; /**< timer for f D2H transfer (l/nl, every step) */ + GpuRegionTimer pl_h2d[2]; /**< timer for pair-list H2D transfers (l/nl, every PS step) */ + bool didPairlistH2D[2]; /**< true when a pair-list transfer has been done at this step */ + GpuRegionTimer nb_k[2]; /**< timer for non-bonded kernels (l/nl, every step) */ + GpuRegionTimer prune_k[2]; /**< timer for the 1st pass list pruning kernel (l/nl, every PS 
step) */ + bool didPrune[2]; /**< true when we timed pruning and the timings need to be accounted for */ + GpuRegionTimer rollingPrune_k[2]; /**< timer for rolling pruning kernels (l/nl, frequency depends on chunk size) */ + bool didRollingPrune[2]; /**< true when we timed rolling pruning (at the previous step) and the timings need to be accounted for */ } cl_timers_t; /*! \internal -- 2.22.0
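
Usage note, for reference only (not part of the patch): the wrapper is meant to bracket asynchronously issued GPU work in a given stream, and getLastRangeTime() is read back only after that stream has been synchronized. A minimal CUDA-side sketch follows; the caller function, buffer names, and sizes are illustrative assumptions, not code from this change.

    #include "gromacs/gpu_utils/gpuregiontimer.cuh"

    // Hypothetical caller: times an asynchronous H2D copy in an existing stream.
    // Error checking of the CUDA runtime calls is omitted for brevity.
    void timedHostToDeviceCopy(float *d_dst, const float *h_src, size_t bytes, cudaStream_t stream)
    {
        GpuRegionTimer timer;                  // CUDA specialization: a start/stop cudaEvent_t pair
        timer.openTimingRegion(stream);        // records the start event in the stream
        cudaMemcpyAsync(d_dst, h_src, bytes, cudaMemcpyHostToDevice, stream);
        timer.closeTimingRegion(stream);       // records the stop event in the stream
        cudaStreamSynchronize(stream);         // both events must have completed before reading the time
        double ms = timer.getLastRangeTime();  // elapsed milliseconds; also resets the timer state
        (void)ms;                              // e.g. accumulate into gmx_wallclock_gpu_t in real callers
    }

On the OpenCL path the same region would additionally pass timer.fetchNextEvent() into each clEnqueue* call issued inside the region, since OpenCL profiling information is attached to per-command events rather than to the queue.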