From: Artem Zhmurov Date: Wed, 24 Feb 2021 13:42:23 +0000 (+0300) Subject: Use GpuTimers in CUDA and OpenCL versions of NBNXM directly X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=177e8e3ab38b642224a64486a40789f6e4051f22;p=alexxy%2Fgromacs.git Use GpuTimers in CUDA and OpenCL versions of NBNXM directly The GpuTimers are used through proxy objects in OpenCL and CUDA versions of NBNXM, which complicates the unification of the code. This change eliminates the proxy object by using the underlying object directly. Refs #2608 --- diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index ccb1fdf26d..0ddaa17b14 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -457,7 +457,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom NBAtomData* adat = nb->atdat; gpu_plist* plist = nb->plist[iloc]; - cu_timers_t* t = nb->timers; + Nbnxm::GpuTimers* timers = nb->timers; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; bool bDoTime = nb->bDoTime; @@ -489,7 +489,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom /* beginning of timed HtoD section */ if (bDoTime) { - t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream); + timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream); } /* HtoD x, q */ @@ -505,7 +505,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom if (bDoTime) { - t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream); + timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream); } /* When we get here all misc operations issued in the local stream as well as @@ -539,7 +539,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In NBAtomData* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; gpu_plist* plist = nb->plist[iloc]; - cu_timers_t* t = nb->timers; + Nbnxm::GpuTimers* timers = nb->timers; const 
DeviceStream& deviceStream = *nb->deviceStreams[iloc]; bool bDoTime = nb->bDoTime; @@ -578,7 +578,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In /* beginning of timed nonbonded calculation section */ if (bDoTime) { - t->interaction[iloc].nb_k.openTimingRegion(deviceStream); + timers->interaction[iloc].nb_k.openTimingRegion(deviceStream); } /* Kernel launch config: @@ -619,7 +619,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In config.sharedMemorySize); } - auto* timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr; + auto* timingEvent = bDoTime ? timers->interaction[iloc].nb_k.fetchNextEvent() : nullptr; const auto kernel = select_nbnxn_kernel(nbp->elecType, nbp->vdwType, @@ -632,7 +632,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In if (bDoTime) { - t->interaction[iloc].nb_k.closeTimingRegion(deviceStream); + timers->interaction[iloc].nb_k.closeTimingRegion(deviceStream); } if (GMX_NATIVE_WINDOWS) @@ -660,7 +660,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c NBAtomData* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; gpu_plist* plist = nb->plist[iloc]; - cu_timers_t* t = nb->timers; + Nbnxm::GpuTimers* timers = nb->timers; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; bool bDoTime = nb->bDoTime; @@ -711,7 +711,8 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c GpuRegionTimer* timer = nullptr; if (bDoTime) { - timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k); + timer = &(plist->haveFreshList ? 
timers->interaction[iloc].prune_k + : timers->interaction[iloc].rollingPrune_k); } /* beginning of timed prune calculation section */ @@ -800,7 +801,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* extract the data */ NBAtomData* adat = nb->atdat; - cu_timers_t* t = nb->timers; + Nbnxm::GpuTimers* timers = nb->timers; bool bDoTime = nb->bDoTime; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; @@ -817,7 +818,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* beginning of timed D2H section */ if (bDoTime) { - t->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream); + timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream); } /* With DD the local D2H transfer can only start after the non-local @@ -884,7 +885,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, if (bDoTime) { - t->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream); + timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream); } } diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index 2505422927..f4467c0248 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -194,7 +194,7 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, nb->bUseTwoStreams = bLocalAndNonlocal; - nb->timers = new cu_timers_t(); + nb->timers = new Nbnxm::GpuTimers(); snew(nb->timings, 1); /* init nbst */ @@ -313,7 +313,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) int nalloc, natoms; bool realloced; bool bDoTime = nb->bDoTime; - cu_timers_t* timers = nb->timers; + Nbnxm::GpuTimers* timers = nb->timers; NBAtomData* d_atdat = nb->atdat; const DeviceContext& deviceContext = *nb->deviceContext_; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h index 08d96de90f..f3d1bb5acc 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h 
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h @@ -64,11 +64,6 @@ /*! \brief cluster size = number of atoms per cluster. */ static constexpr int c_clSize = c_nbnxnGpuClusterSize; -/** \internal - * \brief Typedef of actual timer type. - */ -typedef struct Nbnxm::gpu_timers_t cu_timers_t; - /*! \internal * \brief Main data structure for CUDA nonbonded force calculations. */ @@ -140,7 +135,7 @@ struct NbnxmGpu /*! \brief True if event-based timing is enabled. */ bool bDoTime = false; /*! \brief CUDA event-based timers. */ - cu_timers_t* timers = nullptr; + Nbnxm::GpuTimers* timers = nullptr; /*! \brief Timing data. TODO: deprecate this and query timers for accumulated data instead */ gmx_wallclock_gpu_nbnxn_t* timings = nullptr; }; diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h index 92002bbf4b..b2663b33f9 100644 --- a/src/gromacs/nbnxm/gpu_common.h +++ b/src/gromacs/nbnxm/gpu_common.h @@ -193,19 +193,18 @@ static inline gmx::Range getGpuAtomRange(const NBAtomData* atomData, const * - 1st pass prune: ran during the current step (prior to the force kernel); * - rolling prune: ran at the end of the previous step (prior to the current step H2D xq); * - * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen - * after calling this function. + * Note that the resetting of Nbnxm::GpuTimers::didPrune and Nbnxm::GpuTimers::didRollingPrune + * should happen after calling this function. * * \param[in] timers structs with GPU timer objects * \param[inout] timings GPU task timing data * \param[in] iloc interaction locality */ -template<typename GpuTimers> -static void countPruneKernelTime(GpuTimers* timers, +static void countPruneKernelTime(Nbnxm::GpuTimers* timers, gmx_wallclock_gpu_nbnxn_t* timings, const InteractionLocality iloc) { - gpu_timers_t::Interaction& iTimers = timers->interaction[iloc]; + GpuTimers::Interaction& iTimers = timers->interaction[iloc]; // We might have not done any pruning (e.g.
if we skipped with empty domains). if (!iTimers.didPrune && !iTimers.didRollingPrune) @@ -281,7 +280,6 @@ static inline void gpu_reduce_staged_outputs(const NBStagingData& nbst, * counters could end up being inconsistent due to not being incremented * on some of the node when this is skipped on empty local domains! * - * \tparam GpuTimers GPU timers type * \tparam GpuPairlist Pair list type * \param[out] timings Pointer to the NB GPU timings data * \param[in] timers Pointer to GPU timers data @@ -291,9 +289,9 @@ static inline void gpu_reduce_staged_outputs(const NBStagingData& nbst, * \param[in] doTiming True if timing is enabled. * */ -template<typename GpuTimers, typename GpuPairlist> +template<typename GpuPairlist> static inline void gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t* timings, - GpuTimers* timers, + Nbnxm::GpuTimers* timers, const GpuPairlist* plist, AtomLocality atomLocality, const gmx::StepWorkload& stepWork, diff --git a/src/gromacs/nbnxm/gpu_types_common.h b/src/gromacs/nbnxm/gpu_types_common.h index 85c5853ebd..4b7f0e762e 100644 --- a/src/gromacs/nbnxm/gpu_types_common.h +++ b/src/gromacs/nbnxm/gpu_types_common.h @@ -205,7 +205,7 @@ using gmx::InteractionLocality; * The two-sized arrays hold the local and non-local values and should always * be indexed with eintLocal/eintNonlocal. */ -struct gpu_timers_t +struct GpuTimers { /*! \internal * \brief Timers for local or non-local coordinate/force transfers @@ -244,7 +244,7 @@ struct gpu_timers_t //! timers for coordinate/force transfers (every step) gmx::EnumerationArray<AtomLocality, XFTransfers> xf; //! timers for interaction related transfers - gmx::EnumerationArray<InteractionLocality, Nbnxm::gpu_timers_t::Interaction> interaction; + gmx::EnumerationArray<InteractionLocality, Nbnxm::GpuTimers::Interaction> interaction; }; /*!
\internal diff --git a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp index aad56f4915..51f7745f7f 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp +++ b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp @@ -258,7 +258,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte } } - gpu_timers_t::Interaction& iTimers = nb->timers->interaction[iloc]; + GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc]; if (bDoTime) { diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index 50e7b9d8d4..b8107fc857 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -529,7 +529,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom NBAtomData* adat = nb->atdat; gpu_plist* plist = nb->plist[iloc]; - cl_timers_t* t = nb->timers; + Nbnxm::GpuTimers* timers = nb->timers; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; bool bDoTime = nb->bDoTime; @@ -561,7 +561,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom /* beginning of timed HtoD section */ if (bDoTime) { - t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream); + timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream); } /* HtoD x, q */ @@ -573,11 +573,11 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom atomsRange.size(), deviceStream, GpuApiCallBehavior::Async, - bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr); + bDoTime ? 
timers->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr); if (bDoTime) { - t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream); + timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream); } /* When we get here all misc operations issued in the local stream as well as @@ -613,7 +613,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb NBAtomData* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; gpu_plist* plist = nb->plist[iloc]; - cl_timers_t* t = nb->timers; + Nbnxm::GpuTimers* timers = nb->timers; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; bool bDoTime = nb->bDoTime; @@ -655,7 +655,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb /* beginning of timed nonbonded calculation section */ if (bDoTime) { - t->interaction[iloc].nb_k.openTimingRegion(deviceStream); + timers->interaction[iloc].nb_k.openTimingRegion(deviceStream); } /* kernel launch config */ @@ -685,7 +685,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb fillin_ocl_structures(nbp, &nbparams_params); - auto* timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr; + auto* timingEvent = bDoTime ? 
timers->interaction[iloc].nb_k.fetchNextEvent() : nullptr; constexpr char kernelName[] = "k_calc_nb"; const auto kernel = select_nbnxn_kernel(nb, @@ -745,7 +745,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb if (bDoTime) { - t->interaction[iloc].nb_k.closeTimingRegion(deviceStream); + timers->interaction[iloc].nb_k.closeTimingRegion(deviceStream); } } @@ -784,7 +784,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c NBAtomData* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; gpu_plist* plist = nb->plist[iloc]; - cl_timers_t* t = nb->timers; + Nbnxm::GpuTimers* timers = nb->timers; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; bool bDoTime = nb->bDoTime; @@ -834,7 +834,8 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c GpuRegionTimer* timer = nullptr; if (bDoTime) { - timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k); + timer = &(plist->haveFreshList ? timers->interaction[iloc].prune_k + : timers->interaction[iloc].rollingPrune_k); } /* beginning of timed prune calculation section */ @@ -933,7 +934,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, "beginning of the copy back function."); NBAtomData* adat = nb->atdat; - cl_timers_t* t = nb->timers; + Nbnxm::GpuTimers* timers = nb->timers; bool bDoTime = nb->bDoTime; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; @@ -958,7 +959,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* beginning of timed D2H section */ if (bDoTime) { - t->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream); + timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream); } /* With DD the local D2H transfer can only start after the non-local @@ -978,7 +979,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, atomsRange.size(), deviceStream, GpuApiCallBehavior::Async, - bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr); + bDoTime ? 
timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr); /* kick off work */ cl_error = clFlush(deviceStream.stream()); @@ -1009,7 +1010,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, SHIFTS, deviceStream, GpuApiCallBehavior::Async, - bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr); + bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr); } /* DtoH energies */ @@ -1023,7 +1024,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, 1, deviceStream, GpuApiCallBehavior::Async, - bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr); + bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr); static_assert(sizeof(*nb->nbst.eElec) == sizeof(float), "Sizes of host- and device-side electrostatic energy terms should be the " "same."); @@ -1033,13 +1034,13 @@ void gpu_launch_cpyback(NbnxmGpu* nb, 1, deviceStream, GpuApiCallBehavior::Async, - bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr); + bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr); } } if (bDoTime) { - t->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream); + timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream); } } diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index 25cb3158b2..a2ee20416f 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -297,7 +297,7 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, nb->bUseTwoStreams = bLocalAndNonlocal; - nb->timers = new cl_timers_t(); + nb->timers = new Nbnxm::GpuTimers(); snew(nb->timings, 1); /* set device info, just point it to the right GPU among the detected ones */ @@ -424,7 +424,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) int nalloc, natoms; bool realloced; bool bDoTime = nb->bDoTime; - cl_timers_t* timers = nb->timers; + Nbnxm::GpuTimers* timers = nb->timers; NBAtomData* d_atdat = 
nb->atdat; const DeviceContext& deviceContext = *nb->deviceContext_; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h index 474d90700d..c802c4199f 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h @@ -127,11 +127,6 @@ typedef struct cl_nbparam_params } cl_nbparam_params_t; -/** \internal - * \brief Typedef of actual timer type. - */ -typedef struct Nbnxm::gpu_timers_t cl_timers_t; - /*! \internal * \brief Main data structure for OpenCL nonbonded force calculations. */ @@ -206,7 +201,7 @@ struct NbnxmGpu //! True if event-based timing is enabled. bool bDoTime = false; //! OpenCL event-based timers. - cl_timers_t* timers = nullptr; + Nbnxm::GpuTimers* timers = nullptr; //! Timing data. TODO: deprecate this and query timers for accumulated data instead gmx_wallclock_gpu_nbnxn_t* timings = nullptr; }; diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h index c1e23c1a74..fd1d655e3e 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h @@ -85,7 +85,7 @@ struct NbnxmGpu /*! \brief True if event-based timing is enabled. Always false for SYCL. */ bool bDoTime = false; /*! \brief Dummy timers. */ - Nbnxm::gpu_timers_t* timers = nullptr; + Nbnxm::GpuTimers* timers = nullptr; /*! \brief Dummy timing data. */ gmx_wallclock_gpu_nbnxn_t* timings = nullptr;