From d1077ceaf9b04a011ec826e3d677dede64fe3ec7 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= Date: Tue, 3 Nov 2020 12:47:16 +0000 Subject: [PATCH] Add missing wallcycle_stop On the GPU update path the Update CPU counter was started and not correctly stopped. Refs #3764 --- src/gromacs/mdlib/update_constrain_gpu.h | 16 +++++--- .../mdlib/update_constrain_gpu_impl.cpp | 3 +- .../mdlib/update_constrain_gpu_impl.cu | 38 +++++++++++++++++-- src/gromacs/mdlib/update_constrain_gpu_impl.h | 17 ++++++--- src/gromacs/mdrun/md.cpp | 4 +- src/gromacs/timing/wallcycle.cpp | 1 + src/gromacs/timing/wallcycle.h | 1 + 7 files changed, 62 insertions(+), 18 deletions(-) diff --git a/src/gromacs/mdlib/update_constrain_gpu.h b/src/gromacs/mdlib/update_constrain_gpu.h index a30cbe825d..581851f9a2 100644 --- a/src/gromacs/mdlib/update_constrain_gpu.h +++ b/src/gromacs/mdlib/update_constrain_gpu.h @@ -46,6 +46,7 @@ #include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/mdtypes/group.h" +#include "gromacs/timing/wallcycle.h" #include "gromacs/utility/arrayref.h" #include "gromacs/utility/classhelpers.h" @@ -69,10 +70,10 @@ public: /*! \brief Create Update-Constrain object. * * The constructor is given a non-nullptr \p deviceStream, in which all the update and constrain - * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify - * coordinates. The event is maintained outside this class and also passed to all (if any) consumers - * of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the - * markEvent(...) method is called unconditionally. + * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that + * modify coordinates. The event is maintained outside this class and also passed to all (if + * any) consumers of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr + * because the markEvent(...) method is called unconditionally. * * \param[in] ir Input record data: LINCS takes number of iterations and order of * projection from it. @@ -80,13 +81,16 @@ public: * and target O-H and H-H distances from this object. * \param[in] deviceContext GPU device context. * \param[in] deviceStream GPU stream to use. - * \param[in] xUpdatedOnDevice The event synchronizer to use to mark that update is done on the GPU. + * \param[in] xUpdatedOnDevice The event synchronizer to use to mark that update is done + * on the GPU. + * \param[in] wcycle The wallclock counter */ UpdateConstrainGpu(const t_inputrec& ir, const gmx_mtop_t& mtop, const DeviceContext& deviceContext, const DeviceStream& deviceStream, - GpuEventSynchronizer* xUpdatedOnDevice); + GpuEventSynchronizer* xUpdatedOnDevice, + gmx_wallcycle* wcycle); ~UpdateConstrainGpu(); diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp index 5c4afd2acd..dc2b0421c3 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp @@ -60,7 +60,8 @@ UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec& /* ir */, const gmx_mtop_t& /* mtop */, const DeviceContext& /* deviceContext */, const DeviceStream& /* deviceStream */, - GpuEventSynchronizer* /* xUpdatedOnDevice */) : + GpuEventSynchronizer* /* xUpdatedOnDevice */, + gmx_wallcycle* /*wcycle*/) : impl_(nullptr) { GMX_ASSERT(!impl_, diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu index b9dd8632db..825890ce82 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu @@ -67,6 +67,7 @@ #include "gromacs/mdlib/settle_gpu.cuh" #include "gromacs/mdlib/update_constrain_gpu.h" #include "gromacs/mdtypes/mdatom.h" +#include "gromacs/timing/wallcycle.h" namespace gmx { @@ -116,6 +117,9 @@ void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer* fRead const float dtPressureCouple, const matrix prVelocityScalingMatrix) { + wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN); + // Clearing virial matrix // TODO There is no point in having separate virial matrix for constraints clear_mat(virial); @@ -145,11 +149,17 @@ void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer* fRead coordinatesReady_->markEvent(deviceStream_); + wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN); + wallcycle_stop(wcycle_, ewcLAUNCH_GPU); + return; } void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix) { + wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN); + ScalingMatrix mu; mu.xx = scalingMatrix[XX][XX]; mu.yy = scalingMatrix[YY][YY]; @@ -165,10 +175,16 @@ void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix) // TODO: Although this only happens on the pressure coupling steps, this synchronization // can affect the performance if nstpcouple is small. deviceStream_.synchronize(); + + wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN); + wallcycle_stop(wcycle_, ewcLAUNCH_GPU); } void UpdateConstrainGpu::Impl::scaleVelocities(const matrix scalingMatrix) { + wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN); + ScalingMatrix mu; mu.xx = scalingMatrix[XX][XX]; mu.yy = scalingMatrix[YY][YY]; @@ -184,16 +200,21 @@ void UpdateConstrainGpu::Impl::scaleVelocities(const matrix scalingMatrix) // TODO: Although this only happens on the pressure coupling steps, this synchronization // can affect the performance if nstpcouple is small. deviceStream_.synchronize(); + + wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN); + wallcycle_stop(wcycle_, ewcLAUNCH_GPU); } UpdateConstrainGpu::Impl::Impl(const t_inputrec& ir, const gmx_mtop_t& mtop, const DeviceContext& deviceContext, const DeviceStream& deviceStream, - GpuEventSynchronizer* xUpdatedOnDevice) : + GpuEventSynchronizer* xUpdatedOnDevice, + gmx_wallcycle* wcycle) : deviceContext_(deviceContext), deviceStream_(deviceStream), - coordinatesReady_(xUpdatedOnDevice) + coordinatesReady_(xUpdatedOnDevice), + wcycle_(wcycle) { GMX_ASSERT(xUpdatedOnDevice != nullptr, "The event synchronizer can not be nullptr."); @@ -217,6 +238,10 @@ void UpdateConstrainGpu::Impl::set(DeviceBuffer d_x, const t_mdatoms& md, const int numTempScaleValues) { + // TODO wallcycle + wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN); + GMX_ASSERT(d_x != nullptr, "Coordinates device buffer should not be null."); GMX_ASSERT(d_v != nullptr, "Velocities device buffer should not be null."); GMX_ASSERT(d_f != nullptr, "Forces device buffer should not be null."); @@ -239,10 +264,14 @@ void UpdateConstrainGpu::Impl::set(DeviceBuffer d_x, coordinateScalingKernelLaunchConfig_.gridSize[0] = (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock; + + wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN); + wallcycle_stop(wcycle_, ewcLAUNCH_GPU); } void UpdateConstrainGpu::Impl::setPbc(const PbcType pbcType, const matrix box) { + // TODO wallcycle setPbcAiuc(numPbcDimensions(pbcType), box, &pbcAiuc_); } @@ -255,8 +284,9 @@ UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec& ir, const gmx_mtop_t& mtop, const DeviceContext& deviceContext, const DeviceStream& deviceStream, - GpuEventSynchronizer* xUpdatedOnDevice) : - impl_(new Impl(ir, mtop, deviceContext, deviceStream, xUpdatedOnDevice)) + GpuEventSynchronizer* xUpdatedOnDevice, + gmx_wallcycle* wcycle) : + impl_(new Impl(ir, mtop, deviceContext, deviceStream, xUpdatedOnDevice, wcycle)) { } diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h index 9ee0677913..7453a98105 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.h +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h @@ -66,10 +66,10 @@ public: /*! \brief Create Update-Constrain object. * * The constructor is given a non-nullptr \p deviceStream, in which all the update and constrain - * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify - * coordinates. The event is maintained outside this class and also passed to all (if any) consumers - * of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the - * markEvent(...) method is called unconditionally. + * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that + * modify coordinates. The event is maintained outside this class and also passed to all (if + * any) consumers of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr + * because the markEvent(...) method is called unconditionally. * * \param[in] ir Input record data: LINCS takes number of iterations and order of * projection from it. @@ -77,13 +77,16 @@ public: * and target O-H and H-H distances from this object. * \param[in] deviceContext GPU device context. * \param[in] deviceStream GPU stream to use. - * \param[in] xUpdatedOnDevice The event synchronizer to use to mark that update is done on the GPU. + * \param[in] xUpdatedOnDevice The event synchronizer to use to mark that + * update is done on the GPU. + * \param[in] wcycle The wallclock counter */ Impl(const t_inputrec& ir, const gmx_mtop_t& mtop, const DeviceContext& deviceContext, const DeviceStream& deviceStream, - GpuEventSynchronizer* xUpdatedOnDevice); + GpuEventSynchronizer* xUpdatedOnDevice, + gmx_wallcycle* wcycle); ~Impl(); @@ -220,6 +223,8 @@ private: //! An pointer to the event to indicate when the update of coordinates is complete GpuEventSynchronizer* coordinatesReady_; + //! The wallclock counter + gmx_wallcycle* wcycle_ = nullptr; }; } // namespace gmx diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index f010e6a10a..c20574e37f 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp @@ -415,7 +415,7 @@ void gmx::LegacySimulator::do_md() integrator = std::make_unique( *ir, *top_global, fr->deviceStreamManager->context(), fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints), - stateGpu->xUpdatedOnDevice()); + stateGpu->xUpdatedOnDevice(), wcycle); integrator->setPbc(PbcType::Xyz, state->box); } @@ -1255,6 +1255,8 @@ void gmx::LegacySimulator::do_md() if (useGpuForUpdate) { + wallcycle_stop(wcycle, ewcUPDATE); + if (bNS && (bFirstStep || DOMAINDECOMP(cr))) { integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(), diff --git a/src/gromacs/timing/wallcycle.cpp b/src/gromacs/timing/wallcycle.cpp index e69df666c1..f1b428bf05 100644 --- a/src/gromacs/timing/wallcycle.cpp +++ b/src/gromacs/timing/wallcycle.cpp @@ -180,6 +180,7 @@ static const char* wcsn[ewcsNR] = { "Launch GPU NB F buffer ops.", "Launch GPU Comm. coord.", "Launch GPU Comm. force.", + "Launch GPU update", "Test subcounter", }; diff --git a/src/gromacs/timing/wallcycle.h b/src/gromacs/timing/wallcycle.h index 0bc53f0fe6..2cbc7fd3cb 100644 --- a/src/gromacs/timing/wallcycle.h +++ b/src/gromacs/timing/wallcycle.h @@ -132,6 +132,7 @@ enum ewcsLAUNCH_GPU_NB_F_BUF_OPS, ewcsLAUNCH_GPU_MOVEX, ewcsLAUNCH_GPU_MOVEF, + ewcsLAUNCH_GPU_UPDATE_CONSTRAIN, ewcsTEST, ewcsNR }; -- 2.22.0