From 66d82466cf41fa8df221528d6dbdde2d6c621b1a Mon Sep 17 00:00:00 2001 From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= Date: Thu, 22 Oct 2020 09:02:58 +0000 Subject: [PATCH] Add missing GPU force reduction cycle counting Launch timing was missing in the GPU reduction refactoring added in 257d80. This change adds back the cycle counting reducing the time leaking into the rest timer. --- src/gromacs/mdlib/gpuforcereduction.h | 6 +++++- src/gromacs/mdlib/gpuforcereduction_impl.cpp | 3 ++- src/gromacs/mdlib/gpuforcereduction_impl.cu | 21 ++++++++++++++++---- src/gromacs/mdlib/gpuforcereduction_impl.cuh | 5 ++++- src/gromacs/mdrun/runner.cpp | 4 ++-- 5 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/gromacs/mdlib/gpuforcereduction.h b/src/gromacs/mdlib/gpuforcereduction.h index 71a5f712a9..8aad5efaf3 100644 --- a/src/gromacs/mdlib/gpuforcereduction.h +++ b/src/gromacs/mdlib/gpuforcereduction.h @@ -45,6 +45,7 @@ #include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/math/vectypes.h" +#include "gromacs/timing/wallcycle.h" #include "gromacs/utility/arrayref.h" #include "gromacs/utility/classhelpers.h" #include "gromacs/utility/fixedcapacityvector.h" @@ -73,8 +74,11 @@ public: * * \param [in] deviceContext GPU device context * \param [in] deviceStream Stream to use for reduction + * \param [in] wcycle Wall-clock cycle counter */ - GpuForceReduction(const DeviceContext& deviceContext, const DeviceStream& deviceStream); + GpuForceReduction(const DeviceContext& deviceContext, + const DeviceStream& deviceStream, + gmx_wallcycle* wcycle); ~GpuForceReduction(); /*! \brief Register a nbnxm-format force to be reduced diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl.cpp index ff32deef96..69876c7d15 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl.cpp +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cpp @@ -57,7 +57,8 @@ class GpuForceReduction::Impl }; GpuForceReduction::GpuForceReduction(const DeviceContext& /* deviceContext */, - const DeviceStream& /* deviceStream */) : + const DeviceStream& /* deviceStream */, + gmx_wallcycle* /*wcycle*/) : impl_(nullptr) { GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation."); diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cu b/src/gromacs/mdlib/gpuforcereduction_impl.cu index 29c18daf33..afe7b03058 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl.cu +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cu @@ -105,9 +105,12 @@ static __global__ void reduceKernel(const float3* __restrict__ gm_nbnxmForce, return; } -GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream) : +GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext, + const DeviceStream& deviceStream, + gmx_wallcycle* wcycle) : deviceContext_(deviceContext), - deviceStream_(deviceStream){}; + deviceStream_(deviceStream), + wcycle_(wcycle){}; void GpuForceReduction::Impl::reinit(float3* baseForcePtr, const int numAtoms, @@ -123,10 +126,13 @@ void GpuForceReduction::Impl::reinit(float3* baseForcePtr, accumulate_ = accumulate; completionMarker_ = completionMarker; cellInfo_.cell = cell.data(); + + wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); reallocateDeviceBuffer(&cellInfo_.d_cell, numAtoms_, &cellInfo_.cellSize, &cellInfo_.cellSizeAlloc, deviceContext_); copyToDeviceBuffer(&cellInfo_.d_cell, &(cellInfo_.cell[atomStart]), 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Async, nullptr); + wallcycle_stop(wcycle_, ewcLAUNCH_GPU); dependencyList_.clear(); }; @@ -150,6 +156,8 @@ void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* const dependen void GpuForceReduction::Impl::execute() { + wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS); if (numAtoms_ == 0) { @@ -191,12 +199,17 @@ void GpuForceReduction::Impl::execute() { completionMarker_->markEvent(deviceStream_); } + + wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS); + wallcycle_stop(wcycle_, ewcLAUNCH_GPU); } GpuForceReduction::Impl::~Impl(){}; -GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext, const DeviceStream& deviceStream) : - impl_(new Impl(deviceContext, deviceStream)) +GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext, + const DeviceStream& deviceStream, + gmx_wallcycle* wcycle) : + impl_(new Impl(deviceContext, deviceStream, wcycle)) { } diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cuh b/src/gromacs/mdlib/gpuforcereduction_impl.cuh index 536e3fd33b..8434a04c71 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl.cuh +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cuh @@ -73,8 +73,9 @@ public: * * \param [in] deviceStream Stream to use for reduction * \param [in] deviceContext GPU device context + * \param [in] wcycle The wallclock counter */ - Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream); + Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStreami, gmx_wallcycle* wcycle); ~Impl(); /*! \brief Register a nbnxm-format force to be reduced @@ -137,6 +138,8 @@ private: DeviceBuffer rvecForceToAdd_ = nullptr; //! event to be marked when redcution launch has been completed GpuEventSynchronizer* completionMarker_ = nullptr; + //! The wallclock counter + gmx_wallcycle* wcycle_ = nullptr; }; } // namespace gmx diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index 9c60986fa3..4e27012f62 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -1693,10 +1693,10 @@ int Mdrunner::mdrunner() { fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique( deviceStreamManager->context(), - deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal)); + deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal), wcycle); fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique( deviceStreamManager->context(), - deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal)); + deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal), wcycle); } std::unique_ptr stateGpu; -- 2.22.0