From: Szilárd Páll Date: Fri, 6 Dec 2019 18:57:24 +0000 (+0100) Subject: Add wallcycle counting to StatePropagatorDataGpu X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=3ed86da6032b0a5a69e092a372e927f332a89baf;p=alexxy%2Fgromacs.git Add wallcycle counting to StatePropagatorDataGpu Launch overheads are counted in the main GPU launch overhead counter and a separate subcounter is used for the launch and a main counter for the CPU blocking wait timing. Note that this chnge introduces mdtypes->timing->mdtypes cyclic dependency, the warning on which is suppressed. Fixes #3207 Change-Id: I3b69df9e4888800b43712a42b863958db80f5caa --- diff --git a/docs/doxygen/cycle-suppressions.txt b/docs/doxygen/cycle-suppressions.txt index 577d39c856..d49152a172 100644 --- a/docs/doxygen/cycle-suppressions.txt +++ b/docs/doxygen/cycle-suppressions.txt @@ -26,3 +26,7 @@ utility -> math # modular simulator uses shellfc from mdrun, but is later included in mdrun by simulator builder modularsimulator -> mdrun + +# Cycle counters in timing use comrec for the set up, which is in the mdtypes. This introduces +# cyclic dependencies if the cycle counting is used anywhere in mdtypes. +timing -> mdtypes \ No newline at end of file diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp index be7a977daf..4e66664ae2 100644 --- a/src/gromacs/ewald/pme_only.cpp +++ b/src/gromacs/ewald/pme_only.cpp @@ -643,7 +643,7 @@ int gmx_pmeonly(struct gmx_pme_t* pme, // TODO: Special PME-only constructor is used here. There is no mechanism to prevent from using the other constructor here. // This should be made safer. stateGpu = std::make_unique( - commandStream, deviceContext, GpuApiCallBehavior::Async, paddingSize); + commandStream, deviceContext, GpuApiCallBehavior::Async, paddingSize, wcycle); } diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp index a57195d1f6..fc68e96f41 100644 --- a/src/gromacs/ewald/tests/pmetestcommon.cpp +++ b/src/gromacs/ewald/tests/pmetestcommon.cpp @@ -166,7 +166,7 @@ std::unique_ptr makeStatePropagatorDataGpu(const gmx_pme // restrict one from using other constructor here. return std::make_unique( pme_gpu_get_device_stream(&pme), pme_gpu_get_device_context(&pme), - GpuApiCallBehavior::Sync, pme_gpu_get_padding_size(&pme)); + GpuApiCallBehavior::Sync, pme_gpu_get_padding_size(&pme), nullptr); } //! PME initialization with atom data diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index 2a1a0be8b0..1b699c4a4d 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -1589,7 +1589,7 @@ int Mdrunner::mdrunner() : GpuApiCallBehavior::Sync; stateGpu = std::make_unique( - pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize); + pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize, wcycle); fr->stateGpu = stateGpu.get(); } diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h index c6ae19c589..697f9d2c86 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h @@ -60,6 +60,7 @@ #include "locality.h" class GpuEventSynchronizer; +struct gmx_wallcycle; namespace gmx { @@ -108,13 +109,15 @@ public: * \param[in] deviceContext Device context, nullptr allowed. * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not). * \param[in] paddingSize Padding size for coordinates buffer. + * \param[in] wcycle Wall cycle counter data. */ StatePropagatorDataGpu(const void* pmeStream, const void* localStream, const void* nonLocalStream, const void* deviceContext, GpuApiCallBehavior transferKind, - int paddingSize); + int paddingSize, + gmx_wallcycle* wcycle); /*! \brief Constructor to use in PME-only rank and in tests. * @@ -130,11 +133,13 @@ public: * \param[in] deviceContext Device context, nullptr allowed for non-OpenCL builds. * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not). * \param[in] paddingSize Padding size for coordinates buffer. + * \param[in] wcycle Wall cycle counter data. */ StatePropagatorDataGpu(const void* pmeStream, const void* deviceContext, GpuApiCallBehavior transferKind, - int paddingSize); + int paddingSize, + gmx_wallcycle* wcycle); //! Move constructor StatePropagatorDataGpu(StatePropagatorDataGpu&& other) noexcept; diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp index 9b201b9376..1e2eadaff4 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp @@ -59,7 +59,8 @@ StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */ const void* /* nonLocalStream */, const void* /* deviceContext */, GpuApiCallBehavior /* transferKind */, - int /* paddingSize */) : + int /* paddingSize */, + gmx_wallcycle* /* wcycle */) : impl_(nullptr) { } @@ -67,7 +68,8 @@ StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */ StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */, const void* /* deviceContext */, GpuApiCallBehavior /* transferKind */, - int /* paddingSize */) : + int /* paddingSize */, + gmx_wallcycle* /* wcycle */) : impl_(nullptr) { } diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h index af073b6284..9ac9711d29 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h @@ -58,6 +58,8 @@ #include "gromacs/utility/classhelpers.h" #include "gromacs/utility/enumerationhelpers.h" +struct gmx_wallcycle; + namespace gmx { @@ -108,13 +110,15 @@ public: * \param[in] deviceContext Device context, nullptr allowed. * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not). * \param[in] paddingSize Padding size for coordinates buffer. + * \param[in] wcycle Wall cycle counter data. */ Impl(const void* pmeStream, const void* localStream, const void* nonLocalStream, const void* deviceContext, GpuApiCallBehavior transferKind, - int paddingSize); + int paddingSize, + gmx_wallcycle* wcycle); /*! \brief Constructor to use in PME-only rank and in tests. * @@ -130,8 +134,13 @@ public: * \param[in] deviceContext Device context, nullptr allowed for non-OpenCL builds. * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not). * \param[in] paddingSize Padding size for coordinates buffer. + * \param[in] wcycle Wall cycle counter data. */ - Impl(const void* pmeStream, const void* deviceContext, GpuApiCallBehavior transferKind, int paddingSize); + Impl(const void* pmeStream, + const void* deviceContext, + GpuApiCallBehavior transferKind, + int paddingSize, + gmx_wallcycle* wcycle); ~Impl(); @@ -406,6 +415,9 @@ private: //! Allocation size for the force buffer int d_fCapacity_ = -1; + //! \brief Pointer to wallcycle structure. + gmx_wallcycle* wcycle_; + /*! \brief Performs the copy of data from host to device buffer. * * \todo Template on locality. diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp index f42ad7230e..29821a4313 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp @@ -55,10 +55,12 @@ # endif # include "gromacs/math/vectypes.h" # include "gromacs/mdtypes/state_propagator_data_gpu.h" +# include "gromacs/timing/wallcycle.h" # include "gromacs/utility/classhelpers.h" # include "state_propagator_data_gpu_impl.h" + namespace gmx { @@ -67,9 +69,11 @@ StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, const void* nonLocalStream, const void* deviceContext, GpuApiCallBehavior transferKind, - int paddingSize) : + int paddingSize, + gmx_wallcycle* wcycle) : transferKind_(transferKind), - paddingSize_(paddingSize) + paddingSize_(paddingSize), + wcycle_(wcycle) { static_assert(GMX_GPU != GMX_GPU_NONE, "This object should only be constructed on the GPU code-paths."); @@ -131,9 +135,11 @@ StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, const void* deviceContext, GpuApiCallBehavior transferKind, - int paddingSize) : + int paddingSize, + gmx_wallcycle* wcycle) : transferKind_(transferKind), - paddingSize_(paddingSize) + paddingSize_(paddingSize), + wcycle_(wcycle) { static_assert(GMX_GPU != GMX_GPU_NONE, "This object should only be constructed on the GPU code-paths."); @@ -171,6 +177,9 @@ StatePropagatorDataGpu::Impl::~Impl() {} void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll) { + wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); + numAtomsLocal_ = numAtomsLocal; numAtomsAll_ = numAtomsAll; @@ -203,6 +212,9 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll) { clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, localStream_); } + + wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); + wallcycle_stop(wcycle_, ewcLAUNCH_GPU); } std::tuple StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality) @@ -243,11 +255,13 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer AtomLocality atomLocality, CommandStream commandStream) { - GMX_UNUSED_VALUE(dataSize); GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated."); + wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); + int atomsStartAt, numAtomsToCopy; std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality); @@ -264,6 +278,9 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer copyToDeviceBuffer(&d_data, reinterpret_cast(&h_data.data()[atomsStartAt]), elementsStartAt, numElementsToCopy, commandStream, transferKind_, nullptr); } + + wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); + wallcycle_stop(wcycle_, ewcLAUNCH_GPU); } void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef h_data, @@ -272,11 +289,13 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef h_dat AtomLocality atomLocality, CommandStream commandStream) { - GMX_UNUSED_VALUE(dataSize); GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated."); + wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); + int atomsStartAt, numAtomsToCopy; std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality); @@ -293,6 +312,9 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef h_dat copyFromDeviceBuffer(reinterpret_cast(&h_data.data()[atomsStartAt]), &d_data, elementsStartAt, numElementsToCopy, commandStream, transferKind_, nullptr); } + + wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); + wallcycle_stop(wcycle_, ewcLAUNCH_GPU); } DeviceBuffer StatePropagatorDataGpu::Impl::getCoordinates() @@ -308,6 +330,9 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef h_ GMX_ASSERT(commandStream != nullptr, "No stream is valid for copying forces with given atom locality."); + wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); + copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, commandStream); fReadyOnHost_[atomLocality].markEvent(commandStream); + + wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); + wallcycle_stop(wcycle_, ewcLAUNCH_GPU); } void StatePropagatorDataGpu::Impl::waitForcesReadyOnHost(AtomLocality atomLocality) { + wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA); fReadyOnHost_[atomLocality].waitForEvent(); + wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA); } void* StatePropagatorDataGpu::Impl::getUpdateStream() @@ -488,16 +554,18 @@ StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream, const void* nonLocalStream, const void* deviceContext, GpuApiCallBehavior transferKind, - int paddingSize) : - impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize)) + int paddingSize, + gmx_wallcycle* wcycle) : + impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize, wcycle)) { } StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream, const void* deviceContext, GpuApiCallBehavior transferKind, - int paddingSize) : - impl_(new Impl(pmeStream, deviceContext, transferKind, paddingSize)) + int paddingSize, + gmx_wallcycle* wcycle) : + impl_(new Impl(pmeStream, deviceContext, transferKind, paddingSize, wcycle)) { } diff --git a/src/gromacs/timing/wallcycle.cpp b/src/gromacs/timing/wallcycle.cpp index 59a55b8bd1..638b8dc475 100644 --- a/src/gromacs/timing/wallcycle.cpp +++ b/src/gromacs/timing/wallcycle.cpp @@ -132,6 +132,7 @@ static const char* wcn[ewcNR] = { "Run", "Reduce GPU PME F", "Wait GPU NB nonloc.", "Wait GPU NB local", + "Wait GPU state copy", "NB X/F buffer ops.", "Vsite spread", "COM pull force", @@ -168,6 +169,7 @@ static const char* wcsn[ewcsNR] = { "Launch NB GPU tasks", "Launch Bonded GPU tasks", "Launch PME GPU tasks", + "Launch state copy", "Ewald F correction", "NB X buffer ops.", "NB F buffer ops.", diff --git a/src/gromacs/timing/wallcycle.h b/src/gromacs/timing/wallcycle.h index fa65919c60..c2d5bc8bf5 100644 --- a/src/gromacs/timing/wallcycle.h +++ b/src/gromacs/timing/wallcycle.h @@ -81,6 +81,7 @@ enum ewcPME_GPU_F_REDUCTION, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, + ewcWAIT_GPU_STATE_PROPAGATOR_DATA, ewcNB_XF_BUF_OPS, ewcVSITESPREAD, ewcPULLPOT, @@ -120,6 +121,7 @@ enum ewcsLAUNCH_GPU_NONBONDED, ewcsLAUNCH_GPU_BONDED, ewcsLAUNCH_GPU_PME, + ewcsLAUNCH_STATE_PROPAGATOR_DATA, ewcsEWALD_CORRECTION, ewcsNB_X_BUF_OPS, ewcsNB_F_BUF_OPS,