From 63131440b15a8412ae85444149a3458197beb788 Mon Sep 17 00:00:00 2001 From: Gaurav Garg Date: Mon, 22 Jun 2020 22:01:13 +0530 Subject: [PATCH] Fix cycle counters for "comm.coord" and "Wait + Comm. F" to support GPU halo exchange path Replace buffer ops counters with GPU launch counters for GPU backend. --- src/gromacs/domdec/domdec.cpp | 5 +-- src/gromacs/domdec/domdec.h | 4 ++- src/gromacs/domdec/gpuhaloexchange.h | 5 ++- src/gromacs/domdec/gpuhaloexchange_impl.cpp | 3 +- src/gromacs/domdec/gpuhaloexchange_impl.cu | 36 ++++++++++++++++++--- src/gromacs/domdec/gpuhaloexchange_impl.cuh | 8 ++++- src/gromacs/mdrun/md.cpp | 2 +- src/gromacs/nbnxm/nbnxm.cpp | 16 ++++----- src/gromacs/timing/wallcycle.cpp | 4 +++ src/gromacs/timing/wallcycle.h | 4 +++ 10 files changed, 68 insertions(+), 19 deletions(-) diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp index 3e4996d3b6..fac3bf9a27 100644 --- a/src/gromacs/domdec/domdec.cpp +++ b/src/gromacs/domdec/domdec.cpp @@ -3202,7 +3202,8 @@ gmx_bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRefgpuHaloExchange.push_back(std::make_unique( cr.dd, cr.mpi_comm_mysim, deviceStreamManager.context(), deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal), - deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal), pulse)); + deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal), pulse, wcycle)); } } } diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h index 6c9908536c..b6c79dca00 100644 --- a/src/gromacs/domdec/domdec.h +++ b/src/gromacs/domdec/domdec.h @@ -318,10 +318,12 @@ void dd_bonded_cg_distance(const gmx::MDLogger& mdlog, * \param[in] mdlog The logger object. * \param[in] cr The commrec object. * \param[in] deviceStreamManager Manager of the GPU context and streams. + * \param[in] wcycle The wallclock counter. */ void constructGpuHaloExchange(const gmx::MDLogger& mdlog, const t_commrec& cr, - const gmx::DeviceStreamManager& deviceStreamManager); + const gmx::DeviceStreamManager& deviceStreamManager, + gmx_wallcycle* wcycle); /*! \brief * (Re-) Initialization for GPU halo exchange diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h index b20ad0a180..9482009410 100644 --- a/src/gromacs/domdec/gpuhaloexchange.h +++ b/src/gromacs/domdec/gpuhaloexchange.h @@ -49,6 +49,7 @@ #include "gromacs/utility/gmxmpi.h" struct gmx_domdec_t; +struct gmx_wallcycle; class DeviceContext; class DeviceStream; class GpuEventSynchronizer; @@ -86,13 +87,15 @@ public: * \param [in] streamLocal local NB CUDA stream. * \param [in] streamNonLocal non-local NB CUDA stream. * \param [in] pulse the communication pulse for this instance + * \param [in] wcycle The wallclock counter */ GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, const DeviceContext& deviceContext, const DeviceStream& streamLocal, const DeviceStream& streamNonLocal, - int pulse); + int pulse, + gmx_wallcycle* wcycle); ~GpuHaloExchange(); /*! \brief diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp index e9106b7c89..247c65b4e0 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp @@ -65,7 +65,8 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */, const DeviceContext& /* deviceContext */, const DeviceStream& /*streamLocal */, const DeviceStream& /*streamNonLocal */, - int /*pulse */) : + int /*pulse */, + gmx_wallcycle* /*wcycle*/) : impl_(nullptr) { GMX_ASSERT(false, diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu index 0eebc6c0e4..cbcff3defb 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cu +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu @@ -61,6 +61,7 @@ #include "gromacs/gpu_utils/vectype_ops.cuh" #include "gromacs/math/vectypes.h" #include "gromacs/pbcutil/ishift.h" +#include "gromacs/timing/wallcycle.h" #include "gromacs/utility/gmxmpi.h" #include "domdec_internal.h" @@ -205,6 +206,9 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_); } + wallcycle_start(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEX); + // launch kernel to pack send buffer KernelLaunchConfig config; config.blockSize[0] = c_threadsPerBlock; @@ -243,8 +247,17 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box "Domdec GPU Apply X Halo Exchange", kernelArgs); } + wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEX); + wallcycle_stop(wcycle_, ewcLAUNCH_GPU); + + // Consider time spent in communicateHaloData as Comm.X counter + // ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync + wallcycle_start(wcycle_, ewcMOVEX); + communicateHaloData(d_x_, HaloQuantity::HaloCoordinates, coordinatesReadyOnDeviceEvent); + wallcycle_stop(wcycle_, ewcMOVEX); + return; } @@ -252,10 +265,18 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box // and before the local buffer operations. It operates in the non-local stream. void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) { + // Consider time spent in communicateHaloData as Comm.F counter + // ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync + wallcycle_start(wcycle_, ewcMOVEF); // Communicate halo data (in non-local stream) communicateHaloData(d_f_, HaloQuantity::HaloForces, nullptr); + wallcycle_stop(wcycle_, ewcMOVEF); + + wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEF); + float3* d_f = d_f_; if (pulse_ == (dd_->comm->cd[0].numPulses() - 1)) @@ -313,6 +334,9 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) { fReadyOnDevice_.markEvent(nonLocalStream_); } + + wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEF); + wallcycle_stop(wcycle_, ewcLAUNCH_GPU); } @@ -385,6 +409,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr, { stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize * DIM * sizeof(float), cudaMemcpyDeviceToDevice, nonLocalStream_.stream()); + CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed"); } @@ -419,7 +444,8 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, const DeviceContext& deviceContext, const DeviceStream& localStream, const DeviceStream& nonLocalStream, - int pulse) : + int pulse, + gmx_wallcycle* wcycle) : dd_(dd), sendRankX_(dd->neighbor[0][1]), recvRankX_(dd->neighbor[0][0]), @@ -431,7 +457,8 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, deviceContext_(deviceContext), localStream_(localStream), nonLocalStream_(nonLocalStream), - pulse_(pulse) + pulse_(pulse), + wcycle_(wcycle) { GMX_RELEASE_ASSERT(GMX_THREAD_MPI, @@ -466,8 +493,9 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd, const DeviceContext& deviceContext, const DeviceStream& localStream, const DeviceStream& nonLocalStream, - int pulse) : - impl_(new Impl(dd, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse)) + int pulse, + gmx_wallcycle* wcycle) : + impl_(new Impl(dd, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse, wcycle)) { } diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh index 9a033ed54f..77aaf3f9bf 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh @@ -52,6 +52,8 @@ #include "gromacs/gpu_utils/hostallocator.h" #include "gromacs/utility/gmxmpi.h" +struct gmx_wallcycle; + namespace gmx { @@ -75,13 +77,15 @@ public: * \param [in] localStream local NB CUDA stream * \param [in] nonLocalStream non-local NB CUDA stream * \param [in] pulse the communication pulse for this instance + * \param [in] wcycle The wallclock counter */ Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, const DeviceContext& deviceContext, const DeviceStream& localStream, const DeviceStream& nonLocalStream, - int pulse); + int pulse, + gmx_wallcycle* wcycle); ~Impl(); /*! \brief @@ -198,6 +202,8 @@ private: int pulse_ = 0; //! Number of zones. Always 1 for 1-D case. const int nzone_ = 1; + //! The wallclock counter + gmx_wallcycle* wcycle_ = nullptr; }; } // namespace gmx diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index 9ee0697c84..5f5f863e81 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp @@ -874,7 +874,7 @@ void gmx::LegacySimulator::do_md() "GPU device manager has to be initialized to use GPU " "version of halo exchange."); // TODO remove need to pass local stream into GPU halo exchange - Issue #3093 - constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager); + constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle); } } } diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index 20495969e0..67890a9606 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -145,13 +145,13 @@ void nonbonded_verlet_t::convertCoordinatesGpu(const gmx::AtomLocality locality, DeviceBuffer d_x, GpuEventSynchronizer* xReadyOnDevice) { - wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); - wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS); + wallcycle_start(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_X_BUF_OPS); nbnxn_atomdata_x_to_nbat_x_gpu(pairSearch_->gridSet(), locality, fillLocal, gpu_nbv, d_x, xReadyOnDevice); - wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS); - wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); + wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_X_BUF_OPS); + wallcycle_stop(wcycle_, ewcLAUNCH_GPU); } gmx::ArrayRef nonbonded_verlet_t::getGridIndices() const @@ -198,14 +198,14 @@ void nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const gmx::AtomLocality lo return; } - wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); - wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS); + wallcycle_start(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS); reduceForcesGpu(locality, totalForcesDevice, pairSearch_->gridSet(), forcesPmeDevice, dependencyList, gpu_nbv, useGpuFPmeReduction, accumulateForce); - wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS); - wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); + wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS); + wallcycle_stop(wcycle_, ewcLAUNCH_GPU); } void nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* const localReductionDone) diff --git a/src/gromacs/timing/wallcycle.cpp b/src/gromacs/timing/wallcycle.cpp index 8c628fab39..4e3e241d96 100644 --- a/src/gromacs/timing/wallcycle.cpp +++ b/src/gromacs/timing/wallcycle.cpp @@ -175,6 +175,10 @@ static const char* wcsn[ewcsNR] = { "NB X buffer ops.", "NB F buffer ops.", "Clear force buffer", + "Launch GPU NB X buffer ops.", + "Launch GPU NB F buffer ops.", + "Launch GPU Comm. coord.", + "Launch GPU Comm. force." "Test subcounter", }; diff --git a/src/gromacs/timing/wallcycle.h b/src/gromacs/timing/wallcycle.h index 785b53cd8e..83be456671 100644 --- a/src/gromacs/timing/wallcycle.h +++ b/src/gromacs/timing/wallcycle.h @@ -127,6 +127,10 @@ enum ewcsNB_X_BUF_OPS, ewcsNB_F_BUF_OPS, ewcsCLEAR_FORCE_BUFFER, + ewcsLAUNCH_GPU_NB_X_BUF_OPS, + ewcsLAUNCH_GPU_NB_F_BUF_OPS, + ewcsLAUNCH_GPU_MOVEX, + ewcsLAUNCH_GPU_MOVEF, ewcsTEST, ewcsNR }; -- 2.22.0