From b80b7a327b1e8faeb5041021b08df26909b79f67 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= Date: Tue, 10 Nov 2020 12:33:23 +0000 Subject: [PATCH] GPU codepath cycle counting fixes * eliminated double-counting in bonded module initialization * moved the bonded kernel launch counting into the gpuBoned module * avoid counting ~0 CPU update work on the GPU path by conditionally starting the ewcUPDATE counter * also added some minimal improvements to the wallcycle debugging tooling which should facilitate finding invalid nesting and missing stop calls. Refs #3764 --- src/gromacs/listed_forces/gpubondedkernels.cu | 7 +++ src/gromacs/mdlib/sim_util.cpp | 18 +++--- src/gromacs/mdrun/md.cpp | 7 ++- src/gromacs/timing/wallcycle.cpp | 56 +++++++++++++------ 4 files changed, 58 insertions(+), 30 deletions(-) diff --git a/src/gromacs/listed_forces/gpubondedkernels.cu b/src/gromacs/listed_forces/gpubondedkernels.cu index d253241ee7..d8ad851446 100644 --- a/src/gromacs/listed_forces/gpubondedkernels.cu +++ b/src/gromacs/listed_forces/gpubondedkernels.cu @@ -61,6 +61,7 @@ #include "gromacs/mdtypes/interaction_const.h" #include "gromacs/mdtypes/simulation_workload.h" #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh" +#include "gromacs/timing/wallcycle.h" #include "gromacs/utility/gmxassert.h" #include "gpubonded_impl.h" @@ -826,6 +827,9 @@ void GpuBonded::Impl::launchKernel() GMX_ASSERT(haveInteractions_, "Cannot launch bonded GPU kernels unless bonded GPU work was scheduled"); + wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_BONDED); + int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1]; if (fTypeRangeEnd < 0) @@ -839,6 +843,9 @@ void GpuBonded::Impl::launchKernel() launchGpuKernel(kernelPtr, kernelLaunchConfig_, deviceStream_, nullptr, "exec_kernel_gpu", kernelArgs); + + wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED); + wallcycle_stop(wcycle_, ewcLAUNCH_GPU); } void GpuBonded::launchKernel(const gmx::StepWorkload& stepWork) diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 18606f120f..9ef783b67a 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -1341,11 +1341,12 @@ void do_force(FILE* fplog, /* initialize the GPU nbnxm atom data and bonded data structures */ if (simulationWork.useGpuNonbonded) { + // Note: cycle counting only nononbondeds, gpuBonded counts internally wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); - wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED); Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat.get()); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); + wallcycle_stop(wcycle, ewcLAUNCH_GPU); if (fr->gpuBonded) { @@ -1361,7 +1362,6 @@ void do_force(FILE* fplog, nbv->getGridIndices(), top->idef, Nbnxm::gpu_get_xq(nbv->gpu_nbv), Nbnxm::gpu_get_f(nbv->gpu_nbv), Nbnxm::gpu_get_fshift(nbv->gpu_nbv)); } - wallcycle_stop(wcycle, ewcLAUNCH_GPU); } // Need to run after the GPU-offload bonded interaction lists @@ -1415,7 +1415,6 @@ void do_force(FILE* fplog, ddBalanceRegionHandler.openBeforeForceComputationGpu(); wallcycle_start(wcycle, ewcLAUNCH_GPU); - wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED); Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get()); if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps) @@ -1423,18 +1422,18 @@ void do_force(FILE* fplog, Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::Local); } wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); + wallcycle_stop(wcycle, ewcLAUNCH_GPU); // with X buffer ops offloaded to the GPU on all but the search steps // bonded work not split into separate local and non-local, so with DD // we can only launch the kernel after non-local coordinates have been received. if (domainWork.haveGpuBondedWork && !havePPDomainDecomposition(cr)) { - wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED); fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork); - wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED); } /* launch local nonbonded work on GPU */ + wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED); do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local, enbvClearFNo, step, nrnb, wcycle); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); @@ -1517,28 +1516,27 @@ void do_force(FILE* fplog, if (simulationWork.useGpuNonbonded) { - wallcycle_start(wcycle, ewcLAUNCH_GPU); if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps) { + wallcycle_start(wcycle, ewcLAUNCH_GPU); wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED); Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::NonLocal); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); + wallcycle_stop(wcycle, ewcLAUNCH_GPU); } if (domainWork.haveGpuBondedWork) { - wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED); fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork); - wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED); } /* launch non-local nonbonded tasks on GPU */ + wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED); do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); - wallcycle_stop(wcycle, ewcLAUNCH_GPU); } } @@ -1713,8 +1711,10 @@ void do_force(FILE* fplog, // TODO Force flags should include haveFreeEnergyWork for this domain if (stepWork.useGpuXHalo && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)) { + wallcycle_stop(wcycle, ewcFORCE); /* Wait for non-local coordinate data to be copied from device */ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal); + wallcycle_start_nocount(wcycle, ewcFORCE); } // Compute wall interactions, when present. diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index 507e2d0d91..792ca30550 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp @@ -1206,7 +1206,10 @@ void gmx::LegacySimulator::do_md() dvdl_constr = 0; - wallcycle_start(wcycle, ewcUPDATE); + if (!useGpuForUpdate) + { + wallcycle_start(wcycle, ewcUPDATE); + } /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */ if (bTrotter) { @@ -1256,8 +1259,6 @@ void gmx::LegacySimulator::do_md() if (useGpuForUpdate) { - wallcycle_stop(wcycle, ewcUPDATE); - if (bNS && (bFirstStep || DOMAINDECOMP(cr))) { integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(), diff --git a/src/gromacs/timing/wallcycle.cpp b/src/gromacs/timing/wallcycle.cpp index f1b428bf05..a80254755c 100644 --- a/src/gromacs/timing/wallcycle.cpp +++ b/src/gromacs/timing/wallcycle.cpp @@ -60,13 +60,22 @@ static const bool useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS; -/* DEBUG_WCYCLE adds consistency checking for the counters. - * It checks if you stop a counter different from the last +#ifndef DEBUG_WCYCLE +/*! \brief Enables consistency checking for the counters. + * + * If the macro is set to 1, code checks if you stop a counter different from the last * one that was opened and if you do nest too deep. */ -/* #define DEBUG_WCYCLE */ - -#ifdef DEBUG_WCYCLE +# define DEBUG_WCYCLE 0 +#endif +//! Whether wallcycle debugging is enabled +constexpr bool gmx_unused enableWallcycleDebug = (DEBUG_WCYCLE != 0); +//! True if only the master rank should print debugging output +constexpr bool gmx_unused onlyMasterDebugPrints = true; +//! True if cycle counter nesting depth debuggin prints are enabled +constexpr bool gmx_unused debugPrintDepth = false /* enableWallcycleDebug */; + +#if DEBUG_WCYCLE # include "gromacs/utility/fatalerror.h" #endif @@ -86,10 +95,11 @@ struct gmx_wallcycle gmx_bool wc_barrier; wallcc_t* wcc_all; int wc_depth; -#ifdef DEBUG_WCYCLE +#if DEBUG_WCYCLE # define DEPTH_MAX 6 - int counterlist[DEPTH_MAX]; - int count_depth; + int counterlist[DEPTH_MAX]; + int count_depth; + bool isMasterRank; #endif int ewc_prev; gmx_cycles_t cycle_prev; @@ -214,6 +224,7 @@ gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, t_commrec gmx_unused* wc->ewc_prev = -1; wc->reset_counters = resetstep; + #if GMX_MPI if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr) { @@ -241,8 +252,9 @@ gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, t_commrec gmx_unused* snew(wc->wcsc, ewcsNR); } -#ifdef DEBUG_WCYCLE - wc->count_depth = 0; +#if DEBUG_WCYCLE + wc->count_depth = 0; + wc->isMasterRank = MASTER(cr); #endif return wc; @@ -283,24 +295,32 @@ static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle) } -#ifdef DEBUG_WCYCLE +#if DEBUG_WCYCLE static void debug_start_check(gmx_wallcycle_t wc, int ewc) { - /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */ - if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX) { - gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth); + gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth + 1); } wc->counterlist[wc->count_depth] = ewc; wc->count_depth++; + + if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank)) + { + std::string indentStr(4 * wc->count_depth, ' '); + fprintf(stderr, "%swcycle_start depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]); + } } static void debug_stop_check(gmx_wallcycle_t wc, int ewc) { - wc->count_depth--; + if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank)) + { + std::string indentStr(4 * wc->count_depth, ' '); + fprintf(stderr, "%swcycle_stop depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]); + } - /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */ + wc->count_depth--; if (wc->count_depth < 0) { @@ -331,7 +351,7 @@ void wallcycle_start(gmx_wallcycle_t wc, int ewc) } #endif -#ifdef DEBUG_WCYCLE +#if DEBUG_WCYCLE debug_start_check(wc, ewc); #endif @@ -387,7 +407,7 @@ double wallcycle_stop(gmx_wallcycle_t wc, int ewc) } #endif -#ifdef DEBUG_WCYCLE +#if DEBUG_WCYCLE debug_stop_check(wc, ewc); #endif -- 2.22.0