From 2fcd971786e3820dbd30e4dbc6e25ca546e4f27f Mon Sep 17 00:00:00 2001 From: Paul Bauer Date: Wed, 14 Apr 2021 13:52:51 +0000 Subject: [PATCH] Modernize wallcycle counting --- api/nblib/gmxsetup.cpp | 2 +- src/gromacs/applied_forces/awh/awh.cpp | 4 +- src/gromacs/domdec/cellsizes.cpp | 8 +- src/gromacs/domdec/cellsizes.h | 4 +- src/gromacs/domdec/domdec.cpp | 8 +- src/gromacs/domdec/gpuhaloexchange_impl.cu | 32 +- src/gromacs/domdec/partition.cpp | 36 +- src/gromacs/ewald/pme.cpp | 56 +- src/gromacs/ewald/pme_gpu.cpp | 78 +-- src/gromacs/ewald/pme_load_balancing.cpp | 4 +- src/gromacs/ewald/pme_load_balancing.h | 6 +- src/gromacs/ewald/pme_only.cpp | 12 +- src/gromacs/ewald/pme_pp.cpp | 4 +- src/gromacs/fft/fft5d.cpp | 4 +- src/gromacs/fft/fft5d.h | 4 +- src/gromacs/fft/parallel_3dfft.cpp | 4 +- src/gromacs/fft/parallel_3dfft.h | 4 +- src/gromacs/imd/imd.cpp | 12 +- src/gromacs/listed_forces/gpubonded_impl.cu | 16 +- src/gromacs/listed_forces/gpubondedkernels.cu | 8 +- src/gromacs/listed_forces/listed_forces.cpp | 16 +- .../listed_forces/position_restraints.cpp | 4 +- src/gromacs/mdlib/constr.cpp | 4 +- src/gromacs/mdlib/force.cpp | 10 +- src/gromacs/mdlib/gpuforcereduction_impl.cu | 12 +- src/gromacs/mdlib/md_support.cpp | 6 +- src/gromacs/mdlib/md_support.h | 2 +- src/gromacs/mdlib/mdoutf.cpp | 10 +- src/gromacs/mdlib/mdoutf.h | 4 +- src/gromacs/mdlib/resethandler.cpp | 10 +- src/gromacs/mdlib/sim_util.cpp | 156 ++--- src/gromacs/mdlib/trajectory_writing.cpp | 4 +- src/gromacs/mdlib/update.cpp | 20 +- src/gromacs/mdlib/update.h | 4 +- .../mdlib/update_constrain_gpu_impl.cu | 32 +- src/gromacs/mdlib/update_vv.cpp | 22 +- src/gromacs/mdlib/vsite.cpp | 4 +- src/gromacs/mdrun/md.cpp | 18 +- src/gromacs/mdrun/mimic.cpp | 10 +- src/gromacs/mdrun/minimize.cpp | 18 +- src/gromacs/mdrun/rerun.cpp | 6 +- src/gromacs/mdrun/runner.cpp | 34 +- src/gromacs/mdrun/shellfc.cpp | 2 +- src/gromacs/mdrun/shellfc.h | 2 +- src/gromacs/mdrun/tpi.cpp | 2 +- .../state_propagator_data_gpu_impl_gpu.cpp | 80 +-- src/gromacs/modularsimulator/propagator.cpp | 16 +- .../modularsimulator/simulatoralgorithm.cpp | 6 +- .../modularsimulator/statepropagatordata.cpp | 10 +- src/gromacs/nbnxm/gpu_common.h | 10 +- src/gromacs/nbnxm/kerneldispatch.cpp | 12 +- src/gromacs/nbnxm/nbnxm.cpp | 24 +- src/gromacs/nbnxm/prunekerneldispatch.cpp | 8 +- src/gromacs/swap/swapcoords.cpp | 4 +- src/gromacs/timing/tests/timing.cpp | 44 +- src/gromacs/timing/wallcycle.cpp | 551 ++++++++++-------- src/gromacs/timing/wallcycle.h | 296 +++++----- src/gromacs/timing/wallcyclereporting.h | 10 +- 58 files changed, 919 insertions(+), 870 deletions(-) diff --git a/api/nblib/gmxsetup.cpp b/api/nblib/gmxsetup.cpp index fd3cfd2a87..007e0cf3db 100644 --- a/api/nblib/gmxsetup.cpp +++ b/api/nblib/gmxsetup.cpp @@ -206,7 +206,7 @@ void NbvSetupUtil::setupNbnxmInstance(const size_t numParticleTypes, const NBKer // Put everything together auto nbv = std::make_unique( - std::move(pairlistSets), std::move(pairSearch), std::move(atomData), kernelSetup, nullptr, nullWallcycle); + std::move(pairlistSets), std::move(pairSearch), std::move(atomData), kernelSetup, nullptr, nullptr); gmxForceCalculator_->nbv_ = std::move(nbv); } diff --git a/src/gromacs/applied_forces/awh/awh.cpp b/src/gromacs/applied_forces/awh/awh.cpp index 51948de02b..6f2a8db5ea 100644 --- a/src/gromacs/applied_forces/awh/awh.cpp +++ b/src/gromacs/applied_forces/awh/awh.cpp @@ -304,7 +304,7 @@ real Awh::applyBiasForcesAndUpdateBias(PbcType pbcType, GMX_ASSERT(forceWithVirial, "Need a valid ForceWithVirial object"); } - wallcycle_start(wallcycle, ewcAWH); + wallcycle_start(wallcycle, WallCycleCounter::Awh); t_pbc pbc; set_pbc(&pbc, pbcType, box); @@ -394,7 +394,7 @@ real Awh::applyBiasForcesAndUpdateBias(PbcType pbcType, } } - wallcycle_stop(wallcycle, ewcAWH); + wallcycle_stop(wallcycle, WallCycleCounter::Awh); return MASTER(commRecord_) ? static_cast(awhPotential) : 0; } diff --git a/src/gromacs/domdec/cellsizes.cpp b/src/gromacs/domdec/cellsizes.cpp index 37813d8aba..ed1e6dbbcd 100644 --- a/src/gromacs/domdec/cellsizes.cpp +++ b/src/gromacs/domdec/cellsizes.cpp @@ -848,15 +848,15 @@ static void set_dd_cell_sizes_dlb(gmx_domdec_t* dd, gmx_bool bUniform, gmx_bool bDoDLB, int64_t step, - gmx_wallcycle_t wcycle) + gmx_wallcycle* wcycle) { gmx_domdec_comm_t* comm = dd->comm; if (bDoDLB) { - wallcycle_start(wcycle, ewcDDCOMMBOUND); + wallcycle_start(wcycle, WallCycleCounter::DDCommBound); set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step); - wallcycle_stop(wcycle, ewcDDCOMMBOUND); + wallcycle_stop(wcycle, WallCycleCounter::DDCommBound); } else if (bDynamicBox) { @@ -885,7 +885,7 @@ void set_dd_cell_sizes(gmx_domdec_t* dd, gmx_bool bUniform, gmx_bool bDoDLB, int64_t step, - gmx_wallcycle_t wcycle) + gmx_wallcycle* wcycle) { gmx_domdec_comm_t* comm = dd->comm; diff --git a/src/gromacs/domdec/cellsizes.h b/src/gromacs/domdec/cellsizes.h index cf07585e89..794eb27457 100644 --- a/src/gromacs/domdec/cellsizes.h +++ b/src/gromacs/domdec/cellsizes.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -84,6 +84,6 @@ void set_dd_cell_sizes(gmx_domdec_t* dd, gmx_bool bUniform, gmx_bool bDoDLB, int64_t step, - gmx_wallcycle_t wcycle); + gmx_wallcycle* wcycle); #endif diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp index 088189b602..4f7cab7862 100644 --- a/src/gromacs/domdec/domdec.cpp +++ b/src/gromacs/domdec/domdec.cpp @@ -258,7 +258,7 @@ void dd_get_constraint_range(const gmx_domdec_t& dd, int* at_start, int* at_end) void dd_move_x(gmx_domdec_t* dd, const matrix box, gmx::ArrayRef x, gmx_wallcycle* wcycle) { - wallcycle_start(wcycle, ewcMOVEX); + wallcycle_start(wcycle, WallCycleCounter::MoveX); rvec shift = { 0, 0, 0 }; @@ -347,12 +347,12 @@ void dd_move_x(gmx_domdec_t* dd, const matrix box, gmx::ArrayRef x, g nzone += nzone; } - wallcycle_stop(wcycle, ewcMOVEX); + wallcycle_stop(wcycle, WallCycleCounter::MoveX); } void dd_move_f(gmx_domdec_t* dd, gmx::ForceWithShiftForces* forceWithShiftForces, gmx_wallcycle* wcycle) { - wallcycle_start(wcycle, ewcMOVEF); + wallcycle_start(wcycle, WallCycleCounter::MoveF); gmx::ArrayRef f = forceWithShiftForces->force(); gmx::ArrayRef fshift = forceWithShiftForces->shiftForces(); @@ -456,7 +456,7 @@ void dd_move_f(gmx_domdec_t* dd, gmx::ForceWithShiftForces* forceWithShiftForces } nzone /= 2; } - wallcycle_stop(wcycle, ewcMOVEF); + wallcycle_stop(wcycle, WallCycleCounter::MoveF); } /* Convenience function for extracting a real buffer from an rvec buffer diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu index 65af08d35d..f32a95bae7 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cu +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu @@ -134,8 +134,8 @@ __global__ void unpackRecvBufKernel(float3* __restrict__ data, void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_forcesBuffer) { - wallcycle_start(wcycle_, ewcDOMDEC); - wallcycle_sub_start(wcycle_, ewcsDD_GPU); + wallcycle_start(wcycle_, WallCycleCounter::Domdec); + wallcycle_sub_start(wcycle_, WallCycleSubCounter::DDGpu); d_x_ = d_coordinatesBuffer; d_f_ = d_forcesBuffer; @@ -249,8 +249,8 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo } #endif - wallcycle_sub_stop(wcycle_, ewcsDD_GPU); - wallcycle_stop(wcycle_, ewcDOMDEC); + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::DDGpu); + wallcycle_stop(wcycle_, WallCycleCounter::Domdec); return; } @@ -283,14 +283,14 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box GpuEventSynchronizer* coordinatesReadyOnDeviceEvent) { - wallcycle_start(wcycle_, ewcLAUNCH_GPU); + wallcycle_start(wcycle_, WallCycleCounter::LaunchGpu); if (pulse_ == 0) { // ensure stream waits until coordinate data is available on device coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_); } - wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEX); + wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuMoveX); // launch kernel to pack send buffer KernelLaunchConfig config; @@ -328,12 +328,12 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box kernelFn, config, nonLocalStream_, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs); } - wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEX); - wallcycle_stop(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuMoveX); + wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); // Consider time spent in communicateHaloData as Comm.X counter // ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync - wallcycle_start(wcycle_, ewcMOVEX); + wallcycle_start(wcycle_, WallCycleCounter::MoveX); // wait for remote co-ordinates is implicit with process-MPI as non-local stream is synchronized before MPI calls // and MPI_Waitall call makes sure both neighboring ranks' non-local stream is synchronized before data transfer is initiated @@ -345,7 +345,7 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box float3* recvPtr = GMX_THREAD_MPI ? remoteXPtr_ : &d_x_[atomOffset_]; communicateHaloData(d_sendBuf_, xSendSize_, sendRankX_, recvPtr, xRecvSize_, recvRankX_); - wallcycle_stop(wcycle_, ewcMOVEX); + wallcycle_stop(wcycle_, WallCycleCounter::MoveX); return; } @@ -356,17 +356,17 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) { // Consider time spent in communicateHaloData as Comm.F counter // ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync - wallcycle_start(wcycle_, ewcMOVEF); + wallcycle_start(wcycle_, WallCycleCounter::MoveF); float3* recvPtr = GMX_THREAD_MPI ? remoteFPtr_ : d_recvBuf_; // Communicate halo data (in non-local stream) communicateHaloData(&(d_f_[atomOffset_]), fSendSize_, sendRankF_, recvPtr, fRecvSize_, recvRankF_); - wallcycle_stop(wcycle_, ewcMOVEF); + wallcycle_stop(wcycle_, WallCycleCounter::MoveF); - wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); - wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEF); + wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); + wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuMoveF); float3* d_f = d_f_; // If this is the last pulse and index (noting the force halo @@ -422,8 +422,8 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) fReadyOnDevice_.markEvent(nonLocalStream_); } - wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEF); - wallcycle_stop(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuMoveF); + wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); } void GpuHaloExchange::Impl::communicateHaloData(float3* sendPtr, diff --git a/src/gromacs/domdec/partition.cpp b/src/gromacs/domdec/partition.cpp index c87a3077fd..2aa7c35c97 100644 --- a/src/gromacs/domdec/partition.cpp +++ b/src/gromacs/domdec/partition.cpp @@ -754,7 +754,7 @@ static void comm_dd_ns_cell_sizes(gmx_domdec_t* dd, gmx_ddbox_t* ddbox, rvec cel } //! Compute and communicate to determine the load distribution across PP ranks. -static void get_load_distribution(gmx_domdec_t* dd, gmx_wallcycle_t wcycle) +static void get_load_distribution(gmx_domdec_t* dd, gmx_wallcycle* wcycle) { gmx_domdec_comm_t* comm; domdec_load_t* load; @@ -766,7 +766,7 @@ static void get_load_distribution(gmx_domdec_t* dd, gmx_wallcycle_t wcycle) fprintf(debug, "get_load_distribution start\n"); } - wallcycle_start(wcycle, ewcDDCOMMLOAD); + wallcycle_start(wcycle, WallCycleCounter::DDCommLoad); comm = dd->comm; @@ -937,7 +937,7 @@ static void get_load_distribution(gmx_domdec_t* dd, gmx_wallcycle_t wcycle) } } - wallcycle_stop(wcycle, ewcDDCOMMLOAD); + wallcycle_stop(wcycle, WallCycleCounter::DDCommLoad); if (debug) { @@ -2760,7 +2760,7 @@ void dd_partition_system(FILE* fplog, int ncgindex_set; char sbuf[22]; - wallcycle_start(wcycle, ewcDOMDEC); + wallcycle_start(wcycle, WallCycleCounter::Domdec); gmx_domdec_t* dd = cr->dd; gmx_domdec_comm_t* comm = dd->comm; @@ -3059,7 +3059,7 @@ void dd_partition_system(FILE* fplog, int ncg_moved = 0; if (bRedist) { - wallcycle_sub_start(wcycle, ewcsDD_REDIST); + wallcycle_sub_start(wcycle, WallCycleSubCounter::DDRedist); ncgindex_set = dd->ncg_home; dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir, state_local, fr, nrnb, &ncg_moved); @@ -3073,7 +3073,7 @@ void dd_partition_system(FILE* fplog, state_local->x); } - wallcycle_sub_stop(wcycle, ewcsDD_REDIST); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDRedist); } RVec cell_ns_x0, cell_ns_x1; @@ -3095,7 +3095,7 @@ void dd_partition_system(FILE* fplog, if (bSortCG) { - wallcycle_sub_start(wcycle, ewcsDD_GRID); + wallcycle_sub_start(wcycle, WallCycleSubCounter::DDGrid); /* Sort the state on charge group position. * This enables exact restarts from this step. @@ -3136,7 +3136,7 @@ void dd_partition_system(FILE* fplog, dd->ga2la->clear(); ncgindex_set = 0; - wallcycle_sub_stop(wcycle, ewcsDD_GRID); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDGrid); } else { @@ -3157,7 +3157,7 @@ void dd_partition_system(FILE* fplog, comm->updateGroupsCog->clear(); } - wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM); + wallcycle_sub_start(wcycle, WallCycleSubCounter::DDSetupComm); /* Set the induces for the home atoms */ set_zones_ncg_home(dd); @@ -3175,14 +3175,14 @@ void dd_partition_system(FILE* fplog, /* When bSortCG=true, we have already set the size for zone 0 */ set_zones_size(dd, state_local->box, &ddbox, bSortCG ? 1 : 0, comm->zones.n, 0); - wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDSetupComm); /* write_dd_pdb("dd_home",step,"dump",top_global,cr, -1,state_local->x.rvec_array(),state_local->box); */ - wallcycle_sub_start(wcycle, ewcsDD_MAKETOP); + wallcycle_sub_start(wcycle, WallCycleSubCounter::DDMakeTop); /* Extract a local topology from the global topology */ IVec numPulses; @@ -3201,9 +3201,9 @@ void dd_partition_system(FILE* fplog, top_global, top_local); - wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDMakeTop); - wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR); + wallcycle_sub_start(wcycle, WallCycleSubCounter::DDMakeConstr); /* Set up the special atom communication */ int n = comm->atomRanges.end(DDAtomRanges::Type::Zones); @@ -3238,9 +3238,9 @@ void dd_partition_system(FILE* fplog, comm->atomRanges.setEnd(range, n); } - wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDMakeConstr); - wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER); + wallcycle_sub_start(wcycle, WallCycleSubCounter::DDTopOther); /* Make space for the extra coordinates for virtual site * or constraint communication. @@ -3325,11 +3325,11 @@ void dd_partition_system(FILE* fplog, */ dd_move_x_vsites(*dd, state_local->box, state_local->x.rvec_array()); - wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDTopOther); if (comm->ddSettings.nstDDDump > 0 && step % comm->ddSettings.nstDDDump == 0) { - dd_move_x(dd, state_local->box, state_local->x, nullWallcycle); + dd_move_x(dd, state_local->box, state_local->x, nullptr); write_dd_pdb("dd_dump", step, "dump", @@ -3361,7 +3361,7 @@ void dd_partition_system(FILE* fplog, check_index_consistency(dd, top_global.natoms, "after partitioning"); } - wallcycle_stop(wcycle, ewcDOMDEC); + wallcycle_stop(wcycle, WallCycleCounter::Domdec); } } // namespace gmx diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp index bd96d59070..5f9ffb42d9 100644 --- a/src/gromacs/ewald/pme.cpp +++ b/src/gromacs/ewald/pme.cpp @@ -1202,10 +1202,10 @@ int gmx_pme_do(struct gmx_pme_t* pme, } else { - wallcycle_start(wcycle, ewcPME_REDISTXF); + wallcycle_start(wcycle, WallCycleCounter::PmeRedistXF); do_redist_pos_coeffs(pme, cr, bFirst, coordinates, coefficient); - wallcycle_stop(wcycle, ewcPME_REDISTXF); + wallcycle_stop(wcycle, WallCycleCounter::PmeRedistXF); } if (debug) @@ -1213,7 +1213,7 @@ int gmx_pme_do(struct gmx_pme_t* pme, fprintf(debug, "Rank= %6d, pme local particles=%6d\n", cr->nodeid, atc.numAtoms()); } - wallcycle_start(wcycle, ewcPME_SPREAD); + wallcycle_start(wcycle, WallCycleCounter::PmeSpread); /* Spread the coefficients on a grid */ spread_on_grid(pme, &atc, pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index); @@ -1237,7 +1237,7 @@ int gmx_pme_do(struct gmx_pme_t* pme, copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index); } - wallcycle_stop(wcycle, ewcPME_SPREAD); + wallcycle_stop(wcycle, WallCycleCounter::PmeSpread); /* TODO If the OpenMP and single-threaded implementations converge, then spread_on_grid() and @@ -1256,18 +1256,20 @@ int gmx_pme_do(struct gmx_pme_t* pme, /* do 3d-fft */ if (thread == 0) { - wallcycle_start(wcycle, ewcPME_FFT); + wallcycle_start(wcycle, WallCycleCounter::PmeFft); } gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX, thread, wcycle); if (thread == 0) { - wallcycle_stop(wcycle, ewcPME_FFT); + wallcycle_stop(wcycle, WallCycleCounter::PmeFft); } /* solve in k-space for our local cells */ if (thread == 0) { - wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME)); + wallcycle_start( + wcycle, + (grid_index < DO_Q ? WallCycleCounter::PmeSolve : WallCycleCounter::LJPme)); } if (grid_index < DO_Q) { @@ -1292,19 +1294,21 @@ int gmx_pme_do(struct gmx_pme_t* pme, if (thread == 0) { - wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME)); + wallcycle_stop( + wcycle, + (grid_index < DO_Q ? WallCycleCounter::PmeSolve : WallCycleCounter::LJPme)); inc_nrnb(nrnb, eNR_SOLVEPME, loop_count); } /* do 3d-invfft */ if (thread == 0) { - wallcycle_start(wcycle, ewcPME_FFT); + wallcycle_start(wcycle, WallCycleCounter::PmeFft); } gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL, thread, wcycle); if (thread == 0) { - wallcycle_stop(wcycle, ewcPME_FFT); + wallcycle_stop(wcycle, WallCycleCounter::PmeFft); if (pme->nodeid == 0) @@ -1317,7 +1321,7 @@ int gmx_pme_do(struct gmx_pme_t* pme, /* Note: this wallcycle region is closed below outside an OpenMP region, so take care if refactoring code here. */ - wallcycle_start(wcycle, ewcPME_GATHER); + wallcycle_start(wcycle, WallCycleCounter::PmeGather); } copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread); @@ -1366,7 +1370,7 @@ int gmx_pme_do(struct gmx_pme_t* pme, inc_nrnb(nrnb, eNR_GATHERFBSP, pme->pme_order * pme->pme_order * pme->pme_order * atc.numAtoms()); /* Note: this wallcycle region is opened above inside an OpenMP region, so take care if refactoring code here. */ - wallcycle_stop(wcycle, ewcPME_GATHER); + wallcycle_stop(wcycle, WallCycleCounter::PmeGather); } if (computeEnergyAndVirial) @@ -1431,7 +1435,7 @@ int gmx_pme_do(struct gmx_pme_t* pme, break; default: gmx_incons("Trying to access wrong FEP-state in LJ-PME routine"); } - wallcycle_start(wcycle, ewcPME_REDISTXF); + wallcycle_start(wcycle, WallCycleCounter::PmeRedistXF); do_redist_pos_coeffs(pme, cr, bFirst, coordinates, RedistC6); pme->lb_buf1.resize(atc.numAtoms()); @@ -1449,7 +1453,7 @@ int gmx_pme_do(struct gmx_pme_t* pme, local_sigma[i] = atc.coefficient[i]; } - wallcycle_stop(wcycle, ewcPME_REDISTXF); + wallcycle_stop(wcycle, WallCycleCounter::PmeRedistXF); } atc.coefficient = coefficientBuffer; calc_initial_lb_coeffs(coefficientBuffer, local_c6, local_sigma); @@ -1464,7 +1468,7 @@ int gmx_pme_do(struct gmx_pme_t* pme, calc_next_lb_coeffs(coefficientBuffer, local_sigma); grid = pmegrid->grid.grid; - wallcycle_start(wcycle, ewcPME_SPREAD); + wallcycle_start(wcycle, WallCycleCounter::PmeSpread); /* Spread the c6 on a grid */ spread_on_grid(pme, &atc, pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index); @@ -1486,7 +1490,7 @@ int gmx_pme_do(struct gmx_pme_t* pme, } copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index); } - wallcycle_stop(wcycle, ewcPME_SPREAD); + wallcycle_stop(wcycle, WallCycleCounter::PmeSpread); /*Here we start a large thread parallel region*/ #pragma omp parallel num_threads(pme->nthread) private(thread) @@ -1497,13 +1501,13 @@ int gmx_pme_do(struct gmx_pme_t* pme, /* do 3d-fft */ if (thread == 0) { - wallcycle_start(wcycle, ewcPME_FFT); + wallcycle_start(wcycle, WallCycleCounter::PmeFft); } gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX, thread, wcycle); if (thread == 0) { - wallcycle_stop(wcycle, ewcPME_FFT); + wallcycle_stop(wcycle, WallCycleCounter::PmeFft); } } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR @@ -1519,7 +1523,7 @@ int gmx_pme_do(struct gmx_pme_t* pme, thread = gmx_omp_get_thread_num(); if (thread == 0) { - wallcycle_start(wcycle, ewcLJPME); + wallcycle_start(wcycle, WallCycleCounter::LJPme); } loop_count = @@ -1532,7 +1536,7 @@ int gmx_pme_do(struct gmx_pme_t* pme, thread); if (thread == 0) { - wallcycle_stop(wcycle, ewcLJPME); + wallcycle_stop(wcycle, WallCycleCounter::LJPme); inc_nrnb(nrnb, eNR_SOLVEPME, loop_count); } } @@ -1565,13 +1569,13 @@ int gmx_pme_do(struct gmx_pme_t* pme, /* do 3d-invfft */ if (thread == 0) { - wallcycle_start(wcycle, ewcPME_FFT); + wallcycle_start(wcycle, WallCycleCounter::PmeFft); } gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL, thread, wcycle); if (thread == 0) { - wallcycle_stop(wcycle, ewcPME_FFT); + wallcycle_stop(wcycle, WallCycleCounter::PmeFft); if (pme->nodeid == 0) @@ -1580,7 +1584,7 @@ int gmx_pme_do(struct gmx_pme_t* pme, npme = static_cast(ntot * std::log(ntot) / std::log(2.0)); inc_nrnb(nrnb, eNR_FFT, 2 * npme); } - wallcycle_start(wcycle, ewcPME_GATHER); + wallcycle_start(wcycle, WallCycleCounter::PmeGather); } copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread); @@ -1619,7 +1623,7 @@ int gmx_pme_do(struct gmx_pme_t* pme, eNR_GATHERFBSP, pme->pme_order * pme->pme_order * pme->pme_order * pme->atc[0].numAtoms()); } - wallcycle_stop(wcycle, ewcPME_GATHER); + wallcycle_stop(wcycle, WallCycleCounter::PmeGather); bFirst = FALSE; } /* for (grid_index = 8; grid_index >= 2; --grid_index) */ @@ -1628,7 +1632,7 @@ int gmx_pme_do(struct gmx_pme_t* pme, if (stepWork.computeForces && pme->nnodes > 1) { - wallcycle_start(wcycle, ewcPME_REDISTXF); + wallcycle_start(wcycle, WallCycleCounter::PmeRedistXF); for (d = 0; d < pme->ndecompdim; d++) { gmx::ArrayRef forcesRef; @@ -1648,7 +1652,7 @@ int gmx_pme_do(struct gmx_pme_t* pme, } } - wallcycle_stop(wcycle, ewcPME_REDISTXF); + wallcycle_stop(wcycle, WallCycleCounter::PmeRedistXF); } if (computeEnergyAndVirial) diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp index 225fb1050a..564e213af9 100644 --- a/src/gromacs/ewald/pme_gpu.cpp +++ b/src/gromacs/ewald/pme_gpu.cpp @@ -123,25 +123,25 @@ int pme_gpu_get_block_size(const gmx_pme_t* pme) void inline parallel_3dfft_execute_gpu_wrapper(gmx_pme_t* pme, const int gridIndex, enum gmx_fft_direction dir, - gmx_wallcycle_t wcycle) + gmx_wallcycle* wcycle) { if (pme_gpu_settings(pme->gpu).performGPUFFT) { - wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); - wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); + wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu); + wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme); pme_gpu_3dfft(pme->gpu, dir, gridIndex); - wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); - wallcycle_stop(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme); + wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu); } else { - wallcycle_start(wcycle, ewcPME_FFT_MIXED_MODE); + wallcycle_start(wcycle, WallCycleCounter::PmeFftMixedMode); #pragma omp parallel for num_threads(pme->nthread) schedule(static) for (int thread = 0; thread < pme->nthread; thread++) { gmx_parallel_3dfft_execute(pme->pfft_setup[gridIndex], dir, thread, wcycle); } - wallcycle_stop(wcycle, ewcPME_FFT_MIXED_MODE); + wallcycle_stop(wcycle, WallCycleCounter::PmeFftMixedMode); } } @@ -172,11 +172,11 @@ void pme_gpu_prepare_computation(gmx_pme_t* pme, if (stepWork.haveDynamicBox || shouldUpdateBox) // || is to make the first computation always update { - wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); - wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); + wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu); + wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme); pme_gpu_update_input_box(pmeGpu, box); - wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); - wallcycle_stop(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme); + wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu); if (!pme_gpu_settings(pmeGpu).performGPUSolve) { @@ -213,11 +213,11 @@ void pme_gpu_launch_spread(gmx_pme_t* pme, /* Spread the coefficients on a grid */ const bool computeSplines = true; const bool spreadCharges = true; - wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); - wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); + wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu); + wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme); pme_gpu_spread(pmeGpu, xReadyOnDevice, fftgrids, computeSplines, spreadCharges, lambdaQ); - wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); - wallcycle_stop(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme); + wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu); } void pme_gpu_launch_complex_transforms(gmx_pme_t* pme, gmx_wallcycle* wcycle, const gmx::StepWorkload& stepWork) @@ -228,9 +228,9 @@ void pme_gpu_launch_complex_transforms(gmx_pme_t* pme, gmx_wallcycle* wcycle, co const bool computeEnergyAndVirial = stepWork.computeEnergy || stepWork.computeVirial; if (!settings.performGPUFFT) { - wallcycle_start(wcycle, ewcWAIT_GPU_PME_SPREAD); + wallcycle_start(wcycle, WallCycleCounter::WaitGpuPmeSpread); pme_gpu_sync_spread_grid(pme->gpu); - wallcycle_stop(wcycle, ewcWAIT_GPU_PME_SPREAD); + wallcycle_stop(wcycle, WallCycleCounter::WaitGpuPmeSpread); } try @@ -248,21 +248,21 @@ void pme_gpu_launch_complex_transforms(gmx_pme_t* pme, gmx_wallcycle* wcycle, co { const auto gridOrdering = settings.useDecomposition ? GridOrdering::YZX : GridOrdering::XYZ; - wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); - wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); + wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu); + wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme); pme_gpu_solve(pmeGpu, gridIndex, cfftgrid, gridOrdering, computeEnergyAndVirial); - wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); - wallcycle_stop(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme); + wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu); } else { - wallcycle_start(wcycle, ewcPME_SOLVE_MIXED_MODE); + wallcycle_start(wcycle, WallCycleCounter::PmeSolveMixedMode); #pragma omp parallel for num_threads(pme->nthread) schedule(static) for (int thread = 0; thread < pme->nthread; thread++) { solve_pme_yzx(pme, cfftgrid, pme->boxVolume, computeEnergyAndVirial, pme->nthread, thread); } - wallcycle_stop(wcycle, ewcPME_SOLVE_MIXED_MODE); + wallcycle_stop(wcycle, WallCycleCounter::PmeSolveMixedMode); } parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_COMPLEX_TO_REAL, wcycle); @@ -280,13 +280,13 @@ void pme_gpu_launch_gather(const gmx_pme_t* pme, gmx_wallcycle gmx_unused* wcycl return; } - wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); - wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); + wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu); + wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme); float** fftgrids = pme->fftgrid; pme_gpu_gather(pme->gpu, fftgrids, lambdaQ); - wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); - wallcycle_stop(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme); + wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu); } //! Accumulate the \c forcesToAdd to \c f, using the available threads. @@ -309,7 +309,7 @@ static void pme_gpu_reduce_outputs(const bool computeEnergyAndVirial, gmx::ForceWithVirial* forceWithVirial, gmx_enerdata_t* enerd) { - wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION); + wallcycle_start(wcycle, WallCycleCounter::PmeGpuFReduction); GMX_ASSERT(forceWithVirial, "Invalid force pointer"); if (computeEnergyAndVirial) @@ -323,7 +323,7 @@ static void pme_gpu_reduce_outputs(const bool computeEnergyAndVirial, { sum_forces(forceWithVirial->force_, output.forces_); } - wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION); + wallcycle_stop(wcycle, WallCycleCounter::PmeGpuFReduction); } bool pme_gpu_try_finish_task(gmx_pme_t* pme, @@ -348,11 +348,11 @@ bool pme_gpu_try_finish_task(gmx_pme_t* pme, // TODO: implement c_streamQuerySupported with an additional GpuEventSynchronizer per stream (#2521) if ((completionKind == GpuTaskCompletion::Check) && c_streamQuerySupported) { - wallcycle_start_nocount(wcycle, ewcWAIT_GPU_PME_GATHER); + wallcycle_start_nocount(wcycle, WallCycleCounter::WaitGpuPmeGather); // Query the PME stream for completion of all tasks enqueued and // if we're not done, stop the timer before early return. const bool pmeGpuDone = pme_gpu_stream_query(pme->gpu); - wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER); + wallcycle_stop(wcycle, WallCycleCounter::WaitGpuPmeGather); if (!pmeGpuDone) { @@ -361,7 +361,7 @@ bool pme_gpu_try_finish_task(gmx_pme_t* pme, needToSynchronize = false; } - wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER); + wallcycle_start(wcycle, WallCycleCounter::WaitGpuPmeGather); // If the above check passed, then there is no need to make an // explicit synchronization call. if (needToSynchronize) @@ -374,7 +374,7 @@ bool pme_gpu_try_finish_task(gmx_pme_t* pme, const bool computeEnergyAndVirial = stepWork.computeEnergy || stepWork.computeVirial; PmeOutput output = pme_gpu_getOutput( *pme, computeEnergyAndVirial, pme->gpu->common->ngrids > 1 ? lambdaQ : 1.0); - wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER); + wallcycle_stop(wcycle, WallCycleCounter::WaitGpuPmeGather); GMX_ASSERT(pme->gpu->settings.useGpuForceReduction == !output.haveForceOutput_, "When forces are reduced on the CPU, there needs to be force output"); @@ -391,7 +391,7 @@ PmeOutput pme_gpu_wait_finish_task(gmx_pme_t* pme, { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); - wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER); + wallcycle_start(wcycle, WallCycleCounter::WaitGpuPmeGather); // Synchronize the whole PME stream at once, including D2H result transfers // if there are outputs we need to wait for at this step; we still call getOutputs @@ -403,7 +403,7 @@ PmeOutput pme_gpu_wait_finish_task(gmx_pme_t* pme, PmeOutput output = pme_gpu_getOutput( *pme, computeEnergyAndVirial, pme->gpu->common->ngrids > 1 ? lambdaQ : 1.0); - wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER); + wallcycle_stop(wcycle, WallCycleCounter::WaitGpuPmeGather); return output; } @@ -428,16 +428,16 @@ void pme_gpu_reinit_computation(const gmx_pme_t* pme, gmx_wallcycle* wcycle) { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); - wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); - wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); + wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu); + wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme); pme_gpu_update_timings(pme->gpu); pme_gpu_clear_grids(pme->gpu); pme_gpu_clear_energy_virial(pme->gpu); - wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); - wallcycle_stop(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme); + wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu); } DeviceBuffer pme_gpu_get_device_f(const gmx_pme_t* pme) diff --git a/src/gromacs/ewald/pme_load_balancing.cpp b/src/gromacs/ewald/pme_load_balancing.cpp index 19778b8999..2e900d3bd4 100644 --- a/src/gromacs/ewald/pme_load_balancing.cpp +++ b/src/gromacs/ewald/pme_load_balancing.cpp @@ -928,7 +928,7 @@ void pme_loadbal_do(pme_load_balancing_t* pme_lb, t_forcerec* fr, const matrix box, gmx::ArrayRef x, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, int64_t step, int64_t step_rel, gmx_bool* bPrinting, @@ -946,7 +946,7 @@ void pme_loadbal_do(pme_load_balancing_t* pme_lb, n_prev = pme_lb->cycles_n; cycles_prev = pme_lb->cycles_c; - wallcycle_get(wcycle, ewcSTEP, &pme_lb->cycles_n, &pme_lb->cycles_c); + wallcycle_get(wcycle, WallCycleCounter::Step, &pme_lb->cycles_n, &pme_lb->cycles_c); /* Before the first step we haven't done any steps yet. * Also handle cases where ir.init_step % ir.nstlist != 0. diff --git a/src/gromacs/ewald/pme_load_balancing.h b/src/gromacs/ewald/pme_load_balancing.h index bb98635ca2..38ba5c74e6 100644 --- a/src/gromacs/ewald/pme_load_balancing.h +++ b/src/gromacs/ewald/pme_load_balancing.h @@ -2,7 +2,7 @@ * This file is part of the GROMACS molecular simulation package. * * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team. - * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -92,7 +92,7 @@ void pme_loadbal_init(pme_load_balancing_t** pme_lb_p, * * Process the cycles measured over the last nstlist steps and then * either continue balancing or check if we need to trigger balancing. - * Should be called after the ewcSTEP cycle counter has been stopped. + * Should be called after the WallCycleCounter::Step cycle counter has been stopped. * Returns if the load balancing is printing to fp_err. */ void pme_loadbal_do(pme_load_balancing_t* pme_lb, @@ -104,7 +104,7 @@ void pme_loadbal_do(pme_load_balancing_t* pme_lb, t_forcerec* fr, const matrix box, gmx::ArrayRef x, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, int64_t step, int64_t step_rel, gmx_bool* bPrinting, diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp index 4542a05f42..5130034a9f 100644 --- a/src/gromacs/ewald/pme_only.cpp +++ b/src/gromacs/ewald/pme_only.cpp @@ -167,17 +167,17 @@ static std::unique_ptr gmx_pme_pp_init(const t_commrec* cr) return pme_pp; } -static void reset_pmeonly_counters(gmx_wallcycle_t wcycle, +static void reset_pmeonly_counters(gmx_wallcycle* wcycle, gmx_walltime_accounting_t walltime_accounting, t_nrnb* nrnb, int64_t step, bool useGpuForPme) { /* Reset all the counters related to performance over the run */ - wallcycle_stop(wcycle, ewcRUN); + wallcycle_stop(wcycle, WallCycleCounter::Run); wallcycle_reset_all(wcycle); *nrnb = { 0 }; - wallcycle_start(wcycle, ewcRUN); + wallcycle_start(wcycle, WallCycleCounter::Run); walltime_accounting_reset_time(walltime_accounting, step); if (useGpuForPme) @@ -708,11 +708,11 @@ int gmx_pmeonly(struct gmx_pme_t* pme, if (count == 0) { - wallcycle_start(wcycle, ewcRUN); + wallcycle_start(wcycle, WallCycleCounter::Run); walltime_accounting_start_time(walltime_accounting); } - wallcycle_start(wcycle, ewcPMEMESH); + wallcycle_start(wcycle, WallCycleCounter::PmeMesh); dvdlambda_q = 0; dvdlambda_lj = 0; @@ -779,7 +779,7 @@ int gmx_pmeonly(struct gmx_pme_t* pme, output.forces_ = pme_pp->f; } - cycles = wallcycle_stop(wcycle, ewcPMEMESH); + cycles = wallcycle_stop(wcycle, WallCycleCounter::PmeMesh); gmx_pme_send_force_vir_ener(pme_pp.get(), output, dvdlambda_q, dvdlambda_lj, cycles); count++; diff --git a/src/gromacs/ewald/pme_pp.cpp b/src/gromacs/ewald/pme_pp.cpp index 3a929daf1f..63693ed6a6 100644 --- a/src/gromacs/ewald/pme_pp.cpp +++ b/src/gromacs/ewald/pme_pp.cpp @@ -371,7 +371,7 @@ void gmx_pme_send_coordinates(t_forcerec* fr, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent, gmx_wallcycle* wcycle) { - wallcycle_start(wcycle, ewcPP_PMESENDX); + wallcycle_start(wcycle, WallCycleCounter::PpPmeSendX); unsigned int flags = PP_PME_COORD; if (computeEnergyAndVirial) @@ -399,7 +399,7 @@ void gmx_pme_send_coordinates(t_forcerec* fr, sendCoordinatesFromGpu, coordinatesReadyOnDeviceEvent); - wallcycle_stop(wcycle, ewcPP_PMESENDX); + wallcycle_stop(wcycle, WallCycleCounter::PpPmeSendX); } void gmx_pme_send_finish(const t_commrec* cr) diff --git a/src/gromacs/fft/fft5d.cpp b/src/gromacs/fft/fft5d.cpp index 71020eec6b..be74cbaf6d 100644 --- a/src/gromacs/fft/fft5d.cpp +++ b/src/gromacs/fft/fft5d.cpp @@ -1286,7 +1286,7 @@ void fft5d_execute(fft5d_plan plan, int thread, fft5d_time times) time = MPI_Wtime(); } #else - wallcycle_start(times, ewcPME_FFTCOMM); + wallcycle_start(times, WallCycleCounter::PmeFftComm); #endif #ifdef FFT5D_MPI_TRANSPOSE FFTW(execute)(mpip[s]); @@ -1323,7 +1323,7 @@ void fft5d_execute(fft5d_plan plan, int thread, fft5d_time times) time_mpi[s] = MPI_Wtime() - time; } #else - wallcycle_stop(times, ewcPME_FFTCOMM); + wallcycle_stop(times, WallCycleCounter::PmeFftComm); #endif } /*master*/ } /* bPrallelDim */ diff --git a/src/gromacs/fft/fft5d.h b/src/gromacs/fft/fft5d.h index 71d86817a0..b4e5d008e4 100644 --- a/src/gromacs/fft/fft5d.h +++ b/src/gromacs/fft/fft5d.h @@ -2,7 +2,7 @@ * This file is part of the GROMACS molecular simulation package. * * Copyright (c) 2009-2017, The GROMACS development team. - * Copyright (c) 2019, by the GROMACS development team, led by + * Copyright (c) 2019,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -73,7 +73,7 @@ struct fft5d_time_t typedef struct fft5d_time_t* fft5d_time; #else # include "gromacs/timing/wallcycle.h" -typedef gmx_wallcycle_t fft5d_time; +typedef gmx_wallcycle* fft5d_time; #endif namespace gmx diff --git a/src/gromacs/fft/parallel_3dfft.cpp b/src/gromacs/fft/parallel_3dfft.cpp index 24a679fc30..33d916ec20 100644 --- a/src/gromacs/fft/parallel_3dfft.cpp +++ b/src/gromacs/fft/parallel_3dfft.cpp @@ -2,7 +2,7 @@ * This file is part of the GROMACS molecular simulation package. * * Copyright (c) 1991-2005 David van der Spoel, Erik Lindahl, University of Groningen. - * Copyright (c) 2013,2014,2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2013,2014,2017,2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -170,7 +170,7 @@ int gmx_parallel_3dfft_complex_limits(gmx_parallel_3dfft_t pfft_setup, int gmx_parallel_3dfft_execute(gmx_parallel_3dfft_t pfft_setup, enum gmx_fft_direction dir, int thread, - gmx_wallcycle_t wcycle) + gmx_wallcycle* wcycle) { if (((pfft_setup->p1->flags & FFT5D_REALCOMPLEX) == 0) ^ (dir == GMX_FFT_FORWARD || dir == GMX_FFT_BACKWARD)) diff --git a/src/gromacs/fft/parallel_3dfft.h b/src/gromacs/fft/parallel_3dfft.h index 91cd66103f..5652f4c422 100644 --- a/src/gromacs/fft/parallel_3dfft.h +++ b/src/gromacs/fft/parallel_3dfft.h @@ -2,7 +2,7 @@ * This file is part of the GROMACS molecular simulation package. * * Copyright (c) 1991-2005 David van der Spoel, Erik Lindahl, University of Groningen. - * Copyright (c) 2013,2014,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2013,2014,2017,2018,2019,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -104,7 +104,7 @@ int gmx_parallel_3dfft_complex_limits(gmx_parallel_3dfft_t pfft_setup, int gmx_parallel_3dfft_execute(gmx_parallel_3dfft_t pfft_setup, enum gmx_fft_direction dir, int thread, - gmx_wallcycle_t wcycle); + gmx_wallcycle* wcycle); /*! \brief Release all data in parallel fft setup diff --git a/src/gromacs/imd/imd.cpp b/src/gromacs/imd/imd.cpp index cf475be7cc..33285ca106 100644 --- a/src/gromacs/imd/imd.cpp +++ b/src/gromacs/imd/imd.cpp @@ -1525,7 +1525,7 @@ bool ImdSession::Impl::run(int64_t step, bool bNS, const matrix box, gmx::ArrayR return false; } - wallcycle_start(wcycle, ewcIMD); + wallcycle_start(wcycle, WallCycleCounter::Imd); /* read command from client and check if new incoming connection */ if (MASTER(cr)) @@ -1576,7 +1576,7 @@ bool ImdSession::Impl::run(int64_t step, bool bNS, const matrix box, gmx::ArrayR } } - wallcycle_stop(wcycle, ewcIMD); + wallcycle_stop(wcycle, WallCycleCounter::Imd); return imdstep; } @@ -1641,7 +1641,7 @@ void ImdSession::updateEnergyRecordAndSendPositionsAndEnergies(bool bIMDstep, in return; } - wallcycle_start(impl_->wcycle, ewcIMD); + wallcycle_start(impl_->wcycle, WallCycleCounter::Imd); /* Update time step for IMD and prepare IMD energy record if we have new energies. */ fillEnergyRecord(step, bHaveNewEnergies); @@ -1652,7 +1652,7 @@ void ImdSession::updateEnergyRecordAndSendPositionsAndEnergies(bool bIMDstep, in sendPositionsAndEnergies(); } - wallcycle_stop(impl_->wcycle, ewcIMD); + wallcycle_stop(impl_->wcycle, WallCycleCounter::Imd); } void ImdSession::applyForces(gmx::ArrayRef force) @@ -1662,7 +1662,7 @@ void ImdSession::applyForces(gmx::ArrayRef force) return; } - wallcycle_start(impl_->wcycle, ewcIMD); + wallcycle_start(impl_->wcycle, WallCycleCounter::Imd); for (int i = 0; i < impl_->nforces; i++) { @@ -1680,7 +1680,7 @@ void ImdSession::applyForces(gmx::ArrayRef force) rvec_inc(force[j], impl_->f[i]); } - wallcycle_stop(impl_->wcycle, ewcIMD); + wallcycle_stop(impl_->wcycle, WallCycleCounter::Imd); } ImdSession::ImdSession(const MDLogger& mdlog) : impl_(new Impl(mdlog)) {} diff --git a/src/gromacs/listed_forces/gpubonded_impl.cu b/src/gromacs/listed_forces/gpubonded_impl.cu index 4a8bbe41ce..c5fbd00e46 100644 --- a/src/gromacs/listed_forces/gpubonded_impl.cu +++ b/src/gromacs/listed_forces/gpubonded_impl.cu @@ -311,11 +311,11 @@ void GpuBonded::Impl::launchEnergyTransfer() GMX_ASSERT(haveInteractions_, "No GPU bonded interactions, so no energies will be computed, so transfer should " "not be called"); - wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_BONDED); + wallcycle_sub_start_nocount(wcycle_, WallCycleSubCounter::LaunchGpuBonded); // TODO add conditional on whether there has been any compute (and make sure host buffer doesn't contain garbage) float* h_vTot = vTot_.data(); copyFromDeviceBuffer(h_vTot, &d_vTot_, 0, F_NRE, deviceStream_, GpuApiCallBehavior::Async, nullptr); - wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED); + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuBonded); } void GpuBonded::Impl::waitAccumulateEnergyTerms(gmx_enerdata_t* enerd) @@ -324,10 +324,10 @@ void GpuBonded::Impl::waitAccumulateEnergyTerms(gmx_enerdata_t* enerd) "No GPU bonded interactions, so no energies will be computed or transferred, so " "accumulation should not occur"); - wallcycle_start(wcycle_, ewcWAIT_GPU_BONDED); + wallcycle_start(wcycle_, WallCycleCounter::WaitGpuBonded); cudaError_t stat = cudaStreamSynchronize(deviceStream_.stream()); CU_RET_ERR(stat, "D2H transfer of bonded energies failed"); - wallcycle_stop(wcycle_, ewcWAIT_GPU_BONDED); + wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuBonded); for (int fType : fTypesOnGpu) { @@ -346,11 +346,11 @@ void GpuBonded::Impl::waitAccumulateEnergyTerms(gmx_enerdata_t* enerd) void GpuBonded::Impl::clearEnergies() { - wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); - wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_BONDED); + wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); + wallcycle_sub_start_nocount(wcycle_, WallCycleSubCounter::LaunchGpuBonded); clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream_); - wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED); - wallcycle_stop(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuBonded); + wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); } // ---- GpuBonded diff --git a/src/gromacs/listed_forces/gpubondedkernels.cu b/src/gromacs/listed_forces/gpubondedkernels.cu index 57fbbc1be4..1537c58d58 100644 --- a/src/gromacs/listed_forces/gpubondedkernels.cu +++ b/src/gromacs/listed_forces/gpubondedkernels.cu @@ -914,8 +914,8 @@ void GpuBonded::Impl::launchKernel() GMX_ASSERT(haveInteractions_, "Cannot launch bonded GPU kernels unless bonded GPU work was scheduled"); - wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); - wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_BONDED); + wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); + wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuBonded); int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1]; @@ -935,8 +935,8 @@ void GpuBonded::Impl::launchKernel() "exec_kernel_gpu", kernelArgs); - wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED); - wallcycle_stop(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuBonded); + wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); } void GpuBonded::launchKernel(const gmx::StepWorkload& stepWork) diff --git a/src/gromacs/listed_forces/listed_forces.cpp b/src/gromacs/listed_forces/listed_forces.cpp index b0d83be843..a35e58911c 100644 --- a/src/gromacs/listed_forces/listed_forces.cpp +++ b/src/gromacs/listed_forces/listed_forces.cpp @@ -657,7 +657,7 @@ void calc_listed(struct gmx_wallcycle* wcycle, { gmx::ForceWithShiftForces& forceWithShiftForces = forceOutputs->forceWithShiftForces(); - wallcycle_sub_start(wcycle, ewcsLISTED); + wallcycle_sub_start(wcycle, WallCycleSubCounter::Listed); /* The dummy array is to have a place to store the dhdl at other values of lambda, which will be thrown away in the end */ gmx::EnumerationArray dvdl = { 0 }; @@ -675,9 +675,9 @@ void calc_listed(struct gmx_wallcycle* wcycle, fcd, stepWork, global_atom_index); - wallcycle_sub_stop(wcycle, ewcsLISTED); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::Listed); - wallcycle_sub_start(wcycle, ewcsLISTED_BUF_OPS); + wallcycle_sub_start(wcycle, WallCycleSubCounter::ListedBufOps); reduce_thread_output(&forceWithShiftForces, enerd->term.data(), &enerd->grpp, dvdl, bt, stepWork); if (stepWork.computeDhdl) @@ -687,7 +687,7 @@ void calc_listed(struct gmx_wallcycle* wcycle, enerd->dvdl_nonlin[i] += dvdl[i]; } } - wallcycle_sub_stop(wcycle, ewcsLISTED_BUF_OPS); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::ListedBufOps); } /* Copy the sum of violations for the distance restraints from fcd */ @@ -829,7 +829,7 @@ void ListedForces::calculate(struct gmx_wallcycle* wcycle, awkward to account to this subtimer properly in the present code. We don't test / care much about performance with restraints, anyway. */ - wallcycle_sub_start(wcycle, ewcsRESTRAINTS); + wallcycle_sub_start(wcycle, WallCycleSubCounter::Restraints); if (!idef.il[F_POSRES].empty()) { @@ -868,7 +868,7 @@ void ListedForces::calculate(struct gmx_wallcycle* wcycle, hist); } - wallcycle_sub_stop(wcycle, ewcsRESTRAINTS); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::Restraints); } calc_listed(wcycle, idef, threading_.get(), x, forceOutputs, fr, pbc, enerd, nrnb, lambda, md, fcdata, global_atom_index, stepWork); @@ -885,7 +885,7 @@ void ListedForces::calculate(struct gmx_wallcycle* wcycle, } if (idef.ilsort != ilsortNO_FE) { - wallcycle_sub_start(wcycle, ewcsLISTED_FEP); + wallcycle_sub_start(wcycle, WallCycleSubCounter::ListedFep); if (idef.ilsort != ilsortFE_SORTED) { gmx_incons("The bonded interactions are not sorted for free energy"); @@ -919,7 +919,7 @@ void ListedForces::calculate(struct gmx_wallcycle* wcycle, std::fill(std::begin(dvdl), std::end(dvdl), 0.0); enerd->foreignLambdaTerms.accumulate(i, enerd->foreign_term[F_EPOT], dvdlSum); } - wallcycle_sub_stop(wcycle, ewcsLISTED_FEP); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::ListedFep); } } } diff --git a/src/gromacs/listed_forces/position_restraints.cpp b/src/gromacs/listed_forces/position_restraints.cpp index aa881af20d..165b893208 100644 --- a/src/gromacs/listed_forces/position_restraints.cpp +++ b/src/gromacs/listed_forces/position_restraints.cpp @@ -463,7 +463,7 @@ void posres_wrapper_lambda(struct gmx_wallcycle* wcycle, gmx::ArrayRef lambda, const t_forcerec* fr) { - wallcycle_sub_start_nocount(wcycle, ewcsRESTRAINTS); + wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::Restraints); auto& foreignTerms = enerd->foreignLambdaTerms; for (int i = 0; i < 1 + foreignTerms.numLambdas(); i++) @@ -487,7 +487,7 @@ void posres_wrapper_lambda(struct gmx_wallcycle* wcycle, fr->posres_comB); foreignTerms.accumulate(i, v, dvdl); } - wallcycle_sub_stop(wcycle, ewcsRESTRAINTS); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::Restraints); } /*! \brief Helper function that wraps calls to fbposres for diff --git a/src/gromacs/mdlib/constr.cpp b/src/gromacs/mdlib/constr.cpp index 398841179c..dca3379ce8 100644 --- a/src/gromacs/mdlib/constr.cpp +++ b/src/gromacs/mdlib/constr.cpp @@ -414,7 +414,7 @@ bool Constraints::Impl::apply(bool bLog, char buf[22]; int nth; - wallcycle_start(wcycle, ewcCONSTR); + wallcycle_start(wcycle, WallCycleCounter::Constr); if (econq == ConstraintVariable::ForceDispl && !EI_ENERGY_MINIMIZATION(ir.eI)) { @@ -781,7 +781,7 @@ bool Constraints::Impl::apply(bool bLog, do_edsam(&ir, step, cr, xprime.unpaddedArrayRef(), v.unpaddedArrayRef(), box, ed); } } - wallcycle_stop(wcycle, ewcCONSTR); + wallcycle_stop(wcycle, WallCycleCounter::Constr); const bool haveVelocities = (!v.empty() || econq == ConstraintVariable::Velocities); if (haveVelocities && !cFREEZE_.empty()) diff --git a/src/gromacs/mdlib/force.cpp b/src/gromacs/mdlib/force.cpp index 081d603dc1..ad08c862fb 100644 --- a/src/gromacs/mdlib/force.cpp +++ b/src/gromacs/mdlib/force.cpp @@ -104,7 +104,7 @@ void calculateLongRangeNonbondeds(t_forcerec* fr, const t_inputrec& ir, const t_commrec* cr, t_nrnb* nrnb, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, const t_mdatoms* md, gmx::ArrayRef coordinates, gmx::ForceWithVirial* forceWithVirial, @@ -141,7 +141,7 @@ void calculateLongRangeNonbondeds(t_forcerec* fr, /* Calculate the Ewald surface force and energy contributions, when necessary */ if (haveEwaldSurfaceTerm) { - wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION); + wallcycle_sub_start(wcycle, WallCycleSubCounter::EwaldCorrection); int nthreads = fr->nthread_ewc; #pragma omp parallel for num_threads(nthreads) schedule(static) @@ -184,7 +184,7 @@ void calculateLongRangeNonbondeds(t_forcerec* fr, { reduceEwaldThreadOuput(nthreads, fr->ewc_t); } - wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::EwaldCorrection); } if (EEL_PME_EWALD(fr->ic->eeltype) && fr->n_tpi == 0) @@ -212,7 +212,7 @@ void calculateLongRangeNonbondeds(t_forcerec* fr, */ ddBalanceRegionHandler.closeAfterForceComputationCpu(); - wallcycle_start(wcycle, ewcPMEMESH); + wallcycle_start(wcycle, WallCycleCounter::PmeMesh); status = gmx_pme_do( fr->pmedata, gmx::constArrayRefFromArray(coordinates.data(), md->homenr - fr->n_tpi), @@ -238,7 +238,7 @@ void calculateLongRangeNonbondeds(t_forcerec* fr, &ewaldOutput.dvdl[FreeEnergyPerturbationCouplingType::Coul], &ewaldOutput.dvdl[FreeEnergyPerturbationCouplingType::Vdw], stepWork); - wallcycle_stop(wcycle, ewcPMEMESH); + wallcycle_stop(wcycle, WallCycleCounter::PmeMesh); if (status != 0) { gmx_fatal(FARGS, "Error %d in reciprocal PME routine", status); diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cu b/src/gromacs/mdlib/gpuforcereduction_impl.cu index 80aad0ed38..471cc1d5c7 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl.cu +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cu @@ -130,7 +130,7 @@ void GpuForceReduction::Impl::reinit(DeviceBuffer baseForcePtr, completionMarker_ = completionMarker; cellInfo_.cell = cell.data(); - wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); + wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); reallocateDeviceBuffer( &cellInfo_.d_cell, numAtoms_, &cellInfo_.cellSize, &cellInfo_.cellSizeAlloc, deviceContext_); copyToDeviceBuffer(&cellInfo_.d_cell, @@ -140,7 +140,7 @@ void GpuForceReduction::Impl::reinit(DeviceBuffer baseForcePtr, deviceStream_, GpuApiCallBehavior::Async, nullptr); - wallcycle_stop(wcycle_, ewcLAUNCH_GPU); + wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); dependencyList_.clear(); }; @@ -164,8 +164,8 @@ void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* const dependen void GpuForceReduction::Impl::execute() { - wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); - wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS); + wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); + wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuNBFBufOps); if (numAtoms_ == 0) { @@ -209,8 +209,8 @@ void GpuForceReduction::Impl::execute() completionMarker_->markEvent(deviceStream_); } - wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS); - wallcycle_stop(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuNBFBufOps); + wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); } GpuForceReduction::Impl::~Impl(){}; diff --git a/src/gromacs/mdlib/md_support.cpp b/src/gromacs/mdlib/md_support.cpp index 24ae64beec..b991293a56 100644 --- a/src/gromacs/mdlib/md_support.cpp +++ b/src/gromacs/mdlib/md_support.cpp @@ -292,7 +292,7 @@ void compute_globals(gmx_global_stat* gstat, const t_mdatoms* mdatoms, t_nrnb* nrnb, t_vcm* vcm, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, gmx_enerdata_t* enerd, tensor force_vir, tensor shake_vir, @@ -359,7 +359,7 @@ void compute_globals(gmx_global_stat* gstat, gmx::ArrayRef signalBuffer = signalCoordinator->getCommunicationBuffer(); if (PAR(cr)) { - wallcycle_start(wcycle, ewcMoveE); + wallcycle_start(wcycle, WallCycleCounter::MoveE); global_stat(*gstat, cr, enerd, @@ -372,7 +372,7 @@ void compute_globals(gmx_global_stat* gstat, signalBuffer, *bSumEkinhOld, flags); - wallcycle_stop(wcycle, ewcMoveE); + wallcycle_stop(wcycle, WallCycleCounter::MoveE); } signalCoordinator->finalizeSignals(); *bSumEkinhOld = FALSE; diff --git a/src/gromacs/mdlib/md_support.h b/src/gromacs/mdlib/md_support.h index 40e1437c13..da949e10d1 100644 --- a/src/gromacs/mdlib/md_support.h +++ b/src/gromacs/mdlib/md_support.h @@ -123,7 +123,7 @@ void compute_globals(gmx_global_stat* gstat, const t_mdatoms* mdatoms, t_nrnb* nrnb, t_vcm* vcm, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, gmx_enerdata_t* enerd, tensor force_vir, tensor shake_vir, diff --git a/src/gromacs/mdlib/mdoutf.cpp b/src/gromacs/mdlib/mdoutf.cpp index 8f8fd006a4..9769bac325 100644 --- a/src/gromacs/mdlib/mdoutf.cpp +++ b/src/gromacs/mdlib/mdoutf.cpp @@ -91,7 +91,7 @@ struct gmx_mdoutf int natoms_global; int natoms_x_compressed; const SimulationGroups* groups; /* for compressed position writing */ - gmx_wallcycle_t wcycle; + gmx_wallcycle* wcycle; rvec* f_global; gmx::IMDOutputProvider* outputProvider; const gmx::MDModulesNotifiers* mdModulesNotifiers; @@ -110,7 +110,7 @@ gmx_mdoutf_t init_mdoutf(FILE* fplog, const t_inputrec* ir, const gmx_mtop_t& top_global, const gmx_output_env_t* oenv, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, const gmx::StartingBehavior startingBehavior, bool simulationsShareState, const gmx_multisim_t* ms) @@ -261,7 +261,7 @@ FILE* mdoutf_get_fp_dhdl(gmx_mdoutf_t of) return of->fp_dhdl; } -gmx_wallcycle_t mdoutf_get_wcycle(gmx_mdoutf_t of) +gmx_wallcycle* mdoutf_get_wcycle(gmx_mdoutf_t of) { return of->wcycle; } @@ -753,10 +753,10 @@ void mdoutf_tng_close(gmx_mdoutf_t of) { if (of->tng || of->tng_low_prec) { - wallcycle_start(of->wcycle, ewcTRAJ); + wallcycle_start(of->wcycle, WallCycleCounter::Traj); gmx_tng_close(&of->tng); gmx_tng_close(&of->tng_low_prec); - wallcycle_stop(of->wcycle, ewcTRAJ); + wallcycle_stop(of->wcycle, WallCycleCounter::Traj); } } diff --git a/src/gromacs/mdlib/mdoutf.h b/src/gromacs/mdlib/mdoutf.h index 6461f9c747..308f08cecc 100644 --- a/src/gromacs/mdlib/mdoutf.h +++ b/src/gromacs/mdlib/mdoutf.h @@ -79,7 +79,7 @@ gmx_mdoutf_t init_mdoutf(FILE* fplog, const t_inputrec* ir, const gmx_mtop_t& mtop, const gmx_output_env_t* oenv, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, gmx::StartingBehavior startingBehavior, bool simulationsShareState, const gmx_multisim_t* ms); @@ -91,7 +91,7 @@ ener_file_t mdoutf_get_fp_ene(gmx_mdoutf_t of); FILE* mdoutf_get_fp_dhdl(gmx_mdoutf_t of); /*! \brief Getter for wallcycle timer */ -gmx_wallcycle_t mdoutf_get_wcycle(gmx_mdoutf_t of); +gmx_wallcycle* mdoutf_get_wcycle(gmx_mdoutf_t of); /*! \brief Close TNG files if they are open. * diff --git a/src/gromacs/mdlib/resethandler.cpp b/src/gromacs/mdlib/resethandler.cpp index 7e807061ac..a471a6c828 100644 --- a/src/gromacs/mdlib/resethandler.cpp +++ b/src/gromacs/mdlib/resethandler.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -80,7 +80,7 @@ ResetHandler::ResetHandler(compat::not_null signal, bool resetHalfway, real maximumHoursToRun, const MDLogger& mdlog, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, gmx_walltime_accounting_t walltime_accounting) : signal_(*signal), rankCanSetSignal_(false), @@ -144,7 +144,7 @@ bool ResetHandler::resetCountersImpl(int64_t step, t_nrnb* nrnb, const gmx_pme_t* pme, const pme_load_balancing_t* pme_loadbal, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, gmx_walltime_accounting_t walltime_accounting) { /* Reset either if signal has been passed, or if reset step has been reached */ @@ -194,14 +194,14 @@ bool ResetHandler::resetCountersImpl(int64_t step, resetGpuProfiler(); } - wallcycle_stop(wcycle, ewcRUN); + wallcycle_stop(wcycle, WallCycleCounter::Run); wallcycle_reset_all(wcycle); if (DOMAINDECOMP(cr)) { reset_dd_statistics_counters(cr->dd); } clear_nrnb(nrnb); - wallcycle_start(wcycle, ewcRUN); + wallcycle_start(wcycle, WallCycleCounter::Run); walltime_accounting_reset_time(walltime_accounting, step); print_date_and_time(fplog, cr->nodeid, "Restarted time", gmx_gettime()); diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 3da4bc541c..105245ebfd 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -193,7 +193,7 @@ static void pull_potential_wrapper(const t_commrec* cr, pull_t* pull_work, const real* lambda, double t, - gmx_wallcycle_t wcycle) + gmx_wallcycle* wcycle) { t_pbc pbc; real dvdl; @@ -201,7 +201,7 @@ static void pull_potential_wrapper(const t_commrec* cr, /* Calculate the center of mass forces, this requires communication, * which is why pull_potential is called close to other communication. */ - wallcycle_start(wcycle, ewcPULLPOT); + wallcycle_start(wcycle, WallCycleCounter::PullPot); set_pbc(&pbc, ir.pbcType, box); dvdl = 0; enerd->term[F_COM_PULL] += @@ -215,7 +215,7 @@ static void pull_potential_wrapper(const t_commrec* cr, force, &dvdl); enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Restraint] += dvdl; - wallcycle_stop(wcycle, ewcPULLPOT); + wallcycle_stop(wcycle, WallCycleCounter::PullPot); } static void pme_receive_force_ener(t_forcerec* fr, @@ -224,18 +224,18 @@ static void pme_receive_force_ener(t_forcerec* fr, gmx_enerdata_t* enerd, bool useGpuPmePpComms, bool receivePmeForceToGpu, - gmx_wallcycle_t wcycle) + gmx_wallcycle* wcycle) { real e_q, e_lj, dvdl_q, dvdl_lj; float cycles_ppdpme, cycles_seppme; - cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME); + cycles_ppdpme = wallcycle_stop(wcycle, WallCycleCounter::PpDuringPme); dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME); /* In case of node-splitting, the PP nodes receive the long-range * forces, virial and energy from the PME nodes here. */ - wallcycle_start(wcycle, ewcPP_PMEWAITRECVF); + wallcycle_start(wcycle, WallCycleCounter::PpPmeWaitRecvF); dvdl_q = 0; dvdl_lj = 0; gmx_pme_receive_f(fr->pmePpCommGpu.get(), @@ -257,7 +257,7 @@ static void pme_receive_force_ener(t_forcerec* fr, { dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME); } - wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF); + wallcycle_stop(wcycle, WallCycleCounter::PpPmeWaitRecvF); } static void print_large_forces(FILE* fp, @@ -302,7 +302,7 @@ static void print_large_forces(FILE* fp, //! When necessary, spreads forces on vsites and computes the virial for \p forceOutputs->forceWithShiftForces() static void postProcessForceWithShiftForces(t_nrnb* nrnb, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, const matrix box, ArrayRef x, ForceOutputs* forceOutputs, @@ -342,7 +342,7 @@ static void postProcessForceWithShiftForces(t_nrnb* nrnb, static void postProcessForces(const t_commrec* cr, int64_t step, t_nrnb* nrnb, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, const matrix box, ArrayRef x, ForceOutputs* forceOutputs, @@ -417,7 +417,7 @@ static void do_nb_verlet(t_forcerec* fr, const int clearF, const int64_t step, t_nrnb* nrnb, - gmx_wallcycle_t wcycle) + gmx_wallcycle* wcycle) { if (!stepWork.computeNonbondedForces) { @@ -438,9 +438,9 @@ static void do_nb_verlet(t_forcerec* fr, /* Prune the pair-list beyond fr->ic->rlistPrune using * the current coordinates of the atoms. */ - wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING); + wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedPruning); nbv->dispatchPruneKernelCpu(ilocality, fr->shift_vec); - wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedPruning); } } @@ -624,7 +624,7 @@ static void computeSpecialForces(FILE* fplog, pull_t* pull_work, int64_t step, double t, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, gmx::ForceProviders* forceProviders, const matrix box, gmx::ArrayRef x, @@ -697,10 +697,10 @@ static void computeSpecialForces(FILE* fplog, /* Add the forces from enforced rotation potentials (if any) */ if (inputrec.bRot) { - wallcycle_start(wcycle, ewcROTadd); + wallcycle_start(wcycle, WallCycleCounter::RotAdd); enerd->term[F_COM_PULL] += add_rot_forces(enforcedRotation, forceWithVirialMtsLevel0->force_, cr, step, t); - wallcycle_stop(wcycle, ewcROTadd); + wallcycle_stop(wcycle, WallCycleCounter::RotAdd); } if (ed) @@ -734,7 +734,7 @@ static inline void launchPmeGpuSpread(gmx_pme_t* pmedata, const StepWorkload& stepWork, GpuEventSynchronizer* xReadyOnDevice, const real lambdaQ, - gmx_wallcycle_t wcycle) + gmx_wallcycle* wcycle) { pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork); pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle, lambdaQ); @@ -751,7 +751,7 @@ static inline void launchPmeGpuSpread(gmx_pme_t* pmedata, */ static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata, const real lambdaQ, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, const gmx::StepWorkload& stepWork) { pme_gpu_launch_complex_transforms(pmedata, wcycle, stepWork); @@ -783,7 +783,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv, gmx_enerdata_t* enerd, const real lambdaQ, const StepWorkload& stepWork, - gmx_wallcycle_t wcycle) + gmx_wallcycle* wcycle) { bool isPmeGpuDone = false; bool isNbGpuDone = false; @@ -839,9 +839,9 @@ static ForceOutputs setupForceOutputs(ForceHelperBuffers* forceH const DomainLifetimeWorkload& domainWork, const StepWorkload& stepWork, const bool havePpDomainDecomposition, - gmx_wallcycle_t wcycle) + gmx_wallcycle* wcycle) { - wallcycle_sub_start(wcycle, ewcsCLEAR_FORCE_BUFFER); + wallcycle_sub_start(wcycle, WallCycleSubCounter::ClearForceBuffer); /* NOTE: We assume fr->shiftForces is all zeros here */ gmx::ForceWithShiftForces forceWithShiftForces( @@ -882,7 +882,7 @@ static ForceOutputs setupForceOutputs(ForceHelperBuffers* forceH clearRVecs(forceWithVirial.force_, true); } - wallcycle_sub_stop(wcycle, ewcsCLEAR_FORCE_BUFFER); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::ClearForceBuffer); return ForceOutputs( forceWithShiftForces, forceHelperBuffers->haveDirectVirialContributions(), forceWithVirial); @@ -992,7 +992,7 @@ static void launchGpuEndOfStepTasks(nonbonded_verlet_t* nbv, const gmx::MdrunScheduleWorkload& runScheduleWork, bool useGpuPmeOnThisRank, int64_t step, - gmx_wallcycle_t wcycle) + gmx_wallcycle* wcycle) { if (runScheduleWork.simulationWork.useGpuNonbonded && runScheduleWork.stepWork.computeNonbondedForces) { @@ -1006,11 +1006,11 @@ static void launchGpuEndOfStepTasks(nonbonded_verlet_t* nbv, } /* now clear the GPU outputs while we finish the step on the CPU */ - wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); - wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED); + wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu); + wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded); Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, runScheduleWork.stepWork.computeVirial); - wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); - wallcycle_stop(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded); + wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu); } if (useGpuPmeOnThisRank) @@ -1195,7 +1195,7 @@ void do_force(FILE* fplog, pull_t* pull_work, int64_t step, t_nrnb* nrnb, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, const gmx_localtop_t* top, const matrix box, gmx::ArrayRefWithPadding x, @@ -1374,12 +1374,12 @@ void do_force(FILE* fplog, fr->wholeMoleculeTransform->updateForAtomPbcJumps(x.unpaddedArrayRef(), box); } - wallcycle_start(wcycle, ewcNS); + wallcycle_start(wcycle, WallCycleCounter::NS); if (!DOMAINDECOMP(cr)) { const rvec vzero = { 0.0_real, 0.0_real, 0.0_real }; const rvec boxDiagonal = { box[XX][XX], box[YY][YY], box[ZZ][ZZ] }; - wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL); + wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSGridLocal); nbnxn_put_on_grid(nbv, box, 0, @@ -1392,30 +1392,30 @@ void do_force(FILE* fplog, x.unpaddedArrayRef(), 0, nullptr); - wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSGridLocal); } else { - wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL); + wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSGridNonLocal); nbnxn_put_on_grid_nonlocal(nbv, domdec_zones(cr->dd), fr->cginfo, x.unpaddedArrayRef()); - wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSGridNonLocal); } nbv->setAtomProperties(gmx::constArrayRefFromArray(mdatoms->typeA, mdatoms->nr), gmx::constArrayRefFromArray(mdatoms->chargeA, mdatoms->nr), fr->cginfo); - wallcycle_stop(wcycle, ewcNS); + wallcycle_stop(wcycle, WallCycleCounter::NS); /* initialize the GPU nbnxm atom data and bonded data structures */ if (simulationWork.useGpuNonbonded) { // Note: cycle counting only nononbondeds, gpuBonded counts internally - wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); - wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED); + wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu); + wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded); Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat.get()); - wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); - wallcycle_stop(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded); + wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu); if (fr->gpuBonded) { @@ -1440,15 +1440,15 @@ void do_force(FILE* fplog, runScheduleWork->domainWork = setupDomainLifetimeWorkload( inputrec, *fr, pull_work, ed, *mdatoms, simulationWork, stepWork); - wallcycle_start_nocount(wcycle, ewcNS); - wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL); + wallcycle_start_nocount(wcycle, WallCycleCounter::NS); + wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchLocal); /* Note that with a GPU the launch overhead of the list transfer is not timed separately */ nbv->constructPairlist(InteractionLocality::Local, top->excls, step, nrnb); nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::Local); - wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL); - wallcycle_stop(wcycle, ewcNS); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchLocal); + wallcycle_stop(wcycle, WallCycleCounter::NS); if (stepWork.useGpuXBufferOps) { @@ -1484,15 +1484,15 @@ void do_force(FILE* fplog, { ddBalanceRegionHandler.openBeforeForceComputationGpu(); - wallcycle_start(wcycle, ewcLAUNCH_GPU); - wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED); + wallcycle_start(wcycle, WallCycleCounter::LaunchGpu); + wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded); Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get()); if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps) { Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::Local); } - wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); - wallcycle_stop(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded); + wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu); // with X buffer ops offloaded to the GPU on all but the search steps // bonded work not split into separate local and non-local, so with DD @@ -1503,11 +1503,11 @@ void do_force(FILE* fplog, } /* launch local nonbonded work on GPU */ - wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); - wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED); + wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu); + wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded); do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local, enbvClearFNo, step, nrnb, wcycle); - wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); - wallcycle_stop(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded); + wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu); } if (useGpuPmeOnThisRank) @@ -1529,14 +1529,14 @@ void do_force(FILE* fplog, if (stepWork.doNeighborSearch) { // TODO: fuse this branch with the above large stepWork.doNeighborSearch block - wallcycle_start_nocount(wcycle, ewcNS); - wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL); + wallcycle_start_nocount(wcycle, WallCycleCounter::NS); + wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchNonLocal); /* Note that with a GPU the launch overhead of the list transfer is not timed separately */ nbv->constructPairlist(InteractionLocality::NonLocal, top->excls, step, nrnb); nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::NonLocal); - wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL); - wallcycle_stop(wcycle, ewcNS); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchNonLocal); + wallcycle_stop(wcycle, WallCycleCounter::NS); // TODO refactor this GPU halo exchange re-initialisation // to location in do_md where GPU halo exchange is // constructed at partitioning, after above stateGpu @@ -1593,11 +1593,11 @@ void do_force(FILE* fplog, if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps) { - wallcycle_start(wcycle, ewcLAUNCH_GPU); - wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED); + wallcycle_start(wcycle, WallCycleCounter::LaunchGpu); + wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded); Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::NonLocal); - wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); - wallcycle_stop(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded); + wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu); } if (domainWork.haveGpuBondedWork) @@ -1606,32 +1606,32 @@ void do_force(FILE* fplog, } /* launch non-local nonbonded tasks on GPU */ - wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); - wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED); + wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu); + wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded); do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle); - wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); - wallcycle_stop(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded); + wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu); } } if (simulationWork.useGpuNonbonded && stepWork.computeNonbondedForces) { /* launch D2H copy-back F */ - wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); - wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED); + wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu); + wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded); if (havePPDomainDecomposition(cr)) { Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::NonLocal); } Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::Local); - wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded); if (domainWork.haveGpuBondedWork && stepWork.computeEnergy) { fr->gpuBonded->launchEnergyTransfer(); } - wallcycle_stop(wcycle, ewcLAUNCH_GPU); + wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu); } gmx::ArrayRef xWholeMolecules; @@ -1676,7 +1676,7 @@ void do_force(FILE* fplog, if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME)) { - wallcycle_start(wcycle, ewcPPDURINGPME); + wallcycle_start(wcycle, WallCycleCounter::PpDuringPme); dd_force_flop_start(cr->dd, nrnb); } @@ -1691,15 +1691,15 @@ void do_force(FILE* fplog, if (inputrec.bRot) { - wallcycle_start(wcycle, ewcROT); + wallcycle_start(wcycle, WallCycleCounter::Rot); do_rotation(cr, enforcedRotation, box, x.unpaddedConstArrayRef(), t, step, stepWork.doNeighborSearch); - wallcycle_stop(wcycle, ewcROT); + wallcycle_stop(wcycle, WallCycleCounter::Rot); } /* Start the force cycle counter. * Note that a different counter is used for dynamic load balancing. */ - wallcycle_start(wcycle, ewcFORCE); + wallcycle_start(wcycle, WallCycleCounter::Force); /* Set up and clear force outputs: * forceOutMtsLevel0: everything except what is in the other two outputs @@ -1799,10 +1799,10 @@ void do_force(FILE* fplog, * This can be split into a local and a non-local part when overlapping * communication with calculation with domain decomposition. */ - wallcycle_stop(wcycle, ewcFORCE); + wallcycle_stop(wcycle, WallCycleCounter::Force); nbv->atomdata_add_nbat_f_to_f(AtomLocality::All, forceOutNonbonded->forceWithShiftForces().force()); - wallcycle_start_nocount(wcycle, ewcFORCE); + wallcycle_start_nocount(wcycle, WallCycleCounter::Force); } /* If there are multiple fshift output buffers we need to reduce them */ @@ -1818,10 +1818,10 @@ void do_force(FILE* fplog, // TODO Force flags should include haveFreeEnergyWork for this domain if (stepWork.useGpuXHalo && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)) { - wallcycle_stop(wcycle, ewcFORCE); + wallcycle_stop(wcycle, WallCycleCounter::Force); /* Wait for non-local coordinate data to be copied from device */ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal); - wallcycle_start_nocount(wcycle, ewcFORCE); + wallcycle_start_nocount(wcycle, WallCycleCounter::Force); } // Compute wall interactions, when present. @@ -1906,7 +1906,7 @@ void do_force(FILE* fplog, ddBalanceRegionHandler); } - wallcycle_stop(wcycle, ewcFORCE); + wallcycle_stop(wcycle, WallCycleCounter::Force); // VdW dispersion correction, only computed on master rank to avoid double counting if ((stepWork.computeEnergy || stepWork.computeVirial) && fr->dispersionCorrection && MASTER(cr)) @@ -1983,10 +1983,10 @@ void do_force(FILE* fplog, } else { - wallcycle_start_nocount(wcycle, ewcFORCE); + wallcycle_start_nocount(wcycle, WallCycleCounter::Force); do_nb_verlet( fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFYes, step, nrnb, wcycle); - wallcycle_stop(wcycle, ewcFORCE); + wallcycle_stop(wcycle, WallCycleCounter::Force); } if (stepWork.useGpuFBufferOps) @@ -2152,7 +2152,7 @@ void do_force(FILE* fplog, { // NOTE: emulation kernel is not included in the balancing region, // but emulation mode does not target performance anyway - wallcycle_start_nocount(wcycle, ewcFORCE); + wallcycle_start_nocount(wcycle, WallCycleCounter::Force); do_nb_verlet(fr, ic, enerd, @@ -2162,7 +2162,7 @@ void do_force(FILE* fplog, step, nrnb, wcycle); - wallcycle_stop(wcycle, ewcFORCE); + wallcycle_stop(wcycle, WallCycleCounter::Force); } // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops diff --git a/src/gromacs/mdlib/trajectory_writing.cpp b/src/gromacs/mdlib/trajectory_writing.cpp index a7787ea97d..53f1e7ff6d 100644 --- a/src/gromacs/mdlib/trajectory_writing.cpp +++ b/src/gromacs/mdlib/trajectory_writing.cpp @@ -121,7 +121,7 @@ void do_md_trajectory_writing(FILE* fplog, if (mdof_flags != 0) { - wallcycle_start(mdoutf_get_wcycle(outf), ewcTRAJ); + wallcycle_start(mdoutf_get_wcycle(outf), WallCycleCounter::Traj); if (bCPT) { if (MASTER(cr)) @@ -191,7 +191,7 @@ void do_md_trajectory_writing(FILE* fplog, sfree(x_for_confout); } } - wallcycle_stop(mdoutf_get_wcycle(outf), ewcTRAJ); + wallcycle_stop(mdoutf_get_wcycle(outf), WallCycleCounter::Traj); } #if GMX_FAHCORE if (MASTER(cr)) diff --git a/src/gromacs/mdlib/update.cpp b/src/gromacs/mdlib/update.cpp index 14f03826ed..a2876a3123 100644 --- a/src/gromacs/mdlib/update.cpp +++ b/src/gromacs/mdlib/update.cpp @@ -139,7 +139,7 @@ public: void finish_update(const t_inputrec& inputRecord, const t_mdatoms* md, t_state* state, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, bool haveConstraints); void update_sd_second_half(const t_inputrec& inputRecord, @@ -151,7 +151,7 @@ public: t_state* state, const t_commrec* cr, t_nrnb* nrnb, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, gmx::Constraints* constr, bool do_log, bool do_ene); @@ -246,7 +246,7 @@ void Update::update_coords(const t_inputrec& inpu void Update::finish_update(const t_inputrec& inputRecord, const t_mdatoms* md, t_state* state, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, const bool haveConstraints) { return impl_->finish_update(inputRecord, md, state, wcycle, haveConstraints); @@ -259,7 +259,7 @@ void Update::update_sd_second_half(const t_inputrec& inputRecord, t_state* state, const t_commrec* cr, t_nrnb* nrnb, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, gmx::Constraints* constr, bool do_log, bool do_ene) @@ -1394,7 +1394,7 @@ void Update::Impl::update_sd_second_half(const t_inputrec& input t_state* state, const t_commrec* cr, t_nrnb* nrnb, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, gmx::Constraints* constr, bool do_log, bool do_ene) @@ -1415,7 +1415,7 @@ void Update::Impl::update_sd_second_half(const t_inputrec& input */ real dt = inputRecord.delta_t; - wallcycle_start(wcycle, ewcUPDATE); + wallcycle_start(wcycle, WallCycleCounter::Update); int nth = gmx_omp_nthreads_get(emntUpdate); @@ -1448,7 +1448,7 @@ void Update::Impl::update_sd_second_half(const t_inputrec& input GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR } inc_nrnb(nrnb, eNR_UPDATE, homenr); - wallcycle_stop(wcycle, ewcUPDATE); + wallcycle_stop(wcycle, WallCycleCounter::Update); /* Constrain the coordinates upd->xp for half a time step */ bool computeVirial = false; @@ -1473,14 +1473,14 @@ void Update::Impl::update_sd_second_half(const t_inputrec& input void Update::Impl::finish_update(const t_inputrec& inputRecord, const t_mdatoms* md, t_state* state, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, const bool haveConstraints) { /* NOTE: Currently we always integrate to a temporary buffer and * then copy the results back here. */ - wallcycle_start_nocount(wcycle, ewcUPDATE); + wallcycle_start_nocount(wcycle, WallCycleCounter::Update); const int homenr = md->homenr; auto xp = makeConstArrayRef(xp_).subArray(0, homenr); @@ -1521,7 +1521,7 @@ void Update::Impl::finish_update(const t_inputrec& inputRecord, } } - wallcycle_stop(wcycle, ewcUPDATE); + wallcycle_stop(wcycle, WallCycleCounter::Update); } void Update::Impl::update_coords(const t_inputrec& inputRecord, diff --git a/src/gromacs/mdlib/update.h b/src/gromacs/mdlib/update.h index 10a93faa51..5992a69e45 100644 --- a/src/gromacs/mdlib/update.h +++ b/src/gromacs/mdlib/update.h @@ -144,7 +144,7 @@ public: void finish_update(const t_inputrec& inputRecord, const t_mdatoms* md, t_state* state, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, bool haveConstraints); /*! \brief Secong part of the SD integrator. @@ -172,7 +172,7 @@ public: t_state* state, const t_commrec* cr, t_nrnb* nrnb, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, gmx::Constraints* constr, bool do_log, bool do_ene); diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu index 5fc4fb8609..5a950d34e5 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu @@ -109,8 +109,8 @@ void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer* fRead const float dtPressureCouple, const matrix prVelocityScalingMatrix) { - wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); - wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN); + wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); + wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); // Clearing virial matrix // TODO There is no point in having separate virial matrix for constraints @@ -141,16 +141,16 @@ void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer* fRead coordinatesReady_->markEvent(deviceStream_); - wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN); - wallcycle_stop(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); + wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); return; } void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix) { - wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); - wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN); + wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); + wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); ScalingMatrix mu(scalingMatrix); @@ -166,14 +166,14 @@ void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix) // can affect the performance if nstpcouple is small. deviceStream_.synchronize(); - wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN); - wallcycle_stop(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); + wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); } void UpdateConstrainGpu::Impl::scaleVelocities(const matrix scalingMatrix) { - wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); - wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN); + wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); + wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); ScalingMatrix mu(scalingMatrix); @@ -189,8 +189,8 @@ void UpdateConstrainGpu::Impl::scaleVelocities(const matrix scalingMatrix) // can affect the performance if nstpcouple is small. deviceStream_.synchronize(); - wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN); - wallcycle_stop(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); + wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); } UpdateConstrainGpu::Impl::Impl(const t_inputrec& ir, @@ -227,8 +227,8 @@ void UpdateConstrainGpu::Impl::set(DeviceBuffer d_x, const t_mdatoms& md) { // TODO wallcycle - wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); - wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN); + wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); + wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); GMX_ASSERT(d_x != nullptr, "Coordinates device buffer should not be null."); GMX_ASSERT(d_v != nullptr, "Velocities device buffer should not be null."); @@ -253,8 +253,8 @@ void UpdateConstrainGpu::Impl::set(DeviceBuffer d_x, coordinateScalingKernelLaunchConfig_.gridSize[0] = (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock; - wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN); - wallcycle_stop(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); + wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); } void UpdateConstrainGpu::Impl::setPbc(const PbcType pbcType, const matrix box) diff --git a/src/gromacs/mdlib/update_vv.cpp b/src/gromacs/mdlib/update_vv.cpp index 76334de0c5..0db7c24791 100644 --- a/src/gromacs/mdlib/update_vv.cpp +++ b/src/gromacs/mdlib/update_vv.cpp @@ -119,7 +119,7 @@ void integrateVVFirstStep(int64_t step, /* ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */ rvec* vbuf = nullptr; - wallcycle_start(wcycle, ewcUPDATE); + wallcycle_start(wcycle, WallCycleCounter::Update); if (ir->eI == IntegrationAlgorithm::VV && bInitStep) { /* if using velocity verlet with full time step Ekin, @@ -140,9 +140,9 @@ void integrateVVFirstStep(int64_t step, upd->update_coords( *ir, step, mdatoms, state, f->view().forceWithPadding(), fcdata, ekind, M, etrtVELOCITY1, cr, constr != nullptr); - wallcycle_stop(wcycle, ewcUPDATE); + wallcycle_stop(wcycle, WallCycleCounter::Update); constrain_velocities(constr, do_log, do_ene, step, state, nullptr, bCalcVir, shake_vir); - wallcycle_start(wcycle, ewcUPDATE); + wallcycle_start(wcycle, WallCycleCounter::Update); /* if VV, compute the pressure and constraints */ /* For VV2, we strictly only need this if using pressure * control, but we really would like to have accurate pressures @@ -163,7 +163,7 @@ void integrateVVFirstStep(int64_t step, So we need information from the last step in the first half of the integration */ if (bGStat || do_per_step(step - 1, nstglobalcomm)) { - wallcycle_stop(wcycle, ewcUPDATE); + wallcycle_stop(wcycle, WallCycleCounter::Update); int cglo_flags = ((bGStat ? CGLO_GSTAT : 0) | (bCalcEner ? CGLO_ENERGY : 0) | (bTemp ? CGLO_TEMPERATURE : 0) | (bPres ? CGLO_PRESSURE : 0) @@ -212,7 +212,7 @@ void integrateVVFirstStep(int64_t step, fplog, vcm, *mdatoms, makeArrayRef(state->x), makeArrayRef(state->v)); inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr); } - wallcycle_start(wcycle, ewcUPDATE); + wallcycle_start(wcycle, WallCycleCounter::Update); } /* temperature scaling and pressure scaling to produce the extended variables at t+dt */ if (!bInitStep) @@ -241,7 +241,7 @@ void integrateVVFirstStep(int64_t step, } else if (bExchanged) { - wallcycle_stop(wcycle, ewcUPDATE); + wallcycle_stop(wcycle, WallCycleCounter::Update); /* We need the kinetic energy at minus the half step for determining * the full step kinetic energy and possibly for T-coupling.*/ /* This may not be quite working correctly yet . . . . */ @@ -267,7 +267,7 @@ void integrateVVFirstStep(int64_t step, state->box, bSumEkinhOld, CGLO_GSTAT | CGLO_TEMPERATURE); - wallcycle_start(wcycle, ewcUPDATE); + wallcycle_start(wcycle, WallCycleCounter::Update); } } /* if it's the initial step, we performed this first step just to get the constraint virial */ @@ -276,7 +276,7 @@ void integrateVVFirstStep(int64_t step, copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms); sfree(vbuf); } - wallcycle_stop(wcycle, ewcUPDATE); + wallcycle_stop(wcycle, WallCycleCounter::Update); } /* compute the conserved quantity */ @@ -355,7 +355,7 @@ void integrateVVSecondStep(int64_t step, upd->update_coords( *ir, step, mdatoms, state, f->view().forceWithPadding(), fcdata, ekind, M, etrtPOSITION, cr, constr != nullptr); - wallcycle_stop(wcycle, ewcUPDATE); + wallcycle_stop(wcycle, WallCycleCounter::Update); constrain_coordinates( constr, do_log, do_ene, step, state, upd->xp()->arrayRefWithPadding(), dvdl_constr, bCalcVir, shake_vir); @@ -390,14 +390,14 @@ void integrateVVSecondStep(int64_t step, lastbox, bSumEkinhOld, (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE); - wallcycle_start(wcycle, ewcUPDATE); + wallcycle_start(wcycle, WallCycleCounter::Update); trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, MassQ, trotter_seq, ettTSEQ4); /* now we know the scaling, we can compute the positions again */ std::copy(cbuf->begin(), cbuf->end(), state->x.begin()); upd->update_coords( *ir, step, mdatoms, state, f->view().forceWithPadding(), fcdata, ekind, M, etrtPOSITION, cr, constr != nullptr); - wallcycle_stop(wcycle, ewcUPDATE); + wallcycle_stop(wcycle, WallCycleCounter::Update); /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */ /* are the small terms in the shake_vir here due diff --git a/src/gromacs/mdlib/vsite.cpp b/src/gromacs/mdlib/vsite.cpp index 6dd3c6f104..9be178c243 100644 --- a/src/gromacs/mdlib/vsite.cpp +++ b/src/gromacs/mdlib/vsite.cpp @@ -2283,7 +2283,7 @@ void VirtualSitesHandler::Impl::spreadForces(ArrayRef x, const matrix box, gmx_wallcycle* wcycle) { - wallcycle_start(wcycle, ewcVSITESPREAD); + wallcycle_start(wcycle, WallCycleCounter::VsiteSpread); const bool useDomdec = domainInfo_.useDomdec(); @@ -2477,7 +2477,7 @@ void VirtualSitesHandler::Impl::spreadForces(ArrayRef x, inc_nrnb(nrnb, eNR_VSITE4FDN, vsite_count(ilists_, F_VSITE4FDN)); inc_nrnb(nrnb, eNR_VSITEN, vsite_count(ilists_, F_VSITEN)); - wallcycle_stop(wcycle, ewcVSITESPREAD); + wallcycle_stop(wcycle, WallCycleCounter::VsiteSpread); } /*! \brief Returns the an array with group indices for each atom diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index e398e7693e..ecb0778652 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp @@ -796,7 +796,7 @@ void gmx::LegacySimulator::do_md() } walltime_accounting_start_time(walltime_accounting); - wallcycle_start(wcycle, ewcRUN); + wallcycle_start(wcycle, WallCycleCounter::Run); print_start(fplog, cr, walltime_accounting, "mdrun"); /*********************************************************** @@ -890,7 +890,7 @@ void gmx::LegacySimulator::do_md() simulationWork.useGpuPmePpCommunication); } - wallcycle_start(wcycle, ewcSTEP); + wallcycle_start(wcycle, WallCycleCounter::Step); bLastStep = (step_rel == ir->nsteps); t = t0 + step * ir->delta_t; @@ -962,7 +962,7 @@ void gmx::LegacySimulator::do_md() if (vsite != nullptr) { // Virtual sites need to be updated before domain decomposition and forces are calculated - wallcycle_start(wcycle, ewcVSITECONSTR); + wallcycle_start(wcycle, WallCycleCounter::VsiteConstr); // md-vv calculates virtual velocities once it has full-step real velocities vsite->construct(state->x, state->v, @@ -970,7 +970,7 @@ void gmx::LegacySimulator::do_md() (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep) ? VSiteOperation::PositionsAndVelocities : VSiteOperation::Positions); - wallcycle_stop(wcycle, ewcVSITECONSTR); + wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr); } if (bNS && !(bFirstStep && ir->bContinuation)) @@ -1264,9 +1264,9 @@ void gmx::LegacySimulator::do_md() if (vsite != nullptr && needVirtualVelocitiesThisStep) { // Positions were calculated earlier - wallcycle_start(wcycle, ewcVSITECONSTR); + wallcycle_start(wcycle, WallCycleCounter::VsiteConstr); vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities); - wallcycle_stop(wcycle, ewcVSITECONSTR); + wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr); } } @@ -1407,7 +1407,7 @@ void gmx::LegacySimulator::do_md() if (!useGpuForUpdate) { - wallcycle_start(wcycle, ewcUPDATE); + wallcycle_start(wcycle, WallCycleCounter::Update); } /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */ if (bTrotter) @@ -1564,7 +1564,7 @@ void gmx::LegacySimulator::do_md() upd.update_coords( *ir, step, mdatoms, state, forceCombined, fcdata, ekind, M, etrtPOSITION, cr, constr != nullptr); - wallcycle_stop(wcycle, ewcUPDATE); + wallcycle_stop(wcycle, WallCycleCounter::Update); constrain_coordinates(constr, do_log, @@ -1934,7 +1934,7 @@ void gmx::LegacySimulator::do_md() rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data())); } - cycles = wallcycle_stop(wcycle, ewcSTEP); + cycles = wallcycle_stop(wcycle, WallCycleCounter::Step); if (DOMAINDECOMP(cr) && wcycle) { dd_cycles_add(cr->dd, cycles, ddCyclStep); diff --git a/src/gromacs/mdrun/mimic.cpp b/src/gromacs/mdrun/mimic.cpp index d5403f1945..25feef03cd 100644 --- a/src/gromacs/mdrun/mimic.cpp +++ b/src/gromacs/mdrun/mimic.cpp @@ -387,7 +387,7 @@ void gmx::LegacySimulator::do_mimic() } walltime_accounting_start_time(walltime_accounting); - wallcycle_start(wcycle, ewcRUN); + wallcycle_start(wcycle, WallCycleCounter::Run); print_start(fplog, cr, walltime_accounting, "mdrun"); /*********************************************************** @@ -438,7 +438,7 @@ void gmx::LegacySimulator::do_mimic() while (!isLastStep) { isLastStep = (isLastStep || (ir->nsteps >= 0 && step_rel == ir->nsteps)); - wallcycle_start(wcycle, ewcSTEP); + wallcycle_start(wcycle, WallCycleCounter::Step); t = step; @@ -464,9 +464,9 @@ void gmx::LegacySimulator::do_mimic() } if (constructVsites) { - wallcycle_start(wcycle, ewcVSITECONSTR); + wallcycle_start(wcycle, WallCycleCounter::VsiteConstr); vsite->construct(state->x, state->v, state->box, VSiteOperation::PositionsAndVelocities); - wallcycle_stop(wcycle, ewcVSITECONSTR); + wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr); } } @@ -756,7 +756,7 @@ void gmx::LegacySimulator::do_mimic() print_time(stderr, walltime_accounting, step, ir, cr); } - cycles = wallcycle_stop(wcycle, ewcSTEP); + cycles = wallcycle_stop(wcycle, WallCycleCounter::Step); if (DOMAINDECOMP(cr) && wcycle) { dd_cycles_add(cr->dd, cycles, ddCyclStep); diff --git a/src/gromacs/mdrun/minimize.cpp b/src/gromacs/mdrun/minimize.cpp index e564223b81..2725353d5c 100644 --- a/src/gromacs/mdrun/minimize.cpp +++ b/src/gromacs/mdrun/minimize.cpp @@ -140,18 +140,18 @@ typedef struct em_state static void print_em_start(FILE* fplog, const t_commrec* cr, gmx_walltime_accounting_t walltime_accounting, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, const char* name) { walltime_accounting_start_time(walltime_accounting); - wallcycle_start(wcycle, ewcRUN); + wallcycle_start(wcycle, WallCycleCounter::Run); print_start(fplog, cr, walltime_accounting, name); } //! Stop counting time for EM -static void em_time_end(gmx_walltime_accounting_t walltime_accounting, gmx_wallcycle_t wcycle) +static void em_time_end(gmx_walltime_accounting_t walltime_accounting, gmx_wallcycle* wcycle) { - wallcycle_stop(wcycle, ewcRUN); + wallcycle_stop(wcycle, WallCycleCounter::Run); walltime_accounting_end_time(walltime_accounting); } @@ -516,7 +516,7 @@ static void init_em(FILE* fplog, static void finish_em(const t_commrec* cr, gmx_mdoutf_t outf, gmx_walltime_accounting_t walltime_accounting, - gmx_wallcycle_t wcycle) + gmx_wallcycle* wcycle) { if (!thisRankHasDuty(cr, DUTY_PME)) { @@ -793,7 +793,7 @@ static void em_dd_partition_system(FILE* fplog, VirtualSitesHandler* vsite, gmx::Constraints* constr, t_nrnb* nrnb, - gmx_wallcycle_t wcycle) + gmx_wallcycle* wcycle) { /* Repartition the domain decomposition */ dd_partition_system(fplog, @@ -915,7 +915,7 @@ public: //! Manages flop accounting. t_nrnb* nrnb; //! Manages wall cycle accounting. - gmx_wallcycle_t wcycle; + gmx_wallcycle* wcycle; //! Coordinates global reduction. gmx_global_stat_t gstat; //! Handles virtual sites. @@ -1026,7 +1026,7 @@ void EnergyEvaluator::run(em_state_t* ems, rvec mu_tot, tensor vir, tensor pres, /* Communicate stuff when parallel */ if (PAR(cr) && inputrec->eI != IntegrationAlgorithm::NM) { - wallcycle_start(wcycle, ewcMoveE); + wallcycle_start(wcycle, WallCycleCounter::MoveE); global_stat(*gstat, cr, @@ -1041,7 +1041,7 @@ void EnergyEvaluator::run(em_state_t* ems, rvec mu_tot, tensor vir, tensor pres, FALSE, CGLO_ENERGY | CGLO_PRESSURE | CGLO_CONSTRAINT); - wallcycle_stop(wcycle, ewcMoveE); + wallcycle_stop(wcycle, WallCycleCounter::MoveE); } if (fr->dispersionCorrection) diff --git a/src/gromacs/mdrun/rerun.cpp b/src/gromacs/mdrun/rerun.cpp index e1234ade76..157d18e55b 100644 --- a/src/gromacs/mdrun/rerun.cpp +++ b/src/gromacs/mdrun/rerun.cpp @@ -431,7 +431,7 @@ void gmx::LegacySimulator::do_rerun() } walltime_accounting_start_time(walltime_accounting); - wallcycle_start(wcycle, ewcRUN); + wallcycle_start(wcycle, WallCycleCounter::Run); print_start(fplog, cr, walltime_accounting, "mdrun"); /*********************************************************** @@ -528,7 +528,7 @@ void gmx::LegacySimulator::do_rerun() isLastStep = (isLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps)); while (!isLastStep) { - wallcycle_start(wcycle, ewcSTEP); + wallcycle_start(wcycle, WallCycleCounter::Step); if (rerun_fr.bStep) { @@ -863,7 +863,7 @@ void gmx::LegacySimulator::do_rerun() rerun_parallel_comm(cr, &rerun_fr, &isLastStep); } - cycles = wallcycle_stop(wcycle, ewcSTEP); + cycles = wallcycle_stop(wcycle, WallCycleCounter::Step); if (DOMAINDECOMP(cr) && wcycle) { dd_cycles_add(cr->dd, cycles, ddCyclStep); diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index 33f9145889..45fbace426 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -653,7 +653,7 @@ static void finish_run(FILE* fplog, const t_commrec* cr, const t_inputrec& inputrec, t_nrnb nrnb[], - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, gmx_walltime_accounting_t walltime_accounting, nonbonded_verlet_t* nbv, const gmx_pme_t* pme, @@ -803,7 +803,6 @@ int Mdrunner::mdrunner() real ewaldcoeff_q = 0; real ewaldcoeff_lj = 0; int nChargePerturbed = -1, nTypePerturbed = 0; - gmx_wallcycle_t wcycle; gmx_walltime_accounting_t walltime_accounting = nullptr; MembedHolder membedHolder(filenames.size(), filenames.data()); @@ -1598,15 +1597,16 @@ int Mdrunner::mdrunner() "The -resetstep functionality is deprecated, and may be removed in a " "future version."); } - wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr); + std::unique_ptr wcycle = + wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr); if (PAR(cr)) { /* Master synchronizes its value of reset_counters with all nodes * including PME only nodes */ - int64_t reset_counters = wcycle_get_reset_counters(wcycle); + int64_t reset_counters = wcycle_get_reset_counters(wcycle.get()); gmx_bcast(sizeof(reset_counters), &reset_counters, cr->mpi_comm_mysim); - wcycle_set_reset_counters(wcycle, reset_counters); + wcycle_set_reset_counters(wcycle.get(), reset_counters); } // Membrane embedding must be initialized before we call init_forcerec() @@ -1680,7 +1680,7 @@ int Mdrunner::mdrunner() deviceStreamManager.get(), mtop, box, - wcycle); + wcycle.get()); // TODO: Move the logic below to a GPU bonded builder if (runScheduleWork.simulationWork.useGpuBonded) { @@ -1692,7 +1692,7 @@ int Mdrunner::mdrunner() fr->ic->epsfac * fr->fudgeQQ, deviceStreamManager->context(), deviceStreamManager->bondedStream(havePPDomainDecomposition(cr)), - wcycle); + wcycle.get()); fr->gpuBonded = gpuBonded.get(); } @@ -1910,7 +1910,7 @@ int Mdrunner::mdrunner() /* Let makeConstraints know whether we have essential dynamics constraints. */ auto constr = makeConstraints( - mtop, *inputrec, pull_work, doEssentialDynamics, fplog, cr, ms, &nrnb, wcycle, fr->bMolPBC); + mtop, *inputrec, pull_work, doEssentialDynamics, fplog, cr, ms, &nrnb, wcycle.get(), fr->bMolPBC); /* Energy terms and groups */ gmx_enerdata_t enerd(mtop.groups.groups[SimulationAtomGroupType::EnergyOutput].size(), @@ -1927,7 +1927,7 @@ int Mdrunner::mdrunner() /* Set up interactive MD (IMD) */ auto imdSession = makeImdSession(inputrec.get(), cr, - wcycle, + wcycle.get(), &enerd, ms, mtop, @@ -1960,11 +1960,11 @@ int Mdrunner::mdrunner() fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique( deviceStreamManager->context(), deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal), - wcycle); + wcycle.get()); fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique( deviceStreamManager->context(), deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal), - wcycle); + wcycle.get()); } std::unique_ptr stateGpu; @@ -1979,7 +1979,7 @@ int Mdrunner::mdrunner() GMX_RELEASE_ASSERT(deviceStreamManager != nullptr, "GPU device stream manager should be initialized to use GPU."); stateGpu = std::make_unique( - *deviceStreamManager, transferKind, pme_gpu_get_block_size(fr->pmedata), wcycle); + *deviceStreamManager, transferKind, pme_gpu_get_block_size(fr->pmedata), wcycle.get()); fr->stateGpu = stateGpu.get(); } @@ -1993,7 +1993,7 @@ int Mdrunner::mdrunner() simulatorBuilder.add(SimulatorEnv(fplog, cr, ms, mdlog, oenv)); - simulatorBuilder.add(Profiling(&nrnb, walltime_accounting, wcycle)); + simulatorBuilder.add(Profiling(&nrnb, walltime_accounting, wcycle.get())); simulatorBuilder.add(ConstraintsParam( constr.get(), enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr, vsite.get())); // TODO: Separate `fr` to a separate add, and make the `build` handle the coupling sensibly. @@ -2033,14 +2033,14 @@ int Mdrunner::mdrunner() gmx_pmeonly(pmedata, cr, &nrnb, - wcycle, + wcycle.get(), walltime_accounting, inputrec.get(), pmeRunMode, deviceStreamManager.get()); } - wallcycle_stop(wcycle, ewcRUN); + wallcycle_stop(wcycle.get(), WallCycleCounter::Run); /* Finish up, write some stuff * if rerunMD, don't write last frame again @@ -2050,14 +2050,12 @@ int Mdrunner::mdrunner() cr, *inputrec, &nrnb, - wcycle, + wcycle.get(), walltime_accounting, fr ? fr->nbv.get() : nullptr, pmedata, EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms)); - // clean up cycle counter - wallcycle_destroy(wcycle); deviceStreamManager.reset(nullptr); // Free PME data diff --git a/src/gromacs/mdrun/shellfc.cpp b/src/gromacs/mdrun/shellfc.cpp index 0b630b6a52..af47d79566 100644 --- a/src/gromacs/mdrun/shellfc.cpp +++ b/src/gromacs/mdrun/shellfc.cpp @@ -957,7 +957,7 @@ void relax_shell_flexcon(FILE* fplog, tensor force_vir, const t_mdatoms& md, t_nrnb* nrnb, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, gmx_shellfc_t* shfc, t_forcerec* fr, gmx::MdrunScheduleWorkload* runScheduleWork, diff --git a/src/gromacs/mdrun/shellfc.h b/src/gromacs/mdrun/shellfc.h index 6d3120fd98..b8a314679c 100644 --- a/src/gromacs/mdrun/shellfc.h +++ b/src/gromacs/mdrun/shellfc.h @@ -116,7 +116,7 @@ void relax_shell_flexcon(FILE* log, tensor force_vir, const t_mdatoms& md, t_nrnb* nrnb, - gmx_wallcycle_t wcycle, + gmx_wallcycle* wcycle, gmx_shellfc_t* shfc, t_forcerec* fr, gmx::MdrunScheduleWorkload* runScheduleWork, diff --git a/src/gromacs/mdrun/tpi.cpp b/src/gromacs/mdrun/tpi.cpp index 056cbb29d4..758e2ae691 100644 --- a/src/gromacs/mdrun/tpi.cpp +++ b/src/gromacs/mdrun/tpi.cpp @@ -305,7 +305,7 @@ void LegacySimulator::do_tpi() /* Print to log file */ walltime_accounting_start_time(walltime_accounting); - wallcycle_start(wcycle, ewcRUN); + wallcycle_start(wcycle, WallCycleCounter::Run); print_start(fplog, cr, walltime_accounting, "Test Particle Insertion"); /* The last charge group is the group to be inserted */ diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp index 16fbe131f8..69e11d69c0 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp @@ -136,8 +136,8 @@ StatePropagatorDataGpu::Impl::~Impl() {} void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll) { - wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); - wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); + wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); + wallcycle_sub_start_nocount(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData); numAtomsLocal_ = numAtomsLocal; numAtomsAll_ = numAtomsAll; @@ -174,8 +174,8 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll) clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, *localStream_); } - wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); - wallcycle_stop(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData); + wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); } std::tuple StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality) @@ -316,8 +316,8 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef h_ GMX_ASSERT(deviceStream != nullptr, "No stream is valid for copying forces with given atom locality."); - wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); - wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); + wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); + wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData); copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, *deviceStream); fReadyOnHost_[atomLocality].markEvent(*deviceStream); - wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); - wallcycle_stop(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData); + wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); } void StatePropagatorDataGpu::Impl::waitForcesReadyOnHost(AtomLocality atomLocality) { - wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA); + wallcycle_start(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData); fReadyOnHost_[atomLocality].waitForEvent(); - wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA); + wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData); } const DeviceStream* StatePropagatorDataGpu::Impl::getUpdateStream() diff --git a/src/gromacs/modularsimulator/propagator.cpp b/src/gromacs/modularsimulator/propagator.cpp index 7a6074d0cf..09edcd0890 100644 --- a/src/gromacs/modularsimulator/propagator.cpp +++ b/src/gromacs/modularsimulator/propagator.cpp @@ -154,7 +154,7 @@ template void Propagator::run() { - wallcycle_start(wcycle_, ewcUPDATE); + wallcycle_start(wcycle_, WallCycleCounter::Update); auto xp = as_rvec_array(statePropagatorData_->positionsView().paddedArrayRef().data()); auto x = as_rvec_array(statePropagatorData_->constPositionsView().paddedArrayRef().data()); @@ -178,7 +178,7 @@ void Propagator::run() } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR } - wallcycle_stop(wcycle_, ewcUPDATE); + wallcycle_stop(wcycle_, WallCycleCounter::Update); } //! Propagation (velocity only) @@ -188,7 +188,7 @@ template void Propagator::run() { - wallcycle_start(wcycle_, ewcUPDATE); + wallcycle_start(wcycle_, WallCycleCounter::Update); auto v = as_rvec_array(statePropagatorData_->velocitiesView().paddedArrayRef().data()); auto f = as_rvec_array(statePropagatorData_->constForcesView().force().data()); @@ -258,7 +258,7 @@ void Propagator::run() } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR } - wallcycle_stop(wcycle_, ewcUPDATE); + wallcycle_stop(wcycle_, WallCycleCounter::Update); } //! Propagation (leapfrog case - position and velocity) @@ -268,7 +268,7 @@ template void Propagator::run() { - wallcycle_start(wcycle_, ewcUPDATE); + wallcycle_start(wcycle_, WallCycleCounter::Update); auto xp = as_rvec_array(statePropagatorData_->positionsView().paddedArrayRef().data()); auto x = as_rvec_array(statePropagatorData_->constPositionsView().paddedArrayRef().data()); @@ -342,7 +342,7 @@ void Propagator::run() } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR } - wallcycle_stop(wcycle_, ewcUPDATE); + wallcycle_stop(wcycle_, WallCycleCounter::Update); } //! Propagation (velocity verlet stage 2 - velocity and position) @@ -352,7 +352,7 @@ template void Propagator::run() { - wallcycle_start(wcycle_, ewcUPDATE); + wallcycle_start(wcycle_, WallCycleCounter::Update); auto xp = as_rvec_array(statePropagatorData_->positionsView().paddedArrayRef().data()); auto x = as_rvec_array(statePropagatorData_->constPositionsView().paddedArrayRef().data()); @@ -426,7 +426,7 @@ void Propagator::run() } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR } - wallcycle_stop(wcycle_, ewcUPDATE); + wallcycle_stop(wcycle_, WallCycleCounter::Update); } template diff --git a/src/gromacs/modularsimulator/simulatoralgorithm.cpp b/src/gromacs/modularsimulator/simulatoralgorithm.cpp index afce03938b..ff20ef7cab 100644 --- a/src/gromacs/modularsimulator/simulatoralgorithm.cpp +++ b/src/gromacs/modularsimulator/simulatoralgorithm.cpp @@ -228,7 +228,7 @@ void ModularSimulatorAlgorithm::simulatorSetup() } walltime_accounting_start_time(walltime_accounting); - wallcycle_start(wcycle, ewcRUN); + wallcycle_start(wcycle, WallCycleCounter::Run); print_start(fplog, cr, walltime_accounting, "mdrun"); step_ = inputrec->init_step; @@ -275,7 +275,7 @@ void ModularSimulatorAlgorithm::preStep(Step step, Time gmx_unused time, bool is stophandlerCurrentStep_ = step; stopHandler_->setSignal(); - wallcycle_start(wcycle, ewcSTEP); + wallcycle_start(wcycle, WallCycleCounter::Step); } void ModularSimulatorAlgorithm::postStep(Step step, Time gmx_unused time) @@ -301,7 +301,7 @@ void ModularSimulatorAlgorithm::postStep(Step step, Time gmx_unused time) print_time(stderr, walltime_accounting, step, inputrec, cr); } - double cycles = wallcycle_stop(wcycle, ewcSTEP); + double cycles = wallcycle_stop(wcycle, WallCycleCounter::Step); if (DOMAINDECOMP(cr) && wcycle) { dd_cycles_add(cr->dd, static_cast(cycles), ddCyclStep); diff --git a/src/gromacs/modularsimulator/statepropagatordata.cpp b/src/gromacs/modularsimulator/statepropagatordata.cpp index 1cf9df615e..6c1d1a6ad9 100644 --- a/src/gromacs/modularsimulator/statepropagatordata.cpp +++ b/src/gromacs/modularsimulator/statepropagatordata.cpp @@ -393,7 +393,7 @@ StatePropagatorData::Element::registerTrajectoryWriterCallback(TrajectoryEvent e void StatePropagatorData::Element::write(gmx_mdoutf_t outf, Step currentStep, Time currentTime) { - wallcycle_start(mdoutf_get_wcycle(outf), ewcTRAJ); + wallcycle_start(mdoutf_get_wcycle(outf), WallCycleCounter::Traj); unsigned int mdof_flags = 0; if (do_per_step(currentStep, nstxout_)) { @@ -430,7 +430,7 @@ void StatePropagatorData::Element::write(gmx_mdoutf_t outf, Step currentStep, Ti if (mdof_flags == 0) { - wallcycle_stop(mdoutf_get_wcycle(outf), ewcTRAJ); + wallcycle_stop(mdoutf_get_wcycle(outf), WallCycleCounter::Traj); return; } GMX_ASSERT(localStateBackup_, "Trajectory writing called, but no state saved."); @@ -455,7 +455,7 @@ void StatePropagatorData::Element::write(gmx_mdoutf_t outf, Step currentStep, Ti { localStateBackup_.reset(); } - wallcycle_stop(mdoutf_get_wcycle(outf), ewcTRAJ); + wallcycle_stop(mdoutf_get_wcycle(outf), WallCycleCounter::Traj); } void StatePropagatorData::Element::elementSetup() @@ -619,7 +619,7 @@ void StatePropagatorData::Element::trajectoryWriterTeardown(gmx_mdoutf* gmx_unus GMX_ASSERT(localStateBackup_, "Final trajectory writing called, but no state saved."); - wallcycle_start(mdoutf_get_wcycle(outf), ewcTRAJ); + wallcycle_start(mdoutf_get_wcycle(outf), WallCycleCounter::Traj); if (DOMAINDECOMP(cr_)) { auto globalXRef = @@ -664,7 +664,7 @@ void StatePropagatorData::Element::trajectoryWriterTeardown(gmx_mdoutf* gmx_unus pbcType_, localStateBackup_->box); } - wallcycle_stop(mdoutf_get_wcycle(outf), ewcTRAJ); + wallcycle_stop(mdoutf_get_wcycle(outf), WallCycleCounter::Traj); } std::optional StatePropagatorData::Element::registerLastStepCallback() diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h index cdc35d093a..bf9c25a5e7 100644 --- a/src/gromacs/nbnxm/gpu_common.h +++ b/src/gromacs/nbnxm/gpu_common.h @@ -286,18 +286,18 @@ bool gpu_try_finish_task(NbnxmGpu* nb, // we start without counting and only when the task finished we issue a // start/stop to increment. // GpuTaskCompletion::Wait mode the timing is expected to be done in the caller. - wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L); + wallcycle_start_nocount(wcycle, WallCycleCounter::WaitGpuNbL); if (!haveStreamTasksCompleted(*nb->deviceStreams[iLocality])) { - wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); + wallcycle_stop(wcycle, WallCycleCounter::WaitGpuNbL); // Early return to skip the steps below that we have to do only // after the NB task completed return false; } - wallcycle_increment_event_count(wcycle, ewcWAIT_GPU_NB_L); + wallcycle_increment_event_count(wcycle, WallCycleCounter::WaitGpuNbL); } else if (haveResultToWaitFor) { @@ -361,8 +361,8 @@ float gpu_wait_finish_task(NbnxmGpu* nb, gmx_wallcycle* wcycle) { auto cycleCounter = (atomToInteractionLocality(aloc) == InteractionLocality::Local) - ? ewcWAIT_GPU_NB_L - : ewcWAIT_GPU_NB_NL; + ? WallCycleCounter::WaitGpuNbL + : WallCycleCounter::WaitGpuNbNL; wallcycle_start(wcycle, cycleCounter); gpu_try_finish_task(nb, stepWork, aloc, e_lj, e_el, shiftForces, GpuTaskCompletion::Wait, wcycle); diff --git a/src/gromacs/nbnxm/kerneldispatch.cpp b/src/gromacs/nbnxm/kerneldispatch.cpp index 846e2bbdb3..93e28f8487 100644 --- a/src/gromacs/nbnxm/kerneldispatch.cpp +++ b/src/gromacs/nbnxm/kerneldispatch.cpp @@ -259,7 +259,7 @@ static void nbnxn_kernel_cpu(const PairlistSet& pairlistSet, auto shiftVecPointer = as_rvec_array(shiftVectors.data()); int gmx_unused nthreads = gmx_omp_nthreads_get(emntNonbonded); - wallcycle_sub_start(wcycle, ewcsNONBONDED_CLEAR); + wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedClear); #pragma omp parallel for schedule(static) num_threads(nthreads) for (gmx::index nb = 0; nb < pairlists.ssize(); nb++) { @@ -276,8 +276,8 @@ static void nbnxn_kernel_cpu(const PairlistSet& pairlistSet, if (nb == 0) { - wallcycle_sub_stop(wcycle, ewcsNONBONDED_CLEAR); - wallcycle_sub_start(wcycle, ewcsNONBONDED_KERNEL); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedClear); + wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedKernel); } // TODO: Change to reference @@ -375,7 +375,7 @@ static void nbnxn_kernel_cpu(const PairlistSet& pairlistSet, } } } - wallcycle_sub_stop(wcycle, ewcsNONBONDED_KERNEL); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedKernel); if (stepWork.computeEnergy) { @@ -547,7 +547,7 @@ void nonbonded_verlet_t::dispatchFreeEnergyKernel(gmx::InteractionLocality GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_fep.ssize(), "Number of lists should be same as number of NB threads"); - wallcycle_sub_start(wcycle_, ewcsNONBONDED_FEP); + wallcycle_sub_start(wcycle_, WallCycleSubCounter::NonbondedFep); #pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize()) for (gmx::index th = 0; th < nbl_fep.ssize(); th++) { @@ -640,5 +640,5 @@ void nonbonded_verlet_t::dispatchFreeEnergyKernel(gmx::InteractionLocality + dvdl_nb[FreeEnergyPerturbationCouplingType::Coul]); } } - wallcycle_sub_stop(wcycle_, ewcsNONBONDED_FEP); + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::NonbondedFep); } diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index 62febc366f..ecc0ecc2ca 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -147,27 +147,27 @@ void nonbonded_verlet_t::setAtomProperties(gmx::ArrayRef atomTypes, void nonbonded_verlet_t::convertCoordinates(const gmx::AtomLocality locality, gmx::ArrayRef coordinates) { - wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); - wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS); + wallcycle_start(wcycle_, WallCycleCounter::NbXFBufOps); + wallcycle_sub_start(wcycle_, WallCycleSubCounter::NBXBufOps); nbnxn_atomdata_copy_x_to_nbat_x( pairSearch_->gridSet(), locality, as_rvec_array(coordinates.data()), nbat.get()); - wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS); - wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::NBXBufOps); + wallcycle_stop(wcycle_, WallCycleCounter::NbXFBufOps); } void nonbonded_verlet_t::convertCoordinatesGpu(const gmx::AtomLocality locality, DeviceBuffer d_x, GpuEventSynchronizer* xReadyOnDevice) { - wallcycle_start(wcycle_, ewcLAUNCH_GPU); - wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_X_BUF_OPS); + wallcycle_start(wcycle_, WallCycleCounter::LaunchGpu); + wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuNBXBufOps); nbnxn_atomdata_x_to_nbat_x_gpu(pairSearch_->gridSet(), locality, gpu_nbv, d_x, xReadyOnDevice); - wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_X_BUF_OPS); - wallcycle_stop(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuNBXBufOps); + wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); } gmx::ArrayRef nonbonded_verlet_t::getGridIndices() const @@ -186,13 +186,13 @@ void nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const gmx::AtomLocality local return; } - wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); - wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS); + wallcycle_start(wcycle_, WallCycleCounter::NbXFBufOps); + wallcycle_sub_start(wcycle_, WallCycleSubCounter::NBFBufOps); reduceForces(nbat.get(), locality, pairSearch_->gridSet(), as_rvec_array(force.data())); - wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS); - wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::NBFBufOps); + wallcycle_stop(wcycle_, WallCycleCounter::NbXFBufOps); } int nonbonded_verlet_t::getNumAtoms(const gmx::AtomLocality locality) const diff --git a/src/gromacs/nbnxm/prunekerneldispatch.cpp b/src/gromacs/nbnxm/prunekerneldispatch.cpp index f3e4dee503..98d05e13b1 100644 --- a/src/gromacs/nbnxm/prunekerneldispatch.cpp +++ b/src/gromacs/nbnxm/prunekerneldispatch.cpp @@ -100,8 +100,8 @@ void nonbonded_verlet_t::dispatchPruneKernelCpu(const gmx::InteractionLocality i void nonbonded_verlet_t::dispatchPruneKernelGpu(int64_t step) { - wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); - wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_NONBONDED); + wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); + wallcycle_sub_start_nocount(wcycle_, WallCycleSubCounter::LaunchGpuNonBonded); const bool stepIsEven = (pairlistSets().numStepsWithPairlist(step) % (2 * pairlistSets().params().mtsFactor) == 0); @@ -111,6 +111,6 @@ void nonbonded_verlet_t::dispatchPruneKernelGpu(int64_t step) stepIsEven ? gmx::InteractionLocality::Local : gmx::InteractionLocality::NonLocal, pairlistSets().params().numRollingPruningParts); - wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NONBONDED); - wallcycle_stop(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuNonBonded); + wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); } diff --git a/src/gromacs/swap/swapcoords.cpp b/src/gromacs/swap/swapcoords.cpp index 11c152313c..9414963873 100644 --- a/src/gromacs/swap/swapcoords.cpp +++ b/src/gromacs/swap/swapcoords.cpp @@ -2053,7 +2053,7 @@ gmx_bool do_swapcoords(t_commrec* cr, rvec com_solvent, com_particle; /* solvent and swap molecule's center of mass */ - wallcycle_start(wcycle, ewcSWAP); + wallcycle_start(wcycle, WallCycleCounter::Swap); set_pbc(s->pbc, ir->pbcType, box); @@ -2242,7 +2242,7 @@ gmx_bool do_swapcoords(t_commrec* cr, } /* end of if(bSwap) */ - wallcycle_stop(wcycle, ewcSWAP); + wallcycle_stop(wcycle, WallCycleCounter::Swap); return bSwap; } diff --git a/src/gromacs/timing/tests/timing.cpp b/src/gromacs/timing/tests/timing.cpp index 158d6aba69..3155ad4e14 100644 --- a/src/gromacs/timing/tests/timing.cpp +++ b/src/gromacs/timing/tests/timing.cpp @@ -44,6 +44,7 @@ #include "config.h" #include +#include #include #include "gromacs/timing/cyclecounter.h" @@ -72,31 +73,30 @@ class TimingTest : public ::testing::Test { public: TimingTest() : wcycle(wallcycle_init(nullptr, 0, nullptr)) {} - ~TimingTest() override { wallcycle_destroy(wcycle); } protected: - const int delayInMilliseconds = 1; - gmx_wallcycle_t wcycle; + const int delayInMilliseconds = 1; + std::unique_ptr wcycle; }; //! Test whether the we can run the cycle counter. TEST_F(TimingTest, RunWallCycle) { - int probe = 0, ref = 1; - int n1, n2; - double c1, c2; + WallCycleCounter probe = WallCycleCounter::Run, ref = WallCycleCounter::Step; + int n1, n2; + double c1, c2; //! credit cycles from enclosing call to the ref field of wcycle - wallcycle_start(wcycle, ref); + wallcycle_start(wcycle.get(), ref); //! cycles from the probe call - wallcycle_start(wcycle, probe); + wallcycle_start(wcycle.get(), probe); sleepForMilliseconds(delayInMilliseconds); - wallcycle_stop(wcycle, probe); - wallcycle_stop(wcycle, ref); + wallcycle_stop(wcycle.get(), probe); + wallcycle_stop(wcycle.get(), ref); //! extract both - wallcycle_get(wcycle, probe, &n1, &c1); - wallcycle_get(wcycle, ref, &n2, &c2); + wallcycle_get(wcycle.get(), probe, &n1, &c1); + wallcycle_get(wcycle.get(), ref, &n2, &c2); EXPECT_EQ(n1, n2); EXPECT_DOUBLE_EQ_TOL(c1, c2, relativeToleranceAsFloatingPoint(c1, 5e-3)); @@ -108,17 +108,17 @@ TEST_F(TimingTest, RunWallCycleSub) { if (useCycleSubcounters) { - int probe = 0; - int ref = 1; - int n1, n2; - double c1, c2; - wallcycle_sub_start(wcycle, ref); - wallcycle_sub_start(wcycle, probe); + WallCycleSubCounter probe = WallCycleSubCounter::DDRedist; + WallCycleSubCounter ref = WallCycleSubCounter::DDGrid; + int n1, n2; + double c1, c2; + wallcycle_sub_start(wcycle.get(), ref); + wallcycle_sub_start(wcycle.get(), probe); sleepForMilliseconds(delayInMilliseconds); - wallcycle_sub_stop(wcycle, probe); - wallcycle_sub_stop(wcycle, ref); - wallcycle_sub_get(wcycle, probe, &n1, &c1); - wallcycle_sub_get(wcycle, ref, &n2, &c2); + wallcycle_sub_stop(wcycle.get(), probe); + wallcycle_sub_stop(wcycle.get(), ref); + wallcycle_sub_get(wcycle.get(), probe, &n1, &c1); + wallcycle_sub_get(wcycle.get(), ref, &n2, &c2); EXPECT_EQ(n1, n2); EXPECT_DOUBLE_EQ_TOL(c1, c2, relativeToleranceAsFloatingPoint(c1, 5e-3)); diff --git a/src/gromacs/timing/wallcycle.cpp b/src/gromacs/timing/wallcycle.cpp index 4ea520268c..6f49958713 100644 --- a/src/gromacs/timing/wallcycle.cpp +++ b/src/gromacs/timing/wallcycle.cpp @@ -44,6 +44,7 @@ #include #include +#include #include #include "gromacs/math/functions.h" @@ -51,12 +52,15 @@ #include "gromacs/timing/cyclecounter.h" #include "gromacs/timing/gpu_timing.h" #include "gromacs/timing/wallcyclereporting.h" +#include "gromacs/utility/arrayref.h" #include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/enumerationhelpers.h" #include "gromacs/utility/gmxassert.h" #include "gromacs/utility/gmxmpi.h" #include "gromacs/utility/logger.h" #include "gromacs/utility/smalloc.h" #include "gromacs/utility/snprintf.h" +#include "gromacs/utility/stringutil.h" //! Whether wallcycle debugging is enabled constexpr bool gmx_unused enableWallcycleDebug = (DEBUG_WCYCLE != 0); @@ -71,87 +75,97 @@ constexpr bool gmx_unused debugPrintDepth = false /* enableWallcycleDebug */; /* Each name should not exceed 19 printing characters (ie. terminating null can be twentieth) */ -static const char* wcn[ewcNR] = { "Run", - "Step", - "PP during PME", - "Domain decomp.", - "DD comm. load", - "DD comm. bounds", - "Vsite constr.", - "Send X to PME", - "Neighbor search", - "Launch GPU ops.", - "Comm. coord.", - "Force", - "Wait + Comm. F", - "PME mesh", - "PME redist. X/F", - "PME spread", - "PME gather", - "PME 3D-FFT", - "PME 3D-FFT Comm.", - "PME solve LJ", - "PME solve Elec", - "PME wait for PP", - "Wait + Recv. PME F", - "Wait PME GPU spread", - "PME 3D-FFT", - "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */ - "Wait PME GPU gather", - "Wait Bonded GPU", - "Reduce GPU PME F", - "Wait GPU NB nonloc.", - "Wait GPU NB local", - "Wait GPU state copy", - "NB X/F buffer ops.", - "Vsite spread", - "COM pull force", - "AWH", - "Write traj.", - "Update", - "Constraints", - "Comm. energies", - "Enforced rotation", - "Add rot. forces", - "Position swapping", - "IMD", - "Test" }; - -static const char* wcsn[ewcsNR] = { - "DD redist.", - "DD NS grid + sort", - "DD setup comm.", - "DD make top.", - "DD make constr.", - "DD top. other", - "DD GPU ops.", - "NS grid local", - "NS grid non-loc.", - "NS search local", - "NS search non-loc.", - "Bonded F", - "Bonded-FEP F", - "Restraints F", - "Listed buffer ops.", - "Nonbonded pruning", - "Nonbonded F kernel", - "Nonbonded F clear", - "Nonbonded FEP", - "Launch NB GPU tasks", - "Launch Bonded GPU tasks", - "Launch PME GPU tasks", - "Launch state copy", - "Ewald F correction", - "NB X buffer ops.", - "NB F buffer ops.", - "Clear force buffer", - "Launch GPU NB X buffer ops.", - "Launch GPU NB F buffer ops.", - "Launch GPU Comm. coord.", - "Launch GPU Comm. force.", - "Launch GPU update", - "Test subcounter", -}; +static const char* enumValuetoString(WallCycleCounter enumValue) +{ + constexpr gmx::EnumerationArray wallCycleCounterNames = { + "Run", + "Step", + "PP during PME", + "Domain decomp.", + "DD comm. load", + "DD comm. bounds", + "Vsite constr.", + "Send X to PME", + "Neighbor search", + "Launch GPU ops.", + "Comm. coord.", + "Force", + "Wait + Comm. F", + "PME mesh", + "PME redist. X/F", + "PME spread", + "PME gather", + "PME 3D-FFT", + "PME 3D-FFT Comm.", + "PME solve LJ", + "PME solve Elec", + "PME wait for PP", + "Wait + Recv. PME F", + "Wait PME GPU spread", + "PME 3D-FFT", + "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */ + "Wait PME GPU gather", + "Wait Bonded GPU", + "Reduce GPU PME F", + "Wait GPU NB nonloc.", + "Wait GPU NB local", + "Wait GPU state copy", + "NB X/F buffer ops.", + "Vsite spread", + "COM pull force", + "AWH", + "Write traj.", + "Update", + "Constraints", + "Comm. energies", + "Enforced rotation", + "Add rot. forces", + "Position swapping", + "IMD", + "Test" + }; + return wallCycleCounterNames[enumValue]; +} + +static const char* enumValuetoString(WallCycleSubCounter enumValue) +{ + constexpr gmx::EnumerationArray wallCycleSubCounterNames = { + "DD redist.", + "DD NS grid + sort", + "DD setup comm.", + "DD make top.", + "DD make constr.", + "DD top. other", + "DD GPU ops.", + "NS grid local", + "NS grid non-loc.", + "NS search local", + "NS search non-loc.", + "Bonded F", + "Bonded-FEP F", + "Restraints F", + "Listed buffer ops.", + "Nonbonded pruning", + "Nonbonded F kernel", + "Nonbonded F clear", + "Nonbonded FEP", + "Launch NB GPU tasks", + "Launch Bonded GPU tasks", + "Launch PME GPU tasks", + "Launch state copy", + "Ewald F correction", + "NB X buffer ops.", + "NB F buffer ops.", + "Clear force buffer", + "Launch GPU NB X buffer ops.", + "Launch GPU NB F buffer ops.", + "Launch GPU Comm. coord.", + "Launch GPU Comm. force.", + "Launch GPU update", + "Test subcounter" + }; + return wallCycleSubCounterNames[enumValue]; +} /* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */ static const char* enumValuetoString(PmeStage enumValue) @@ -168,23 +182,22 @@ bool wallcycle_have_counter() return gmx_cycles_have_counter(); } -gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr) +std::unique_ptr wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr) { - gmx_wallcycle_t wc; + std::unique_ptr wc; if (!wallcycle_have_counter()) { - return nullptr; + return wc; } - snew(wc, 1); + wc = std::make_unique(); - wc->haveInvalidCount = FALSE; - wc->wc_barrier = FALSE; - wc->wcc_all = nullptr; + wc->haveInvalidCount = false; + wc->wc_barrier = false; wc->wc_depth = 0; - wc->ewc_prev = -1; + wc->ewc_prev = WallCycleCounter::Count; wc->reset_counters = resetstep; wc->cr = cr; @@ -196,23 +209,17 @@ gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr) { fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n"); } - wc->wc_barrier = TRUE; + wc->wc_barrier = true; } #endif - snew(wc->wcc, ewcNR); if (getenv("GMX_CYCLE_ALL") != nullptr) { if (fplog) { fprintf(fplog, "\nWill time all the code during the run\n\n"); } - snew(wc->wcc_all, ewcNR * ewcNR); - } - - if (sc_useCycleSubcounters) - { - snew(wc->wcsc, ewcsNR); + wc->wcc_all.resize(sc_numWallCycleCountersSquared); } #if DEBUG_WCYCLE @@ -223,32 +230,10 @@ gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr) return wc; } -void wallcycle_destroy(gmx_wallcycle_t wc) -{ - if (wc == nullptr) - { - return; - } - - if (wc->wcc != nullptr) - { - sfree(wc->wcc); - } - if (wc->wcc_all != nullptr) - { - sfree(wc->wcc_all); - } - if (wc->wcsc != nullptr) - { - sfree(wc->wcsc); - } - sfree(wc); -} - #if DEBUG_WCYCLE -static void debug_start_check(gmx_wallcycle_t wc, int ewc) +static void debug_start_check(gmx_wallcycle* wc, WallCycleCounter ewc) { - if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX) + if (wc->count_depth < 0 || wc->count_depth >= c_MaxWallCycleDepth) { gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth + 1); } @@ -258,41 +243,44 @@ static void debug_start_check(gmx_wallcycle_t wc, int ewc) if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank)) { std::string indentStr(4 * wc->count_depth, ' '); - fprintf(stderr, "%swcycle_start depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]); + fprintf(stderr, "%swcycle_start depth %d, %s\n", indentStr.c_str(), wc->count_depth, enumValuetoString(ewc)); } } -static void debug_stop_check(gmx_wallcycle_t wc, int ewc) +static void debug_stop_check(gmx_wallcycle* wc, WallCycleCounter ewc) { if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank)) { std::string indentStr(4 * wc->count_depth, ' '); - fprintf(stderr, "%swcycle_stop depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]); + fprintf(stderr, "%swcycle_stop depth %d, %s\n", indentStr.c_str(), wc->count_depth, enumValuetoString(ewc)); } wc->count_depth--; if (wc->count_depth < 0) { - gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc], wc->count_depth); + gmx_fatal(FARGS, + "wallcycle counter depth out of range when stopping %s: %d", + enumValuetoString(ewc), + wc->count_depth); } if (wc->counterlist[wc->count_depth] != ewc) { gmx_fatal(FARGS, "wallcycle mismatch at stop, start %s, stop %s", - wcn[wc->counterlist[wc->count_depth]], - wcn[ewc]); + enumValuetoString(wc->counterlist[wc->count_depth]), + enumValuetoString(ewc)); } } #endif -void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c) +void wallcycle_get(gmx_wallcycle* wc, WallCycleCounter ewc, int* n, double* c) { *n = wc->wcc[ewc].n; *c = static_cast(wc->wcc[ewc].c); } -void wallcycle_sub_get(gmx_wallcycle_t wc, int ewcs, int* n, double* c) +void wallcycle_sub_get(gmx_wallcycle* wc, WallCycleSubCounter ewcs, int* n, double* c) { if (sc_useCycleSubcounters && wc != nullptr) { @@ -301,48 +289,43 @@ void wallcycle_sub_get(gmx_wallcycle_t wc, int ewcs, int* n, double* c) } } -void wallcycle_reset_all(gmx_wallcycle_t wc) +void wallcycle_reset_all(gmx_wallcycle* wc) { - int i; - if (wc == nullptr) { return; } - for (i = 0; i < ewcNR; i++) + for (auto& counter : wc->wcc) { - wc->wcc[i].n = 0; - wc->wcc[i].c = 0; + counter.n = 0; + counter.c = 0; } - wc->haveInvalidCount = FALSE; + wc->haveInvalidCount = false; - if (wc->wcc_all) + if (!wc->wcc_all.empty()) { - for (i = 0; i < ewcNR * ewcNR; i++) + for (int i = 0; i < sc_numWallCycleCountersSquared; i++) { wc->wcc_all[i].n = 0; wc->wcc_all[i].c = 0; } } - if (wc->wcsc) + for (auto& counter : wc->wcsc) { - for (i = 0; i < ewcsNR; i++) - { - wc->wcsc[i].n = 0; - wc->wcsc[i].c = 0; - } + counter.n = 0; + counter.c = 0; } } -static gmx_bool is_pme_counter(int ewc) +static bool is_pme_counter(WallCycleCounter ewc) { - return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM); + return (ewc >= WallCycleCounter::PmeMesh && ewc <= WallCycleCounter::PmeWaitComm); } -static gmx_bool is_pme_subcounter(int ewc) +static bool is_pme_subcounter(WallCycleCounter ewc) { - return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM); + return (ewc >= WallCycleCounter::PmeRedistXF && ewc < WallCycleCounter::PmeWaitComm); } void wallcycleBarrier(gmx_wallcycle* wc) @@ -358,7 +341,10 @@ void wallcycleBarrier(gmx_wallcycle* wc) } /* Subtract counter ewc_sub timed inside a timing block for ewc_main */ -static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub) +// NOLINTNEXTLINE(google-runtime-references) +static void subtract_cycles(gmx::EnumerationArray& wcc, + WallCycleCounter ewc_main, + WallCycleCounter ewc_sub) { if (wcc[ewc_sub].n > 0) { @@ -374,45 +360,47 @@ static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub) } } -void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme) +void wallcycle_scale_by_num_threads(gmx_wallcycle* wc, bool isPmeRank, int nthreads_pp, int nthreads_pme) { if (wc == nullptr) { return; } - for (int i = 0; i < ewcNR; i++) + for (auto key : keysOf(wc->wcc)) { - if (is_pme_counter(i) || (i == ewcRUN && isPmeRank)) + if (is_pme_counter(key) || (key == WallCycleCounter::Run && isPmeRank)) { - wc->wcc[i].c *= nthreads_pme; + wc->wcc[key].c *= nthreads_pme; - if (wc->wcc_all) + if (!wc->wcc_all.empty()) { - for (int j = 0; j < ewcNR; j++) + const int current = static_cast(key); + for (int j = 0; j < sc_numWallCycleCounters; j++) { - wc->wcc_all[i * ewcNR + j].c *= nthreads_pme; + wc->wcc_all[current * sc_numWallCycleCounters + j].c *= nthreads_pme; } } } else { - wc->wcc[i].c *= nthreads_pp; + wc->wcc[key].c *= nthreads_pp; - if (wc->wcc_all) + if (!wc->wcc_all.empty()) { - for (int j = 0; j < ewcNR; j++) + const int current = static_cast(key); + for (int j = 0; j < sc_numWallCycleCounters; j++) { - wc->wcc_all[i * ewcNR + j].c *= nthreads_pp; + wc->wcc_all[current * sc_numWallCycleCounters + j].c *= nthreads_pp; } } } } - if (sc_useCycleSubcounters && wc->wcsc && !isPmeRank) + if (sc_useCycleSubcounters && !isPmeRank) { - for (int i = 0; i < ewcsNR; i++) + for (auto counter : wc->wcsc) { - wc->wcsc[i].c *= nthreads_pp; + counter.c *= nthreads_pp; } } } @@ -429,16 +417,15 @@ void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthr * wcc_all are unused by the GPU reporting, but it is not satisfactory * for the future. Also, there's no need for MPI_Allreduce, since * only MASTERRANK uses any of the results. */ -WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc) +WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle* wc) { - WallcycleCounts cycles_sum; - wallcc_t* wcc; - double cycles[int(ewcNR) + int(ewcsNR)]; + WallcycleCounts cycles_sum; + gmx::EnumerationArray cyclesMain; + gmx::EnumerationArray cyclesSub; #if GMX_MPI - double cycles_n[int(ewcNR) + int(ewcsNR) + 1]; + gmx::EnumerationArray cyclesMainOnNode; + gmx::EnumerationArray cyclesSubOnNode; #endif - int i; - int nsum; if (wc == nullptr) { @@ -448,104 +435,128 @@ WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc) return cycles_sum; } - wcc = wc->wcc; + auto& wcc = wc->wcc; - subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD); - subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND); + subtract_cycles(wcc, WallCycleCounter::Domdec, WallCycleCounter::DDCommLoad); + subtract_cycles(wcc, WallCycleCounter::Domdec, WallCycleCounter::DDCommBound); - subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM); + subtract_cycles(wcc, WallCycleCounter::PmeFft, WallCycleCounter::PmeFftComm); if (cr->npmenodes == 0) { /* All nodes do PME (or no PME at all) */ - subtract_cycles(wcc, ewcFORCE, ewcPMEMESH); + subtract_cycles(wcc, WallCycleCounter::Force, WallCycleCounter::PmeMesh); } else { /* The are PME-only nodes */ - if (wcc[ewcPMEMESH].n > 0) + if (wcc[WallCycleCounter::PmeMesh].n > 0) { /* This must be a PME only node, calculate the Wait + Comm. time */ - GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c, + GMX_ASSERT(wcc[WallCycleCounter::Run].c >= wcc[WallCycleCounter::PmeMesh].c, "Total run ticks must be greater than PME-only ticks"); - wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c; + wcc[WallCycleCounter::PmeWaitComm].c = + wcc[WallCycleCounter::Run].c - wcc[WallCycleCounter::PmeMesh].c; } } /* Store the cycles in a double buffer for summing */ - for (i = 0; i < ewcNR; i++) + for (auto key : keysOf(wcc)) { #if GMX_MPI - cycles_n[i] = static_cast(wcc[i].n); + cyclesMainOnNode[key] = static_cast(wcc[key].n); #endif - cycles[i] = static_cast(wcc[i].c); + cyclesMain[key] = static_cast(wcc[key].c); } - nsum = ewcNR; - if (wc->wcsc) + if (sc_useCycleSubcounters) { - for (i = 0; i < ewcsNR; i++) + for (auto key : keysOf(wc->wcsc)) { #if GMX_MPI - cycles_n[ewcNR + i] = static_cast(wc->wcsc[i].n); + cyclesSubOnNode[key] = static_cast(wc->wcsc[key].n); #endif - cycles[ewcNR + i] = static_cast(wc->wcsc[i].c); + cyclesSub[key] = static_cast(wc->wcsc[key].c); } - nsum += ewcsNR; } #if GMX_MPI if (cr->nnodes > 1) { - double buf[int(ewcNR) + int(ewcsNR) + 1]; + gmx::EnumerationArray bufMain; + gmx::EnumerationArray bufSub; // TODO this code is used only at the end of the run, so we // can just do a simple reduce of haveInvalidCount in // wallcycle_print, and avoid bugs - cycles_n[nsum] = (wc->haveInvalidCount ? 1 : 0); + double haveInvalidCount = (wc->haveInvalidCount ? 1 : 0); // TODO Use MPI_Reduce - MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim); - for (i = 0; i < ewcNR; i++) + MPI_Allreduce(cyclesMainOnNode.data(), bufMain.data(), bufMain.size(), MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim); + if (sc_useCycleSubcounters) + { + MPI_Allreduce(cyclesSubOnNode.data(), bufSub.data(), bufSub.size(), MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim); + } + MPI_Allreduce(MPI_IN_PLACE, &haveInvalidCount, 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim); + for (auto key : keysOf(wcc)) { - wcc[i].n = gmx::roundToInt(buf[i]); + wcc[key].n = gmx::roundToInt(bufMain[key]); } - wc->haveInvalidCount = (buf[nsum] > 0); - if (wc->wcsc) + wc->haveInvalidCount = (haveInvalidCount > 0); + if (sc_useCycleSubcounters) { - for (i = 0; i < ewcsNR; i++) + for (auto key : keysOf(wc->wcsc)) { - wc->wcsc[i].n = gmx::roundToInt(buf[ewcNR + i]); + wc->wcsc[key].n = gmx::roundToInt(bufSub[key]); } } // TODO Use MPI_Reduce - MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim); + MPI_Allreduce(cyclesMain.data(), cycles_sum.data(), cyclesMain.size(), MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim); + if (sc_useCycleSubcounters) + { + MPI_Allreduce(cyclesSub.data(), + cycles_sum.data() + sc_numWallCycleCounters, + cyclesSub.size(), + MPI_DOUBLE, + MPI_SUM, + cr->mpi_comm_mysim); + } - if (wc->wcc_all != nullptr) + if (!wc->wcc_all.empty()) { - double *buf_all, *cyc_all; + std::array cyc_all; + std::array buf_all; - snew(cyc_all, ewcNR * ewcNR); - snew(buf_all, ewcNR * ewcNR); - for (i = 0; i < ewcNR * ewcNR; i++) + for (int i = 0; i < sc_numWallCycleCountersSquared; i++) { cyc_all[i] = wc->wcc_all[i].c; } // TODO Use MPI_Reduce - MPI_Allreduce(cyc_all, buf_all, ewcNR * ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim); - for (i = 0; i < ewcNR * ewcNR; i++) + MPI_Allreduce(cyc_all.data(), + buf_all.data(), + sc_numWallCycleCountersSquared, + MPI_DOUBLE, + MPI_SUM, + cr->mpi_comm_mysim); + for (int i = 0; i < sc_numWallCycleCountersSquared; i++) { wc->wcc_all[i].c = static_cast(buf_all[i]); } - sfree(buf_all); - sfree(cyc_all); } } else #endif { - for (i = 0; i < nsum; i++) + for (auto key : keysOf(cyclesMain)) { - cycles_sum[i] = cycles[i]; + cycles_sum[static_cast(key)] = cyclesMain[key]; + } + if (sc_useCycleSubcounters) + { + for (auto key : keysOf(cyclesSub)) + { + const int offset = static_cast(key) + sc_numWallCycleCounters; + cycles_sum[offset] = cyclesSub[key]; + } } } @@ -669,14 +680,14 @@ void wallcycle_print(FILE* fplog, int nth_pp, int nth_pme, double realtime, - gmx_wallcycle_t wc, + gmx_wallcycle* wc, const WallcycleCounts& cyc_sum, const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t, const gmx_wallclock_gpu_pme_t* gpu_pme_t) { double tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio; double c2t, c2t_pp, c2t_pme = 0; - int i, j, npp, nth_tot; + int npp, nth_tot; char buf[STRLEN]; const char* hline = "-----------------------------------------------------------------------------"; @@ -699,7 +710,7 @@ void wallcycle_print(FILE* fplog, /* When using PME-only nodes, the next line is valid for both PP-only and PME-only nodes because they started ewcRUN at the same time. */ - tot = cyc_sum[ewcRUN]; + tot = cyc_sum[static_cast(WallCycleCounter::Run)]; tot_for_pp = 0; if (tot <= 0.0) @@ -749,44 +760,63 @@ void wallcycle_print(FILE* fplog, print_header(fplog, npp, nth_pp, npme, nth_pme); fprintf(fplog, "%s\n", hline); - for (i = ewcPPDURINGPME + 1; i < ewcNR; i++) + gmx::EnumerationWrapper iter; + for (auto key = gmx::EnumerationIterator(WallCycleCounter::Domdec); + key != iter.end(); + ++key) { - if (is_pme_subcounter(i)) + + if (is_pme_subcounter(*key)) { /* Do not count these at all */ } - else if (npme > 0 && is_pme_counter(i)) + else if (npme > 0 && is_pme_counter(*key)) { /* Print timing information for PME-only nodes, but add an * asterisk so the reader of the table can know that the * walltimes are not meant to add up. The asterisk still * fits in the required maximum of 19 characters. */ - char buffer[STRLEN]; - snprintf(buffer, STRLEN, "%s *", wcn[i]); - print_cycles(fplog, c2t_pme, buffer, npme, nth_pme, wc->wcc[i].n, cyc_sum[i], tot); + std::string message = gmx::formatString("%s *", enumValuetoString(*key)); + print_cycles(fplog, + c2t_pme, + message.c_str(), + npme, + nth_pme, + wc->wcc[*key].n, + cyc_sum[static_cast(*key)], + tot); } else { /* Print timing information when it is for a PP or PP+PME node */ - print_cycles(fplog, c2t_pp, wcn[i], npp, nth_pp, wc->wcc[i].n, cyc_sum[i], tot); - tot_for_pp += cyc_sum[i]; + print_cycles(fplog, + c2t_pp, + enumValuetoString(*key), + npp, + nth_pp, + wc->wcc[*key].n, + cyc_sum[static_cast(*key)], + tot); + tot_for_pp += cyc_sum[static_cast(*key)]; } } - if (wc->wcc_all != nullptr) + if (!wc->wcc_all.empty()) { - for (i = 0; i < ewcNR; i++) + for (auto i : keysOf(wc->wcc)) { - for (j = 0; j < ewcNR; j++) + const int countI = static_cast(i); + for (auto j : keysOf(wc->wcc)) { - snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]); + const int countJ = static_cast(j); + snprintf(buf, 20, "%-9.9s %-9.9s", enumValuetoString(i), enumValuetoString(j)); print_cycles(fplog, c2t_pp, buf, npp, nth_pp, - wc->wcc_all[i * ewcNR + j].n, - wc->wcc_all[i * ewcNR + j].c, + wc->wcc_all[countI * sc_numWallCycleCounters + countJ].n, + wc->wcc_all[countI * sc_numWallCycleCounters + countJ].c, tot); } } @@ -806,16 +836,18 @@ void wallcycle_print(FILE* fplog, hline); } - if (wc->wcc[ewcPMEMESH].n > 0) + if (wc->wcc[WallCycleCounter::PmeMesh].n > 0) { // A workaround to not print breakdown when no subcounters were recorded. // TODO: figure out and record PME GPU counters (what to do with the waiting ones?) - std::vector validPmeSubcounterIndices; - for (i = ewcPPDURINGPME + 1; i < ewcNR; i++) + std::vector validPmeSubcounterIndices; + for (auto key = gmx::EnumerationIterator(WallCycleCounter::Domdec); + key != iter.end(); + key++) { - if (is_pme_subcounter(i) && wc->wcc[i].n > 0) + if (is_pme_subcounter(*key) && wc->wcc[*key].n > 0) { - validPmeSubcounterIndices.push_back(i); + validPmeSubcounterIndices.push_back(*key); } } @@ -827,24 +859,31 @@ void wallcycle_print(FILE* fplog, { print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, - wcn[i], + enumValuetoString(i), npme > 0 ? npme : npp, nth_pme, wc->wcc[i].n, - cyc_sum[i], + cyc_sum[static_cast(i)], tot); } fprintf(fplog, "%s\n", hline); } } - if (sc_useCycleSubcounters && wc->wcsc) + if (sc_useCycleSubcounters) { fprintf(fplog, " Breakdown of PP computation\n"); fprintf(fplog, "%s\n", hline); - for (i = 0; i < ewcsNR; i++) + for (auto key : keysOf(wc->wcsc)) { - print_cycles(fplog, c2t_pp, wcsn[i], npp, nth_pp, wc->wcsc[i].n, cyc_sum[ewcNR + i], tot); + print_cycles(fplog, + c2t_pp, + enumValuetoString(key), + npp, + nth_pp, + wc->wcsc[key].n, + cyc_sum[sc_numWallCycleCounters + static_cast(key)], + tot); } fprintf(fplog, "%s\n", hline); } @@ -865,19 +904,19 @@ void wallcycle_print(FILE* fplog, tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t; /* add up the kernel timings */ - for (i = 0; i < 2; i++) + for (int i = 0; i < 2; i++) { - for (j = 0; j < 2; j++) + for (int j = 0; j < 2; j++) { tot_gpu += gpu_nbnxn_t->ktime[i][j].t; } } tot_gpu += gpu_nbnxn_t->pruneTime.t; - tot_cpu_overlap = wc->wcc[ewcFORCE].c; - if (wc->wcc[ewcPMEMESH].n > 0) + tot_cpu_overlap = wc->wcc[WallCycleCounter::Force].c; + if (wc->wcc[WallCycleCounter::PmeMesh].n > 0) { - tot_cpu_overlap += wc->wcc[ewcPMEMESH].c; + tot_cpu_overlap += wc->wcc[WallCycleCounter::PmeMesh].c; } tot_cpu_overlap *= realtime * 1000 / tot; /* convert s to ms */ @@ -889,9 +928,9 @@ void wallcycle_print(FILE* fplog, print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu); print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu); - for (i = 0; i < 2; i++) + for (int i = 0; i < 2; i++) { - for (j = 0; j < 2; j++) + for (int j = 0; j < 2; j++) { if (gpu_nbnxn_t->ktime[i][j].c) { @@ -939,18 +978,18 @@ void wallcycle_print(FILE* fplog, fprintf(fplog, "%s\n", hline); } gpu_cpu_ratio = tot_gpu / tot_cpu_overlap; - if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0) + if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[WallCycleCounter::Force].n > 0) { fprintf(fplog, "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = " "%.3f\n", tot_gpu / gpu_nbnxn_t->nb_c, - tot_cpu_overlap / wc->wcc[ewcFORCE].n, + tot_cpu_overlap / wc->wcc[WallCycleCounter::Force].n, gpu_cpu_ratio); } /* only print notes related to CPU-GPU load balance with PME */ - if (wc->wcc[ewcPMEMESH].n > 0) + if (wc->wcc[WallCycleCounter::PmeMesh].n > 0) { fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n"); @@ -1011,10 +1050,12 @@ void wallcycle_print(FILE* fplog, "call, so timings are not those of real runs."); } - if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cyc_sum[ewcDOMDEC] > tot * 0.1 || cyc_sum[ewcNS] > tot * 0.1)) + if (wc->wcc[WallCycleCounter::NbXFBufOps].n > 0 + && (cyc_sum[static_cast(WallCycleCounter::Domdec)] > tot * 0.1 + || cyc_sum[static_cast(WallCycleCounter::NS)] > tot * 0.1)) { /* Only the sim master calls this function, so always print to stderr */ - if (wc->wcc[ewcDOMDEC].n == 0) + if (wc->wcc[WallCycleCounter::Domdec].n == 0) { GMX_LOG(mdlog.warning) .asParagraph() @@ -1022,7 +1063,7 @@ void wallcycle_print(FILE* fplog, "NOTE: %d %% of the run time was spent in pair search,\n" " you might want to increase nstlist (this has no effect on " "accuracy)\n", - gmx::roundToInt(100 * cyc_sum[ewcNS] / tot)); + gmx::roundToInt(100 * cyc_sum[static_cast(WallCycleCounter::NS)] / tot)); } else { @@ -1033,38 +1074,36 @@ void wallcycle_print(FILE* fplog, " %d %% of the run time was spent in pair search,\n" " you might want to increase nstlist (this has no effect on " "accuracy)\n", - gmx::roundToInt(100 * cyc_sum[ewcDOMDEC] / tot), - gmx::roundToInt(100 * cyc_sum[ewcNS] / tot)); + gmx::roundToInt(100 * cyc_sum[static_cast(WallCycleCounter::Domdec)] / tot), + gmx::roundToInt(100 * cyc_sum[static_cast(WallCycleCounter::NS)] / tot)); } } - if (cyc_sum[ewcMoveE] > tot * 0.05) + if (cyc_sum[static_cast(WallCycleCounter::MoveE)] > tot * 0.05) { GMX_LOG(mdlog.warning) .asParagraph() .appendTextFormatted( "NOTE: %d %% of the run time was spent communicating energies,\n" " you might want to increase some nst* mdp options\n", - gmx::roundToInt(100 * cyc_sum[ewcMoveE] / tot)); + gmx::roundToInt(100 * cyc_sum[static_cast(WallCycleCounter::MoveE)] / tot)); } } -extern int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc) +int64_t wcycle_get_reset_counters(gmx_wallcycle* wc) { if (wc == nullptr) { return -1; } - return wc->reset_counters; } -extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters) +void wcycle_set_reset_counters(gmx_wallcycle* wc, int64_t reset_counters) { if (wc == nullptr) { return; } - wc->reset_counters = reset_counters; } diff --git a/src/gromacs/timing/wallcycle.h b/src/gromacs/timing/wallcycle.h index 133b90216c..f68035f2a5 100644 --- a/src/gromacs/timing/wallcycle.h +++ b/src/gromacs/timing/wallcycle.h @@ -43,12 +43,24 @@ #include +#include +#include +#include + #include "gromacs/timing/cyclecounter.h" #include "gromacs/utility/basedefinitions.h" +#include "gromacs/utility/enumerationhelpers.h" + +#ifndef DEBUG_WCYCLE +/*! \brief Enables consistency checking for the counters. + * + * If the macro is set to 1, code checks if you stop a counter different from the last + * one that was opened and if you do nest too deep. + */ +# define DEBUG_WCYCLE 0 +#endif -typedef struct gmx_wallcycle* gmx_wallcycle_t; struct t_commrec; -static constexpr gmx_wallcycle* nullWallcycle = nullptr; #ifndef DEBUG_WCYCLE /*! \brief Enables consistency checking for the counters. @@ -59,95 +71,98 @@ static constexpr gmx_wallcycle* nullWallcycle = nullptr; # define DEBUG_WCYCLE 0 #endif -enum +enum class WallCycleCounter : int { - ewcRUN, - ewcSTEP, - ewcPPDURINGPME, - ewcDOMDEC, - ewcDDCOMMLOAD, - ewcDDCOMMBOUND, - ewcVSITECONSTR, - ewcPP_PMESENDX, - ewcNS, - ewcLAUNCH_GPU, - ewcMOVEX, - ewcFORCE, - ewcMOVEF, - ewcPMEMESH, - ewcPME_REDISTXF, - ewcPME_SPREAD, - ewcPME_GATHER, - ewcPME_FFT, - ewcPME_FFTCOMM, - ewcLJPME, - ewcPME_SOLVE, - ewcPMEWAITCOMM, - ewcPP_PMEWAITRECVF, - ewcWAIT_GPU_PME_SPREAD, - ewcPME_FFT_MIXED_MODE, - ewcPME_SOLVE_MIXED_MODE, - ewcWAIT_GPU_PME_GATHER, - ewcWAIT_GPU_BONDED, - ewcPME_GPU_F_REDUCTION, - ewcWAIT_GPU_NB_NL, - ewcWAIT_GPU_NB_L, - ewcWAIT_GPU_STATE_PROPAGATOR_DATA, - ewcNB_XF_BUF_OPS, - ewcVSITESPREAD, - ewcPULLPOT, - ewcAWH, - ewcTRAJ, - ewcUPDATE, - ewcCONSTR, - ewcMoveE, - ewcROT, - ewcROTadd, - ewcSWAP, - ewcIMD, - ewcTEST, - ewcNR + Run, + Step, + PpDuringPme, + Domdec, + DDCommLoad, + DDCommBound, + VsiteConstr, + PpPmeSendX, + NS, + LaunchGpu, + MoveX, + Force, + MoveF, + PmeMesh, + PmeRedistXF, + PmeSpread, + PmeGather, + PmeFft, + PmeFftComm, + LJPme, + PmeSolve, + PmeWaitComm, + PpPmeWaitRecvF, + WaitGpuPmeSpread, + PmeFftMixedMode, + PmeSolveMixedMode, + WaitGpuPmeGather, + WaitGpuBonded, + PmeGpuFReduction, + WaitGpuNbNL, + WaitGpuNbL, + WaitGpuStatePropagatorData, + NbXFBufOps, + VsiteSpread, + PullPot, + Awh, + Traj, + Update, + Constr, + MoveE, + Rot, + RotAdd, + Swap, + Imd, + Test, + Count }; -enum +enum class WallCycleSubCounter : int { - ewcsDD_REDIST, - ewcsDD_GRID, - ewcsDD_SETUPCOMM, - ewcsDD_MAKETOP, - ewcsDD_MAKECONSTR, - ewcsDD_TOPOTHER, - ewcsDD_GPU, - ewcsNBS_GRID_LOCAL, - ewcsNBS_GRID_NONLOCAL, - ewcsNBS_SEARCH_LOCAL, - ewcsNBS_SEARCH_NONLOCAL, - ewcsLISTED, - ewcsLISTED_FEP, - ewcsRESTRAINTS, - ewcsLISTED_BUF_OPS, - ewcsNONBONDED_PRUNING, - ewcsNONBONDED_KERNEL, - ewcsNONBONDED_CLEAR, - ewcsNONBONDED_FEP, - ewcsLAUNCH_GPU_NONBONDED, - ewcsLAUNCH_GPU_BONDED, - ewcsLAUNCH_GPU_PME, - ewcsLAUNCH_STATE_PROPAGATOR_DATA, - ewcsEWALD_CORRECTION, - ewcsNB_X_BUF_OPS, - ewcsNB_F_BUF_OPS, - ewcsCLEAR_FORCE_BUFFER, - ewcsLAUNCH_GPU_NB_X_BUF_OPS, - ewcsLAUNCH_GPU_NB_F_BUF_OPS, - ewcsLAUNCH_GPU_MOVEX, - ewcsLAUNCH_GPU_MOVEF, - ewcsLAUNCH_GPU_UPDATE_CONSTRAIN, - ewcsTEST, - ewcsNR + DDRedist, + DDGrid, + DDSetupComm, + DDMakeTop, + DDMakeConstr, + DDTopOther, + DDGpu, + NBSGridLocal, + NBSGridNonLocal, + NBSSearchLocal, + NBSSearchNonLocal, + Listed, + ListedFep, + Restraints, + ListedBufOps, + NonbondedPruning, + NonbondedKernel, + NonbondedClear, + NonbondedFep, + LaunchGpuNonBonded, + LaunchGpuBonded, + LaunchGpuPme, + LaunchStatePropagatorData, + EwaldCorrection, + NBXBufOps, + NBFBufOps, + ClearForceBuffer, + LaunchGpuNBXBufOps, + LaunchGpuNBFBufOps, + LaunchGpuMoveX, + LaunchGpuMoveF, + LaunchGpuUpdateConstrain, + Test, + Count }; -static constexpr const bool sc_useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS; +static constexpr int sc_numWallCycleCounters = static_cast(WallCycleCounter::Count); +static constexpr int sc_numWallCycleSubCounters = static_cast(WallCycleSubCounter::Count); +static constexpr int sc_numWallCycleCountersSquared = sc_numWallCycleCounters * sc_numWallCycleCounters; +static constexpr bool sc_useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS; struct wallcc_t { @@ -163,57 +178,53 @@ static constexpr int c_MaxWallCycleDepth = 6; struct gmx_wallcycle { - wallcc_t* wcc; + gmx::EnumerationArray wcc; /* did we detect one or more invalid cycle counts */ bool haveInvalidCount; /* variables for testing/debugging */ - bool wc_barrier; - wallcc_t* wcc_all; - int wc_depth; + bool wc_barrier; + std::vector wcc_all; + int wc_depth; #if DEBUG_WCYCLE - int* counterlist; - int count_depth; - bool isMasterRank; + std::array counterlist; + int count_depth; + bool isMasterRank; #endif - int ewc_prev; - gmx_cycles_t cycle_prev; - int64_t reset_counters; - const t_commrec* cr; - wallcc_t* wcsc; + WallCycleCounter ewc_prev; + gmx_cycles_t cycle_prev; + int64_t reset_counters; + const t_commrec* cr; + gmx::EnumerationArray wcsc; }; -//! Returns whether cycle counting is supported. +//! Returns if cycle counting is supported bool wallcycle_have_counter(); -/*! \brief - * Returns a wallcycle datastructure. - * - * If cycle counting is not supported, returns nullptr instead. - */ -gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr); - -//! Cleans up wallcycle structure. -void wallcycle_destroy(gmx_wallcycle_t wc); +//! Returns the wall cycle structure. +std::unique_ptr wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr); //! Adds custom barrier for wallcycle counting. void wallcycleBarrier(gmx_wallcycle* wc); -inline void wallcycle_all_start(gmx_wallcycle* wc, int ewc, gmx_cycles_t cycle) +void wallcycle_sub_get(gmx_wallcycle* wc, WallCycleSubCounter ewcs, int* n, double* c); +/* Returns the cumulative count and sub cycle count for ewcs */ + +inline void wallcycle_all_start(gmx_wallcycle* wc, WallCycleCounter ewc, gmx_cycles_t cycle) { wc->ewc_prev = ewc; wc->cycle_prev = cycle; } -inline void wallcycle_all_stop(gmx_wallcycle* wc, int ewc, gmx_cycles_t cycle) +inline void wallcycle_all_stop(gmx_wallcycle* wc, WallCycleCounter ewc, gmx_cycles_t cycle) { - const int prev = wc->ewc_prev; - const int current = ewc; - wc->wcc_all[prev * ewcNR + current].n += 1; - wc->wcc_all[prev * ewcNR + current].c += cycle - wc->cycle_prev; + const int prev = static_cast(wc->ewc_prev); + const int current = static_cast(ewc); + wc->wcc_all[prev * sc_numWallCycleCounters + current].n += 1; + wc->wcc_all[prev * sc_numWallCycleCounters + current].c += cycle - wc->cycle_prev; } -//! Starts the cycle counter for \c ewc (and increases the call count). -inline void wallcycle_start(gmx_wallcycle_t wc, int ewc) +//! Starts the cycle counter (and increases the call count) +inline void wallcycle_start(gmx_wallcycle* wc, WallCycleCounter ewc) { if (wc == nullptr) { @@ -227,10 +238,10 @@ inline void wallcycle_start(gmx_wallcycle_t wc, int ewc) #endif gmx_cycles_t cycle = gmx_cycles_read(); wc->wcc[ewc].start = cycle; - if (wc->wcc_all) + if (!wc->wcc_all.empty()) { wc->wc_depth++; - if (ewc == ewcRUN) + if (ewc == WallCycleCounter::Run) { wallcycle_all_start(wc, ewc, cycle); } @@ -241,8 +252,8 @@ inline void wallcycle_start(gmx_wallcycle_t wc, int ewc) } } -//! Starts the cycle counter for \c ewc without increasing the call count. -inline void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc) +//! Starts the cycle counter without increasing the call count +inline void wallcycle_start_nocount(gmx_wallcycle* wc, WallCycleCounter ewc) { if (wc == nullptr) { @@ -251,8 +262,8 @@ inline void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc) wc->wcc[ewc].n++; } -//! Stop the cycle count for \c ewc, returns the last cycle count. -inline double wallcycle_stop(gmx_wallcycle_t wc, int ewc) +//! Stop the cycle count for ewc , returns the last cycle count +inline double wallcycle_stop(gmx_wallcycle* wc, WallCycleCounter ewc) { gmx_cycles_t cycle, last; @@ -287,10 +298,10 @@ inline double wallcycle_stop(gmx_wallcycle_t wc, int ewc) } wc->wcc[ewc].c += last; wc->wcc[ewc].n++; - if (wc->wcc_all) + if (!wc->wcc_all.empty()) { wc->wc_depth--; - if (ewc == ewcRUN) + if (ewc == WallCycleCounter::Run) { wallcycle_all_stop(wc, ewc, cycle); } @@ -303,8 +314,8 @@ inline double wallcycle_stop(gmx_wallcycle_t wc, int ewc) return last; } -//! Only increment call count for \c ewc by one. -inline void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc) +//! Only increment call count for ewc by one +inline void wallcycle_increment_event_count(gmx_wallcycle* wc, WallCycleCounter ewc) { if (wc == nullptr) { @@ -313,26 +324,23 @@ inline void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc) wc->wcc[ewc].n++; } -//! Returns the cumulative count and cycle count for \c ewc. -void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c); - -//! Returns the cumulative count and sub cycle count for \c ewcs. -void wallcycle_sub_get(gmx_wallcycle_t wc, int ewcs, int* n, double* c); +//! Returns the cumulative count and cycle count for ewc +void wallcycle_get(gmx_wallcycle* wc, WallCycleCounter ewc, int* n, double* c); -//! Resets all cycle counters to zero. -void wallcycle_reset_all(gmx_wallcycle_t wc); +//! Resets all cycle counters to zero +void wallcycle_reset_all(gmx_wallcycle* wc); -//! Scale the cycle counts to reflect how many threads run for that number of cycles. -void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme); +//! Scale the cycle counts to reflect how many threads run for that number of cycles +void wallcycle_scale_by_num_threads(gmx_wallcycle* wc, bool isPmeRank, int nthreads_pp, int nthreads_pme); -//! Return reset_counters. -int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc); +//! Return reset_counters from wc struct +int64_t wcycle_get_reset_counters(gmx_wallcycle* wc); -//! Set reset_counters. -void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters); +//! Set reset_counters +void wcycle_set_reset_counters(gmx_wallcycle* wc, int64_t reset_counters); -//! Set the start sub cycle count for \c ewcs. -inline void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs) +//! Set the start sub cycle count for ewcs +inline void wallcycle_sub_start(gmx_wallcycle* wc, WallCycleSubCounter ewcs) { if (sc_useCycleSubcounters && wc != nullptr) { @@ -340,8 +348,8 @@ inline void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs) } } -//! Set the start sub cycle count for \c ewcs without increasing the call count. -inline void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs) +//! Set the start sub cycle count for ewcs without increasing the call count +inline void wallcycle_sub_start_nocount(gmx_wallcycle* wc, WallCycleSubCounter ewcs) { if (sc_useCycleSubcounters && wc != nullptr) { @@ -349,8 +357,8 @@ inline void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs) } } -//! Stop the sub cycle count for \c ewcs. -inline void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs) +//! Stop the sub cycle count for ewcs +inline void wallcycle_sub_stop(gmx_wallcycle* wc, WallCycleSubCounter ewcs) { if (sc_useCycleSubcounters && wc != nullptr) { diff --git a/src/gromacs/timing/wallcyclereporting.h b/src/gromacs/timing/wallcyclereporting.h index 1bf3096fea..950b5b8af6 100644 --- a/src/gromacs/timing/wallcyclereporting.h +++ b/src/gromacs/timing/wallcyclereporting.h @@ -4,7 +4,7 @@ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2008, The GROMACS development team. * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team. - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -45,6 +45,7 @@ #include +#include "gromacs/timing/wallcycle.h" #include "gromacs/utility/basedefinitions.h" struct t_commrec; @@ -54,14 +55,13 @@ namespace gmx class MDLogger; } -typedef struct gmx_wallcycle* gmx_wallcycle_t; struct gmx_wallclock_gpu_nbnxn_t; struct gmx_wallclock_gpu_pme_t; -typedef std::array WallcycleCounts; +using WallcycleCounts = std::array; /* Convenience typedef */ -WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc); +WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle* wc); /* Return a vector of the sum of cycle counts over the nodes in cr->mpi_comm_mysim. */ @@ -72,7 +72,7 @@ void wallcycle_print(FILE* fplog, int nth_pp, int nth_pme, double realtime, - gmx_wallcycle_t wc, + gmx_wallcycle* wc, const WallcycleCounts& cyc_sum, const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t, const gmx_wallclock_gpu_pme_t* gpu_pme_t); -- 2.22.0