// Put everything together
auto nbv = std::make_unique<nonbonded_verlet_t>(
- std::move(pairlistSets), std::move(pairSearch), std::move(atomData), kernelSetup, nullptr, nullWallcycle);
+ std::move(pairlistSets), std::move(pairSearch), std::move(atomData), kernelSetup, nullptr, nullptr);
gmxForceCalculator_->nbv_ = std::move(nbv);
}
GMX_ASSERT(forceWithVirial, "Need a valid ForceWithVirial object");
}
- wallcycle_start(wallcycle, ewcAWH);
+ wallcycle_start(wallcycle, WallCycleCounter::Awh);
t_pbc pbc;
set_pbc(&pbc, pbcType, box);
}
}
- wallcycle_stop(wallcycle, ewcAWH);
+ wallcycle_stop(wallcycle, WallCycleCounter::Awh);
return MASTER(commRecord_) ? static_cast<real>(awhPotential) : 0;
}
gmx_bool bUniform,
gmx_bool bDoDLB,
int64_t step,
- gmx_wallcycle_t wcycle)
+ gmx_wallcycle* wcycle)
{
gmx_domdec_comm_t* comm = dd->comm;
if (bDoDLB)
{
- wallcycle_start(wcycle, ewcDDCOMMBOUND);
+ wallcycle_start(wcycle, WallCycleCounter::DDCommBound);
set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
- wallcycle_stop(wcycle, ewcDDCOMMBOUND);
+ wallcycle_stop(wcycle, WallCycleCounter::DDCommBound);
}
else if (bDynamicBox)
{
gmx_bool bUniform,
gmx_bool bDoDLB,
int64_t step,
- gmx_wallcycle_t wcycle)
+ gmx_wallcycle* wcycle)
{
gmx_domdec_comm_t* comm = dd->comm;
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
gmx_bool bUniform,
gmx_bool bDoDLB,
int64_t step,
- gmx_wallcycle_t wcycle);
+ gmx_wallcycle* wcycle);
#endif
void dd_move_x(gmx_domdec_t* dd, const matrix box, gmx::ArrayRef<gmx::RVec> x, gmx_wallcycle* wcycle)
{
- wallcycle_start(wcycle, ewcMOVEX);
+ wallcycle_start(wcycle, WallCycleCounter::MoveX);
rvec shift = { 0, 0, 0 };
nzone += nzone;
}
- wallcycle_stop(wcycle, ewcMOVEX);
+ wallcycle_stop(wcycle, WallCycleCounter::MoveX);
}
void dd_move_f(gmx_domdec_t* dd, gmx::ForceWithShiftForces* forceWithShiftForces, gmx_wallcycle* wcycle)
{
- wallcycle_start(wcycle, ewcMOVEF);
+ wallcycle_start(wcycle, WallCycleCounter::MoveF);
gmx::ArrayRef<gmx::RVec> f = forceWithShiftForces->force();
gmx::ArrayRef<gmx::RVec> fshift = forceWithShiftForces->shiftForces();
}
nzone /= 2;
}
- wallcycle_stop(wcycle, ewcMOVEF);
+ wallcycle_stop(wcycle, WallCycleCounter::MoveF);
}
/* Convenience function for extracting a real buffer from an rvec buffer
void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_forcesBuffer)
{
- wallcycle_start(wcycle_, ewcDOMDEC);
- wallcycle_sub_start(wcycle_, ewcsDD_GPU);
+ wallcycle_start(wcycle_, WallCycleCounter::Domdec);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::DDGpu);
d_x_ = d_coordinatesBuffer;
d_f_ = d_forcesBuffer;
}
#endif
- wallcycle_sub_stop(wcycle_, ewcsDD_GPU);
- wallcycle_stop(wcycle_, ewcDOMDEC);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::DDGpu);
+ wallcycle_stop(wcycle_, WallCycleCounter::Domdec);
return;
}
GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
- wallcycle_start(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_start(wcycle_, WallCycleCounter::LaunchGpu);
if (pulse_ == 0)
{
// ensure stream waits until coordinate data is available on device
coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
}
- wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEX);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuMoveX);
// launch kernel to pack send buffer
KernelLaunchConfig config;
kernelFn, config, nonLocalStream_, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs);
}
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEX);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuMoveX);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
// Consider time spent in communicateHaloData as Comm.X counter
// ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync
- wallcycle_start(wcycle_, ewcMOVEX);
+ wallcycle_start(wcycle_, WallCycleCounter::MoveX);
// wait for remote co-ordinates is implicit with process-MPI as non-local stream is synchronized before MPI calls
// and MPI_Waitall call makes sure both neighboring ranks' non-local stream is synchronized before data transfer is initiated
float3* recvPtr = GMX_THREAD_MPI ? remoteXPtr_ : &d_x_[atomOffset_];
communicateHaloData(d_sendBuf_, xSendSize_, sendRankX_, recvPtr, xRecvSize_, recvRankX_);
- wallcycle_stop(wcycle_, ewcMOVEX);
+ wallcycle_stop(wcycle_, WallCycleCounter::MoveX);
return;
}
{
// Consider time spent in communicateHaloData as Comm.F counter
// ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync
- wallcycle_start(wcycle_, ewcMOVEF);
+ wallcycle_start(wcycle_, WallCycleCounter::MoveF);
float3* recvPtr = GMX_THREAD_MPI ? remoteFPtr_ : d_recvBuf_;
// Communicate halo data (in non-local stream)
communicateHaloData(&(d_f_[atomOffset_]), fSendSize_, sendRankF_, recvPtr, fRecvSize_, recvRankF_);
- wallcycle_stop(wcycle_, ewcMOVEF);
+ wallcycle_stop(wcycle_, WallCycleCounter::MoveF);
- wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
- wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEF);
+ wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuMoveF);
float3* d_f = d_f_;
// If this is the last pulse and index (noting the force halo
fReadyOnDevice_.markEvent(nonLocalStream_);
}
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEF);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuMoveF);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
void GpuHaloExchange::Impl::communicateHaloData(float3* sendPtr,
}
//! Compute and communicate to determine the load distribution across PP ranks.
-static void get_load_distribution(gmx_domdec_t* dd, gmx_wallcycle_t wcycle)
+static void get_load_distribution(gmx_domdec_t* dd, gmx_wallcycle* wcycle)
{
gmx_domdec_comm_t* comm;
domdec_load_t* load;
fprintf(debug, "get_load_distribution start\n");
}
- wallcycle_start(wcycle, ewcDDCOMMLOAD);
+ wallcycle_start(wcycle, WallCycleCounter::DDCommLoad);
comm = dd->comm;
}
}
- wallcycle_stop(wcycle, ewcDDCOMMLOAD);
+ wallcycle_stop(wcycle, WallCycleCounter::DDCommLoad);
if (debug)
{
int ncgindex_set;
char sbuf[22];
- wallcycle_start(wcycle, ewcDOMDEC);
+ wallcycle_start(wcycle, WallCycleCounter::Domdec);
gmx_domdec_t* dd = cr->dd;
gmx_domdec_comm_t* comm = dd->comm;
int ncg_moved = 0;
if (bRedist)
{
- wallcycle_sub_start(wcycle, ewcsDD_REDIST);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::DDRedist);
ncgindex_set = dd->ncg_home;
dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir, state_local, fr, nrnb, &ncg_moved);
state_local->x);
}
- wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDRedist);
}
RVec cell_ns_x0, cell_ns_x1;
if (bSortCG)
{
- wallcycle_sub_start(wcycle, ewcsDD_GRID);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::DDGrid);
/* Sort the state on charge group position.
* This enables exact restarts from this step.
dd->ga2la->clear();
ncgindex_set = 0;
- wallcycle_sub_stop(wcycle, ewcsDD_GRID);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDGrid);
}
else
{
comm->updateGroupsCog->clear();
}
- wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::DDSetupComm);
/* Set the induces for the home atoms */
set_zones_ncg_home(dd);
/* When bSortCG=true, we have already set the size for zone 0 */
set_zones_size(dd, state_local->box, &ddbox, bSortCG ? 1 : 0, comm->zones.n, 0);
- wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDSetupComm);
/*
write_dd_pdb("dd_home",step,"dump",top_global,cr,
-1,state_local->x.rvec_array(),state_local->box);
*/
- wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::DDMakeTop);
/* Extract a local topology from the global topology */
IVec numPulses;
top_global,
top_local);
- wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDMakeTop);
- wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::DDMakeConstr);
/* Set up the special atom communication */
int n = comm->atomRanges.end(DDAtomRanges::Type::Zones);
comm->atomRanges.setEnd(range, n);
}
- wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDMakeConstr);
- wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::DDTopOther);
/* Make space for the extra coordinates for virtual site
* or constraint communication.
*/
dd_move_x_vsites(*dd, state_local->box, state_local->x.rvec_array());
- wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDTopOther);
if (comm->ddSettings.nstDDDump > 0 && step % comm->ddSettings.nstDDDump == 0)
{
- dd_move_x(dd, state_local->box, state_local->x, nullWallcycle);
+ dd_move_x(dd, state_local->box, state_local->x, nullptr);
write_dd_pdb("dd_dump",
step,
"dump",
check_index_consistency(dd, top_global.natoms, "after partitioning");
}
- wallcycle_stop(wcycle, ewcDOMDEC);
+ wallcycle_stop(wcycle, WallCycleCounter::Domdec);
}
} // namespace gmx
}
else
{
- wallcycle_start(wcycle, ewcPME_REDISTXF);
+ wallcycle_start(wcycle, WallCycleCounter::PmeRedistXF);
do_redist_pos_coeffs(pme, cr, bFirst, coordinates, coefficient);
- wallcycle_stop(wcycle, ewcPME_REDISTXF);
+ wallcycle_stop(wcycle, WallCycleCounter::PmeRedistXF);
}
if (debug)
fprintf(debug, "Rank= %6d, pme local particles=%6d\n", cr->nodeid, atc.numAtoms());
}
- wallcycle_start(wcycle, ewcPME_SPREAD);
+ wallcycle_start(wcycle, WallCycleCounter::PmeSpread);
/* Spread the coefficients on a grid */
spread_on_grid(pme, &atc, pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
}
- wallcycle_stop(wcycle, ewcPME_SPREAD);
+ wallcycle_stop(wcycle, WallCycleCounter::PmeSpread);
/* TODO If the OpenMP and single-threaded implementations
converge, then spread_on_grid() and
/* do 3d-fft */
if (thread == 0)
{
- wallcycle_start(wcycle, ewcPME_FFT);
+ wallcycle_start(wcycle, WallCycleCounter::PmeFft);
}
gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX, thread, wcycle);
if (thread == 0)
{
- wallcycle_stop(wcycle, ewcPME_FFT);
+ wallcycle_stop(wcycle, WallCycleCounter::PmeFft);
}
/* solve in k-space for our local cells */
if (thread == 0)
{
- wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
+ wallcycle_start(
+ wcycle,
+ (grid_index < DO_Q ? WallCycleCounter::PmeSolve : WallCycleCounter::LJPme));
}
if (grid_index < DO_Q)
{
if (thread == 0)
{
- wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
+ wallcycle_stop(
+ wcycle,
+ (grid_index < DO_Q ? WallCycleCounter::PmeSolve : WallCycleCounter::LJPme));
inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
}
/* do 3d-invfft */
if (thread == 0)
{
- wallcycle_start(wcycle, ewcPME_FFT);
+ wallcycle_start(wcycle, WallCycleCounter::PmeFft);
}
gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL, thread, wcycle);
if (thread == 0)
{
- wallcycle_stop(wcycle, ewcPME_FFT);
+ wallcycle_stop(wcycle, WallCycleCounter::PmeFft);
if (pme->nodeid == 0)
/* Note: this wallcycle region is closed below
outside an OpenMP region, so take care if
refactoring code here. */
- wallcycle_start(wcycle, ewcPME_GATHER);
+ wallcycle_start(wcycle, WallCycleCounter::PmeGather);
}
copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
inc_nrnb(nrnb, eNR_GATHERFBSP, pme->pme_order * pme->pme_order * pme->pme_order * atc.numAtoms());
/* Note: this wallcycle region is opened above inside an OpenMP
region, so take care if refactoring code here. */
- wallcycle_stop(wcycle, ewcPME_GATHER);
+ wallcycle_stop(wcycle, WallCycleCounter::PmeGather);
}
if (computeEnergyAndVirial)
break;
default: gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
}
- wallcycle_start(wcycle, ewcPME_REDISTXF);
+ wallcycle_start(wcycle, WallCycleCounter::PmeRedistXF);
do_redist_pos_coeffs(pme, cr, bFirst, coordinates, RedistC6);
pme->lb_buf1.resize(atc.numAtoms());
local_sigma[i] = atc.coefficient[i];
}
- wallcycle_stop(wcycle, ewcPME_REDISTXF);
+ wallcycle_stop(wcycle, WallCycleCounter::PmeRedistXF);
}
atc.coefficient = coefficientBuffer;
calc_initial_lb_coeffs(coefficientBuffer, local_c6, local_sigma);
calc_next_lb_coeffs(coefficientBuffer, local_sigma);
grid = pmegrid->grid.grid;
- wallcycle_start(wcycle, ewcPME_SPREAD);
+ wallcycle_start(wcycle, WallCycleCounter::PmeSpread);
/* Spread the c6 on a grid */
spread_on_grid(pme, &atc, pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
}
copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
}
- wallcycle_stop(wcycle, ewcPME_SPREAD);
+ wallcycle_stop(wcycle, WallCycleCounter::PmeSpread);
/*Here we start a large thread parallel region*/
#pragma omp parallel num_threads(pme->nthread) private(thread)
/* do 3d-fft */
if (thread == 0)
{
- wallcycle_start(wcycle, ewcPME_FFT);
+ wallcycle_start(wcycle, WallCycleCounter::PmeFft);
}
gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX, thread, wcycle);
if (thread == 0)
{
- wallcycle_stop(wcycle, ewcPME_FFT);
+ wallcycle_stop(wcycle, WallCycleCounter::PmeFft);
}
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
thread = gmx_omp_get_thread_num();
if (thread == 0)
{
- wallcycle_start(wcycle, ewcLJPME);
+ wallcycle_start(wcycle, WallCycleCounter::LJPme);
}
loop_count =
thread);
if (thread == 0)
{
- wallcycle_stop(wcycle, ewcLJPME);
+ wallcycle_stop(wcycle, WallCycleCounter::LJPme);
inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
}
}
/* do 3d-invfft */
if (thread == 0)
{
- wallcycle_start(wcycle, ewcPME_FFT);
+ wallcycle_start(wcycle, WallCycleCounter::PmeFft);
}
gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL, thread, wcycle);
if (thread == 0)
{
- wallcycle_stop(wcycle, ewcPME_FFT);
+ wallcycle_stop(wcycle, WallCycleCounter::PmeFft);
if (pme->nodeid == 0)
npme = static_cast<int>(ntot * std::log(ntot) / std::log(2.0));
inc_nrnb(nrnb, eNR_FFT, 2 * npme);
}
- wallcycle_start(wcycle, ewcPME_GATHER);
+ wallcycle_start(wcycle, WallCycleCounter::PmeGather);
}
copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
eNR_GATHERFBSP,
pme->pme_order * pme->pme_order * pme->pme_order * pme->atc[0].numAtoms());
}
- wallcycle_stop(wcycle, ewcPME_GATHER);
+ wallcycle_stop(wcycle, WallCycleCounter::PmeGather);
bFirst = FALSE;
} /* for (grid_index = 8; grid_index >= 2; --grid_index) */
if (stepWork.computeForces && pme->nnodes > 1)
{
- wallcycle_start(wcycle, ewcPME_REDISTXF);
+ wallcycle_start(wcycle, WallCycleCounter::PmeRedistXF);
for (d = 0; d < pme->ndecompdim; d++)
{
gmx::ArrayRef<gmx::RVec> forcesRef;
}
}
- wallcycle_stop(wcycle, ewcPME_REDISTXF);
+ wallcycle_stop(wcycle, WallCycleCounter::PmeRedistXF);
}
if (computeEnergyAndVirial)
void inline parallel_3dfft_execute_gpu_wrapper(gmx_pme_t* pme,
const int gridIndex,
enum gmx_fft_direction dir,
- gmx_wallcycle_t wcycle)
+ gmx_wallcycle* wcycle)
{
if (pme_gpu_settings(pme->gpu).performGPUFFT)
{
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
- wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
pme_gpu_3dfft(pme->gpu, dir, gridIndex);
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
}
else
{
- wallcycle_start(wcycle, ewcPME_FFT_MIXED_MODE);
+ wallcycle_start(wcycle, WallCycleCounter::PmeFftMixedMode);
#pragma omp parallel for num_threads(pme->nthread) schedule(static)
for (int thread = 0; thread < pme->nthread; thread++)
{
gmx_parallel_3dfft_execute(pme->pfft_setup[gridIndex], dir, thread, wcycle);
}
- wallcycle_stop(wcycle, ewcPME_FFT_MIXED_MODE);
+ wallcycle_stop(wcycle, WallCycleCounter::PmeFftMixedMode);
}
}
if (stepWork.haveDynamicBox || shouldUpdateBox) // || is to make the first computation always update
{
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
- wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
pme_gpu_update_input_box(pmeGpu, box);
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
if (!pme_gpu_settings(pmeGpu).performGPUSolve)
{
/* Spread the coefficients on a grid */
const bool computeSplines = true;
const bool spreadCharges = true;
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
- wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
pme_gpu_spread(pmeGpu, xReadyOnDevice, fftgrids, computeSplines, spreadCharges, lambdaQ);
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
}
void pme_gpu_launch_complex_transforms(gmx_pme_t* pme, gmx_wallcycle* wcycle, const gmx::StepWorkload& stepWork)
const bool computeEnergyAndVirial = stepWork.computeEnergy || stepWork.computeVirial;
if (!settings.performGPUFFT)
{
- wallcycle_start(wcycle, ewcWAIT_GPU_PME_SPREAD);
+ wallcycle_start(wcycle, WallCycleCounter::WaitGpuPmeSpread);
pme_gpu_sync_spread_grid(pme->gpu);
- wallcycle_stop(wcycle, ewcWAIT_GPU_PME_SPREAD);
+ wallcycle_stop(wcycle, WallCycleCounter::WaitGpuPmeSpread);
}
try
{
const auto gridOrdering =
settings.useDecomposition ? GridOrdering::YZX : GridOrdering::XYZ;
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
- wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
pme_gpu_solve(pmeGpu, gridIndex, cfftgrid, gridOrdering, computeEnergyAndVirial);
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
}
else
{
- wallcycle_start(wcycle, ewcPME_SOLVE_MIXED_MODE);
+ wallcycle_start(wcycle, WallCycleCounter::PmeSolveMixedMode);
#pragma omp parallel for num_threads(pme->nthread) schedule(static)
for (int thread = 0; thread < pme->nthread; thread++)
{
solve_pme_yzx(pme, cfftgrid, pme->boxVolume, computeEnergyAndVirial, pme->nthread, thread);
}
- wallcycle_stop(wcycle, ewcPME_SOLVE_MIXED_MODE);
+ wallcycle_stop(wcycle, WallCycleCounter::PmeSolveMixedMode);
}
parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_COMPLEX_TO_REAL, wcycle);
return;
}
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
- wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
float** fftgrids = pme->fftgrid;
pme_gpu_gather(pme->gpu, fftgrids, lambdaQ);
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
}
//! Accumulate the \c forcesToAdd to \c f, using the available threads.
gmx::ForceWithVirial* forceWithVirial,
gmx_enerdata_t* enerd)
{
- wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
+ wallcycle_start(wcycle, WallCycleCounter::PmeGpuFReduction);
GMX_ASSERT(forceWithVirial, "Invalid force pointer");
if (computeEnergyAndVirial)
{
sum_forces(forceWithVirial->force_, output.forces_);
}
- wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
+ wallcycle_stop(wcycle, WallCycleCounter::PmeGpuFReduction);
}
bool pme_gpu_try_finish_task(gmx_pme_t* pme,
// TODO: implement c_streamQuerySupported with an additional GpuEventSynchronizer per stream (#2521)
if ((completionKind == GpuTaskCompletion::Check) && c_streamQuerySupported)
{
- wallcycle_start_nocount(wcycle, ewcWAIT_GPU_PME_GATHER);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::WaitGpuPmeGather);
// Query the PME stream for completion of all tasks enqueued and
// if we're not done, stop the timer before early return.
const bool pmeGpuDone = pme_gpu_stream_query(pme->gpu);
- wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
+ wallcycle_stop(wcycle, WallCycleCounter::WaitGpuPmeGather);
if (!pmeGpuDone)
{
needToSynchronize = false;
}
- wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
+ wallcycle_start(wcycle, WallCycleCounter::WaitGpuPmeGather);
// If the above check passed, then there is no need to make an
// explicit synchronization call.
if (needToSynchronize)
const bool computeEnergyAndVirial = stepWork.computeEnergy || stepWork.computeVirial;
PmeOutput output = pme_gpu_getOutput(
*pme, computeEnergyAndVirial, pme->gpu->common->ngrids > 1 ? lambdaQ : 1.0);
- wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
+ wallcycle_stop(wcycle, WallCycleCounter::WaitGpuPmeGather);
GMX_ASSERT(pme->gpu->settings.useGpuForceReduction == !output.haveForceOutput_,
"When forces are reduced on the CPU, there needs to be force output");
{
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
- wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
+ wallcycle_start(wcycle, WallCycleCounter::WaitGpuPmeGather);
// Synchronize the whole PME stream at once, including D2H result transfers
// if there are outputs we need to wait for at this step; we still call getOutputs
PmeOutput output = pme_gpu_getOutput(
*pme, computeEnergyAndVirial, pme->gpu->common->ngrids > 1 ? lambdaQ : 1.0);
- wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
+ wallcycle_stop(wcycle, WallCycleCounter::WaitGpuPmeGather);
return output;
}
{
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
- wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
pme_gpu_update_timings(pme->gpu);
pme_gpu_clear_grids(pme->gpu);
pme_gpu_clear_energy_virial(pme->gpu);
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
}
DeviceBuffer<gmx::RVec> pme_gpu_get_device_f(const gmx_pme_t* pme)
t_forcerec* fr,
const matrix box,
gmx::ArrayRef<const gmx::RVec> x,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
int64_t step,
int64_t step_rel,
gmx_bool* bPrinting,
n_prev = pme_lb->cycles_n;
cycles_prev = pme_lb->cycles_c;
- wallcycle_get(wcycle, ewcSTEP, &pme_lb->cycles_n, &pme_lb->cycles_c);
+ wallcycle_get(wcycle, WallCycleCounter::Step, &pme_lb->cycles_n, &pme_lb->cycles_c);
/* Before the first step we haven't done any steps yet.
* Also handle cases where ir.init_step % ir.nstlist != 0.
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
*
* Process the cycles measured over the last nstlist steps and then
* either continue balancing or check if we need to trigger balancing.
- * Should be called after the ewcSTEP cycle counter has been stopped.
+ * Should be called after the WallCycleCounter::Step cycle counter has been stopped.
* Returns if the load balancing is printing to fp_err.
*/
void pme_loadbal_do(pme_load_balancing_t* pme_lb,
t_forcerec* fr,
const matrix box,
gmx::ArrayRef<const gmx::RVec> x,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
int64_t step,
int64_t step_rel,
gmx_bool* bPrinting,
return pme_pp;
}
-static void reset_pmeonly_counters(gmx_wallcycle_t wcycle,
+static void reset_pmeonly_counters(gmx_wallcycle* wcycle,
gmx_walltime_accounting_t walltime_accounting,
t_nrnb* nrnb,
int64_t step,
bool useGpuForPme)
{
/* Reset all the counters related to performance over the run */
- wallcycle_stop(wcycle, ewcRUN);
+ wallcycle_stop(wcycle, WallCycleCounter::Run);
wallcycle_reset_all(wcycle);
*nrnb = { 0 };
- wallcycle_start(wcycle, ewcRUN);
+ wallcycle_start(wcycle, WallCycleCounter::Run);
walltime_accounting_reset_time(walltime_accounting, step);
if (useGpuForPme)
if (count == 0)
{
- wallcycle_start(wcycle, ewcRUN);
+ wallcycle_start(wcycle, WallCycleCounter::Run);
walltime_accounting_start_time(walltime_accounting);
}
- wallcycle_start(wcycle, ewcPMEMESH);
+ wallcycle_start(wcycle, WallCycleCounter::PmeMesh);
dvdlambda_q = 0;
dvdlambda_lj = 0;
output.forces_ = pme_pp->f;
}
- cycles = wallcycle_stop(wcycle, ewcPMEMESH);
+ cycles = wallcycle_stop(wcycle, WallCycleCounter::PmeMesh);
gmx_pme_send_force_vir_ener(pme_pp.get(), output, dvdlambda_q, dvdlambda_lj, cycles);
count++;
GpuEventSynchronizer* coordinatesReadyOnDeviceEvent,
gmx_wallcycle* wcycle)
{
- wallcycle_start(wcycle, ewcPP_PMESENDX);
+ wallcycle_start(wcycle, WallCycleCounter::PpPmeSendX);
unsigned int flags = PP_PME_COORD;
if (computeEnergyAndVirial)
sendCoordinatesFromGpu,
coordinatesReadyOnDeviceEvent);
- wallcycle_stop(wcycle, ewcPP_PMESENDX);
+ wallcycle_stop(wcycle, WallCycleCounter::PpPmeSendX);
}
void gmx_pme_send_finish(const t_commrec* cr)
time = MPI_Wtime();
}
#else
- wallcycle_start(times, ewcPME_FFTCOMM);
+ wallcycle_start(times, WallCycleCounter::PmeFftComm);
#endif
#ifdef FFT5D_MPI_TRANSPOSE
FFTW(execute)(mpip[s]);
time_mpi[s] = MPI_Wtime() - time;
}
#else
- wallcycle_stop(times, ewcPME_FFTCOMM);
+ wallcycle_stop(times, WallCycleCounter::PmeFftComm);
#endif
} /*master*/
} /* bPrallelDim */
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2009-2017, The GROMACS development team.
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
typedef struct fft5d_time_t* fft5d_time;
#else
# include "gromacs/timing/wallcycle.h"
-typedef gmx_wallcycle_t fft5d_time;
+typedef gmx_wallcycle* fft5d_time;
#endif
namespace gmx
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 1991-2005 David van der Spoel, Erik Lindahl, University of Groningen.
- * Copyright (c) 2013,2014,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
int gmx_parallel_3dfft_execute(gmx_parallel_3dfft_t pfft_setup,
enum gmx_fft_direction dir,
int thread,
- gmx_wallcycle_t wcycle)
+ gmx_wallcycle* wcycle)
{
if (((pfft_setup->p1->flags & FFT5D_REALCOMPLEX) == 0)
^ (dir == GMX_FFT_FORWARD || dir == GMX_FFT_BACKWARD))
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 1991-2005 David van der Spoel, Erik Lindahl, University of Groningen.
- * Copyright (c) 2013,2014,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2017,2018,2019,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
int gmx_parallel_3dfft_execute(gmx_parallel_3dfft_t pfft_setup,
enum gmx_fft_direction dir,
int thread,
- gmx_wallcycle_t wcycle);
+ gmx_wallcycle* wcycle);
/*! \brief Release all data in parallel fft setup
return false;
}
- wallcycle_start(wcycle, ewcIMD);
+ wallcycle_start(wcycle, WallCycleCounter::Imd);
/* read command from client and check if new incoming connection */
if (MASTER(cr))
}
}
- wallcycle_stop(wcycle, ewcIMD);
+ wallcycle_stop(wcycle, WallCycleCounter::Imd);
return imdstep;
}
return;
}
- wallcycle_start(impl_->wcycle, ewcIMD);
+ wallcycle_start(impl_->wcycle, WallCycleCounter::Imd);
/* Update time step for IMD and prepare IMD energy record if we have new energies. */
fillEnergyRecord(step, bHaveNewEnergies);
sendPositionsAndEnergies();
}
- wallcycle_stop(impl_->wcycle, ewcIMD);
+ wallcycle_stop(impl_->wcycle, WallCycleCounter::Imd);
}
void ImdSession::applyForces(gmx::ArrayRef<gmx::RVec> force)
return;
}
- wallcycle_start(impl_->wcycle, ewcIMD);
+ wallcycle_start(impl_->wcycle, WallCycleCounter::Imd);
for (int i = 0; i < impl_->nforces; i++)
{
rvec_inc(force[j], impl_->f[i]);
}
- wallcycle_stop(impl_->wcycle, ewcIMD);
+ wallcycle_stop(impl_->wcycle, WallCycleCounter::Imd);
}
ImdSession::ImdSession(const MDLogger& mdlog) : impl_(new Impl(mdlog)) {}
GMX_ASSERT(haveInteractions_,
"No GPU bonded interactions, so no energies will be computed, so transfer should "
"not be called");
- wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_BONDED);
+ wallcycle_sub_start_nocount(wcycle_, WallCycleSubCounter::LaunchGpuBonded);
// TODO add conditional on whether there has been any compute (and make sure host buffer doesn't contain garbage)
float* h_vTot = vTot_.data();
copyFromDeviceBuffer(h_vTot, &d_vTot_, 0, F_NRE, deviceStream_, GpuApiCallBehavior::Async, nullptr);
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuBonded);
}
void GpuBonded::Impl::waitAccumulateEnergyTerms(gmx_enerdata_t* enerd)
"No GPU bonded interactions, so no energies will be computed or transferred, so "
"accumulation should not occur");
- wallcycle_start(wcycle_, ewcWAIT_GPU_BONDED);
+ wallcycle_start(wcycle_, WallCycleCounter::WaitGpuBonded);
cudaError_t stat = cudaStreamSynchronize(deviceStream_.stream());
CU_RET_ERR(stat, "D2H transfer of bonded energies failed");
- wallcycle_stop(wcycle_, ewcWAIT_GPU_BONDED);
+ wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuBonded);
for (int fType : fTypesOnGpu)
{
void GpuBonded::Impl::clearEnergies()
{
- wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
- wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_BONDED);
+ wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start_nocount(wcycle_, WallCycleSubCounter::LaunchGpuBonded);
clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream_);
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuBonded);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
// ---- GpuBonded
GMX_ASSERT(haveInteractions_,
"Cannot launch bonded GPU kernels unless bonded GPU work was scheduled");
- wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
- wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_BONDED);
+ wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuBonded);
int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
"exec_kernel_gpu<calcVir, calcEner>",
kernelArgs);
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuBonded);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
void GpuBonded::launchKernel(const gmx::StepWorkload& stepWork)
{
gmx::ForceWithShiftForces& forceWithShiftForces = forceOutputs->forceWithShiftForces();
- wallcycle_sub_start(wcycle, ewcsLISTED);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::Listed);
/* The dummy array is to have a place to store the dhdl at other values
of lambda, which will be thrown away in the end */
gmx::EnumerationArray<FreeEnergyPerturbationCouplingType, real> dvdl = { 0 };
fcd,
stepWork,
global_atom_index);
- wallcycle_sub_stop(wcycle, ewcsLISTED);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::Listed);
- wallcycle_sub_start(wcycle, ewcsLISTED_BUF_OPS);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::ListedBufOps);
reduce_thread_output(&forceWithShiftForces, enerd->term.data(), &enerd->grpp, dvdl, bt, stepWork);
if (stepWork.computeDhdl)
enerd->dvdl_nonlin[i] += dvdl[i];
}
}
- wallcycle_sub_stop(wcycle, ewcsLISTED_BUF_OPS);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::ListedBufOps);
}
/* Copy the sum of violations for the distance restraints from fcd */
awkward to account to this subtimer properly in the present
code. We don't test / care much about performance with
restraints, anyway. */
- wallcycle_sub_start(wcycle, ewcsRESTRAINTS);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::Restraints);
if (!idef.il[F_POSRES].empty())
{
hist);
}
- wallcycle_sub_stop(wcycle, ewcsRESTRAINTS);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::Restraints);
}
calc_listed(wcycle, idef, threading_.get(), x, forceOutputs, fr, pbc, enerd, nrnb, lambda, md, fcdata, global_atom_index, stepWork);
}
if (idef.ilsort != ilsortNO_FE)
{
- wallcycle_sub_start(wcycle, ewcsLISTED_FEP);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::ListedFep);
if (idef.ilsort != ilsortFE_SORTED)
{
gmx_incons("The bonded interactions are not sorted for free energy");
std::fill(std::begin(dvdl), std::end(dvdl), 0.0);
enerd->foreignLambdaTerms.accumulate(i, enerd->foreign_term[F_EPOT], dvdlSum);
}
- wallcycle_sub_stop(wcycle, ewcsLISTED_FEP);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::ListedFep);
}
}
}
gmx::ArrayRef<const real> lambda,
const t_forcerec* fr)
{
- wallcycle_sub_start_nocount(wcycle, ewcsRESTRAINTS);
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::Restraints);
auto& foreignTerms = enerd->foreignLambdaTerms;
for (int i = 0; i < 1 + foreignTerms.numLambdas(); i++)
fr->posres_comB);
foreignTerms.accumulate(i, v, dvdl);
}
- wallcycle_sub_stop(wcycle, ewcsRESTRAINTS);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::Restraints);
}
/*! \brief Helper function that wraps calls to fbposres for
char buf[22];
int nth;
- wallcycle_start(wcycle, ewcCONSTR);
+ wallcycle_start(wcycle, WallCycleCounter::Constr);
if (econq == ConstraintVariable::ForceDispl && !EI_ENERGY_MINIMIZATION(ir.eI))
{
do_edsam(&ir, step, cr, xprime.unpaddedArrayRef(), v.unpaddedArrayRef(), box, ed);
}
}
- wallcycle_stop(wcycle, ewcCONSTR);
+ wallcycle_stop(wcycle, WallCycleCounter::Constr);
const bool haveVelocities = (!v.empty() || econq == ConstraintVariable::Velocities);
if (haveVelocities && !cFREEZE_.empty())
const t_inputrec& ir,
const t_commrec* cr,
t_nrnb* nrnb,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
const t_mdatoms* md,
gmx::ArrayRef<const RVec> coordinates,
gmx::ForceWithVirial* forceWithVirial,
/* Calculate the Ewald surface force and energy contributions, when necessary */
if (haveEwaldSurfaceTerm)
{
- wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::EwaldCorrection);
int nthreads = fr->nthread_ewc;
#pragma omp parallel for num_threads(nthreads) schedule(static)
{
reduceEwaldThreadOuput(nthreads, fr->ewc_t);
}
- wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::EwaldCorrection);
}
if (EEL_PME_EWALD(fr->ic->eeltype) && fr->n_tpi == 0)
*/
ddBalanceRegionHandler.closeAfterForceComputationCpu();
- wallcycle_start(wcycle, ewcPMEMESH);
+ wallcycle_start(wcycle, WallCycleCounter::PmeMesh);
status = gmx_pme_do(
fr->pmedata,
gmx::constArrayRefFromArray(coordinates.data(), md->homenr - fr->n_tpi),
&ewaldOutput.dvdl[FreeEnergyPerturbationCouplingType::Coul],
&ewaldOutput.dvdl[FreeEnergyPerturbationCouplingType::Vdw],
stepWork);
- wallcycle_stop(wcycle, ewcPMEMESH);
+ wallcycle_stop(wcycle, WallCycleCounter::PmeMesh);
if (status != 0)
{
gmx_fatal(FARGS, "Error %d in reciprocal PME routine", status);
completionMarker_ = completionMarker;
cellInfo_.cell = cell.data();
- wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
reallocateDeviceBuffer(
&cellInfo_.d_cell, numAtoms_, &cellInfo_.cellSize, &cellInfo_.cellSizeAlloc, deviceContext_);
copyToDeviceBuffer(&cellInfo_.d_cell,
deviceStream_,
GpuApiCallBehavior::Async,
nullptr);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
dependencyList_.clear();
};
void GpuForceReduction::Impl::execute()
{
- wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
- wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
+ wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuNBFBufOps);
if (numAtoms_ == 0)
{
completionMarker_->markEvent(deviceStream_);
}
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuNBFBufOps);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
GpuForceReduction::Impl::~Impl(){};
const t_mdatoms* mdatoms,
t_nrnb* nrnb,
t_vcm* vcm,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
gmx_enerdata_t* enerd,
tensor force_vir,
tensor shake_vir,
gmx::ArrayRef<real> signalBuffer = signalCoordinator->getCommunicationBuffer();
if (PAR(cr))
{
- wallcycle_start(wcycle, ewcMoveE);
+ wallcycle_start(wcycle, WallCycleCounter::MoveE);
global_stat(*gstat,
cr,
enerd,
signalBuffer,
*bSumEkinhOld,
flags);
- wallcycle_stop(wcycle, ewcMoveE);
+ wallcycle_stop(wcycle, WallCycleCounter::MoveE);
}
signalCoordinator->finalizeSignals();
*bSumEkinhOld = FALSE;
const t_mdatoms* mdatoms,
t_nrnb* nrnb,
t_vcm* vcm,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
gmx_enerdata_t* enerd,
tensor force_vir,
tensor shake_vir,
int natoms_global;
int natoms_x_compressed;
const SimulationGroups* groups; /* for compressed position writing */
- gmx_wallcycle_t wcycle;
+ gmx_wallcycle* wcycle;
rvec* f_global;
gmx::IMDOutputProvider* outputProvider;
const gmx::MDModulesNotifiers* mdModulesNotifiers;
const t_inputrec* ir,
const gmx_mtop_t& top_global,
const gmx_output_env_t* oenv,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
const gmx::StartingBehavior startingBehavior,
bool simulationsShareState,
const gmx_multisim_t* ms)
return of->fp_dhdl;
}
-gmx_wallcycle_t mdoutf_get_wcycle(gmx_mdoutf_t of)
+gmx_wallcycle* mdoutf_get_wcycle(gmx_mdoutf_t of)
{
return of->wcycle;
}
{
if (of->tng || of->tng_low_prec)
{
- wallcycle_start(of->wcycle, ewcTRAJ);
+ wallcycle_start(of->wcycle, WallCycleCounter::Traj);
gmx_tng_close(&of->tng);
gmx_tng_close(&of->tng_low_prec);
- wallcycle_stop(of->wcycle, ewcTRAJ);
+ wallcycle_stop(of->wcycle, WallCycleCounter::Traj);
}
}
const t_inputrec* ir,
const gmx_mtop_t& mtop,
const gmx_output_env_t* oenv,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
gmx::StartingBehavior startingBehavior,
bool simulationsShareState,
const gmx_multisim_t* ms);
FILE* mdoutf_get_fp_dhdl(gmx_mdoutf_t of);
/*! \brief Getter for wallcycle timer */
-gmx_wallcycle_t mdoutf_get_wcycle(gmx_mdoutf_t of);
+gmx_wallcycle* mdoutf_get_wcycle(gmx_mdoutf_t of);
/*! \brief Close TNG files if they are open.
*
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
bool resetHalfway,
real maximumHoursToRun,
const MDLogger& mdlog,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
gmx_walltime_accounting_t walltime_accounting) :
signal_(*signal),
rankCanSetSignal_(false),
t_nrnb* nrnb,
const gmx_pme_t* pme,
const pme_load_balancing_t* pme_loadbal,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
gmx_walltime_accounting_t walltime_accounting)
{
/* Reset either if signal has been passed, or if reset step has been reached */
resetGpuProfiler();
}
- wallcycle_stop(wcycle, ewcRUN);
+ wallcycle_stop(wcycle, WallCycleCounter::Run);
wallcycle_reset_all(wcycle);
if (DOMAINDECOMP(cr))
{
reset_dd_statistics_counters(cr->dd);
}
clear_nrnb(nrnb);
- wallcycle_start(wcycle, ewcRUN);
+ wallcycle_start(wcycle, WallCycleCounter::Run);
walltime_accounting_reset_time(walltime_accounting, step);
print_date_and_time(fplog, cr->nodeid, "Restarted time", gmx_gettime());
pull_t* pull_work,
const real* lambda,
double t,
- gmx_wallcycle_t wcycle)
+ gmx_wallcycle* wcycle)
{
t_pbc pbc;
real dvdl;
/* Calculate the center of mass forces, this requires communication,
* which is why pull_potential is called close to other communication.
*/
- wallcycle_start(wcycle, ewcPULLPOT);
+ wallcycle_start(wcycle, WallCycleCounter::PullPot);
set_pbc(&pbc, ir.pbcType, box);
dvdl = 0;
enerd->term[F_COM_PULL] +=
force,
&dvdl);
enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Restraint] += dvdl;
- wallcycle_stop(wcycle, ewcPULLPOT);
+ wallcycle_stop(wcycle, WallCycleCounter::PullPot);
}
static void pme_receive_force_ener(t_forcerec* fr,
gmx_enerdata_t* enerd,
bool useGpuPmePpComms,
bool receivePmeForceToGpu,
- gmx_wallcycle_t wcycle)
+ gmx_wallcycle* wcycle)
{
real e_q, e_lj, dvdl_q, dvdl_lj;
float cycles_ppdpme, cycles_seppme;
- cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
+ cycles_ppdpme = wallcycle_stop(wcycle, WallCycleCounter::PpDuringPme);
dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
/* In case of node-splitting, the PP nodes receive the long-range
* forces, virial and energy from the PME nodes here.
*/
- wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
+ wallcycle_start(wcycle, WallCycleCounter::PpPmeWaitRecvF);
dvdl_q = 0;
dvdl_lj = 0;
gmx_pme_receive_f(fr->pmePpCommGpu.get(),
{
dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
}
- wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
+ wallcycle_stop(wcycle, WallCycleCounter::PpPmeWaitRecvF);
}
static void print_large_forces(FILE* fp,
//! When necessary, spreads forces on vsites and computes the virial for \p forceOutputs->forceWithShiftForces()
static void postProcessForceWithShiftForces(t_nrnb* nrnb,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
const matrix box,
ArrayRef<const RVec> x,
ForceOutputs* forceOutputs,
static void postProcessForces(const t_commrec* cr,
int64_t step,
t_nrnb* nrnb,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
const matrix box,
ArrayRef<const RVec> x,
ForceOutputs* forceOutputs,
const int clearF,
const int64_t step,
t_nrnb* nrnb,
- gmx_wallcycle_t wcycle)
+ gmx_wallcycle* wcycle)
{
if (!stepWork.computeNonbondedForces)
{
/* Prune the pair-list beyond fr->ic->rlistPrune using
* the current coordinates of the atoms.
*/
- wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedPruning);
nbv->dispatchPruneKernelCpu(ilocality, fr->shift_vec);
- wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedPruning);
}
}
pull_t* pull_work,
int64_t step,
double t,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
gmx::ForceProviders* forceProviders,
const matrix box,
gmx::ArrayRef<const gmx::RVec> x,
/* Add the forces from enforced rotation potentials (if any) */
if (inputrec.bRot)
{
- wallcycle_start(wcycle, ewcROTadd);
+ wallcycle_start(wcycle, WallCycleCounter::RotAdd);
enerd->term[F_COM_PULL] +=
add_rot_forces(enforcedRotation, forceWithVirialMtsLevel0->force_, cr, step, t);
- wallcycle_stop(wcycle, ewcROTadd);
+ wallcycle_stop(wcycle, WallCycleCounter::RotAdd);
}
if (ed)
const StepWorkload& stepWork,
GpuEventSynchronizer* xReadyOnDevice,
const real lambdaQ,
- gmx_wallcycle_t wcycle)
+ gmx_wallcycle* wcycle)
{
pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle, lambdaQ);
*/
static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata,
const real lambdaQ,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
const gmx::StepWorkload& stepWork)
{
pme_gpu_launch_complex_transforms(pmedata, wcycle, stepWork);
gmx_enerdata_t* enerd,
const real lambdaQ,
const StepWorkload& stepWork,
- gmx_wallcycle_t wcycle)
+ gmx_wallcycle* wcycle)
{
bool isPmeGpuDone = false;
bool isNbGpuDone = false;
const DomainLifetimeWorkload& domainWork,
const StepWorkload& stepWork,
const bool havePpDomainDecomposition,
- gmx_wallcycle_t wcycle)
+ gmx_wallcycle* wcycle)
{
- wallcycle_sub_start(wcycle, ewcsCLEAR_FORCE_BUFFER);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::ClearForceBuffer);
/* NOTE: We assume fr->shiftForces is all zeros here */
gmx::ForceWithShiftForces forceWithShiftForces(
clearRVecs(forceWithVirial.force_, true);
}
- wallcycle_sub_stop(wcycle, ewcsCLEAR_FORCE_BUFFER);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::ClearForceBuffer);
return ForceOutputs(
forceWithShiftForces, forceHelperBuffers->haveDirectVirialContributions(), forceWithVirial);
const gmx::MdrunScheduleWorkload& runScheduleWork,
bool useGpuPmeOnThisRank,
int64_t step,
- gmx_wallcycle_t wcycle)
+ gmx_wallcycle* wcycle)
{
if (runScheduleWork.simulationWork.useGpuNonbonded && runScheduleWork.stepWork.computeNonbondedForces)
{
}
/* now clear the GPU outputs while we finish the step on the CPU */
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
- wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, runScheduleWork.stepWork.computeVirial);
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
}
if (useGpuPmeOnThisRank)
pull_t* pull_work,
int64_t step,
t_nrnb* nrnb,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
const gmx_localtop_t* top,
const matrix box,
gmx::ArrayRefWithPadding<gmx::RVec> x,
fr->wholeMoleculeTransform->updateForAtomPbcJumps(x.unpaddedArrayRef(), box);
}
- wallcycle_start(wcycle, ewcNS);
+ wallcycle_start(wcycle, WallCycleCounter::NS);
if (!DOMAINDECOMP(cr))
{
const rvec vzero = { 0.0_real, 0.0_real, 0.0_real };
const rvec boxDiagonal = { box[XX][XX], box[YY][YY], box[ZZ][ZZ] };
- wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSGridLocal);
nbnxn_put_on_grid(nbv,
box,
0,
x.unpaddedArrayRef(),
0,
nullptr);
- wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSGridLocal);
}
else
{
- wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSGridNonLocal);
nbnxn_put_on_grid_nonlocal(nbv, domdec_zones(cr->dd), fr->cginfo, x.unpaddedArrayRef());
- wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSGridNonLocal);
}
nbv->setAtomProperties(gmx::constArrayRefFromArray(mdatoms->typeA, mdatoms->nr),
gmx::constArrayRefFromArray(mdatoms->chargeA, mdatoms->nr),
fr->cginfo);
- wallcycle_stop(wcycle, ewcNS);
+ wallcycle_stop(wcycle, WallCycleCounter::NS);
/* initialize the GPU nbnxm atom data and bonded data structures */
if (simulationWork.useGpuNonbonded)
{
// Note: cycle counting only nononbondeds, gpuBonded counts internally
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
- wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat.get());
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
if (fr->gpuBonded)
{
runScheduleWork->domainWork = setupDomainLifetimeWorkload(
inputrec, *fr, pull_work, ed, *mdatoms, simulationWork, stepWork);
- wallcycle_start_nocount(wcycle, ewcNS);
- wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::NS);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchLocal);
/* Note that with a GPU the launch overhead of the list transfer is not timed separately */
nbv->constructPairlist(InteractionLocality::Local, top->excls, step, nrnb);
nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::Local);
- wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
- wallcycle_stop(wcycle, ewcNS);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchLocal);
+ wallcycle_stop(wcycle, WallCycleCounter::NS);
if (stepWork.useGpuXBufferOps)
{
{
ddBalanceRegionHandler.openBeforeForceComputationGpu();
- wallcycle_start(wcycle, ewcLAUNCH_GPU);
- wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_start(wcycle, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
{
Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::Local);
}
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
// with X buffer ops offloaded to the GPU on all but the search steps
// bonded work not split into separate local and non-local, so with DD
}
/* launch local nonbonded work on GPU */
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
- wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local, enbvClearFNo, step, nrnb, wcycle);
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
}
if (useGpuPmeOnThisRank)
if (stepWork.doNeighborSearch)
{
// TODO: fuse this branch with the above large stepWork.doNeighborSearch block
- wallcycle_start_nocount(wcycle, ewcNS);
- wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::NS);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchNonLocal);
/* Note that with a GPU the launch overhead of the list transfer is not timed separately */
nbv->constructPairlist(InteractionLocality::NonLocal, top->excls, step, nrnb);
nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::NonLocal);
- wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
- wallcycle_stop(wcycle, ewcNS);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchNonLocal);
+ wallcycle_stop(wcycle, WallCycleCounter::NS);
// TODO refactor this GPU halo exchange re-initialisation
// to location in do_md where GPU halo exchange is
// constructed at partitioning, after above stateGpu
if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
{
- wallcycle_start(wcycle, ewcLAUNCH_GPU);
- wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_start(wcycle, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::NonLocal);
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
}
if (domainWork.haveGpuBondedWork)
}
/* launch non-local nonbonded tasks on GPU */
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
- wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle);
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
}
}
if (simulationWork.useGpuNonbonded && stepWork.computeNonbondedForces)
{
/* launch D2H copy-back F */
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
- wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
if (havePPDomainDecomposition(cr))
{
Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::NonLocal);
}
Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::Local);
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
if (domainWork.haveGpuBondedWork && stepWork.computeEnergy)
{
fr->gpuBonded->launchEnergyTransfer();
}
- wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
}
gmx::ArrayRef<const gmx::RVec> xWholeMolecules;
if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
{
- wallcycle_start(wcycle, ewcPPDURINGPME);
+ wallcycle_start(wcycle, WallCycleCounter::PpDuringPme);
dd_force_flop_start(cr->dd, nrnb);
}
if (inputrec.bRot)
{
- wallcycle_start(wcycle, ewcROT);
+ wallcycle_start(wcycle, WallCycleCounter::Rot);
do_rotation(cr, enforcedRotation, box, x.unpaddedConstArrayRef(), t, step, stepWork.doNeighborSearch);
- wallcycle_stop(wcycle, ewcROT);
+ wallcycle_stop(wcycle, WallCycleCounter::Rot);
}
/* Start the force cycle counter.
* Note that a different counter is used for dynamic load balancing.
*/
- wallcycle_start(wcycle, ewcFORCE);
+ wallcycle_start(wcycle, WallCycleCounter::Force);
/* Set up and clear force outputs:
* forceOutMtsLevel0: everything except what is in the other two outputs
* This can be split into a local and a non-local part when overlapping
* communication with calculation with domain decomposition.
*/
- wallcycle_stop(wcycle, ewcFORCE);
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
nbv->atomdata_add_nbat_f_to_f(AtomLocality::All,
forceOutNonbonded->forceWithShiftForces().force());
- wallcycle_start_nocount(wcycle, ewcFORCE);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
}
/* If there are multiple fshift output buffers we need to reduce them */
// TODO Force flags should include haveFreeEnergyWork for this domain
if (stepWork.useGpuXHalo && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
{
- wallcycle_stop(wcycle, ewcFORCE);
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
/* Wait for non-local coordinate data to be copied from device */
stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
- wallcycle_start_nocount(wcycle, ewcFORCE);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
}
// Compute wall interactions, when present.
ddBalanceRegionHandler);
}
- wallcycle_stop(wcycle, ewcFORCE);
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
// VdW dispersion correction, only computed on master rank to avoid double counting
if ((stepWork.computeEnergy || stepWork.computeVirial) && fr->dispersionCorrection && MASTER(cr))
}
else
{
- wallcycle_start_nocount(wcycle, ewcFORCE);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
do_nb_verlet(
fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFYes, step, nrnb, wcycle);
- wallcycle_stop(wcycle, ewcFORCE);
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
}
if (stepWork.useGpuFBufferOps)
{
// NOTE: emulation kernel is not included in the balancing region,
// but emulation mode does not target performance anyway
- wallcycle_start_nocount(wcycle, ewcFORCE);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
do_nb_verlet(fr,
ic,
enerd,
step,
nrnb,
wcycle);
- wallcycle_stop(wcycle, ewcFORCE);
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
}
// If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
if (mdof_flags != 0)
{
- wallcycle_start(mdoutf_get_wcycle(outf), ewcTRAJ);
+ wallcycle_start(mdoutf_get_wcycle(outf), WallCycleCounter::Traj);
if (bCPT)
{
if (MASTER(cr))
sfree(x_for_confout);
}
}
- wallcycle_stop(mdoutf_get_wcycle(outf), ewcTRAJ);
+ wallcycle_stop(mdoutf_get_wcycle(outf), WallCycleCounter::Traj);
}
#if GMX_FAHCORE
if (MASTER(cr))
void finish_update(const t_inputrec& inputRecord,
const t_mdatoms* md,
t_state* state,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
bool haveConstraints);
void update_sd_second_half(const t_inputrec& inputRecord,
t_state* state,
const t_commrec* cr,
t_nrnb* nrnb,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
gmx::Constraints* constr,
bool do_log,
bool do_ene);
void Update::finish_update(const t_inputrec& inputRecord,
const t_mdatoms* md,
t_state* state,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
const bool haveConstraints)
{
return impl_->finish_update(inputRecord, md, state, wcycle, haveConstraints);
t_state* state,
const t_commrec* cr,
t_nrnb* nrnb,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
gmx::Constraints* constr,
bool do_log,
bool do_ene)
t_state* state,
const t_commrec* cr,
t_nrnb* nrnb,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
gmx::Constraints* constr,
bool do_log,
bool do_ene)
*/
real dt = inputRecord.delta_t;
- wallcycle_start(wcycle, ewcUPDATE);
+ wallcycle_start(wcycle, WallCycleCounter::Update);
int nth = gmx_omp_nthreads_get(emntUpdate);
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
}
inc_nrnb(nrnb, eNR_UPDATE, homenr);
- wallcycle_stop(wcycle, ewcUPDATE);
+ wallcycle_stop(wcycle, WallCycleCounter::Update);
/* Constrain the coordinates upd->xp for half a time step */
bool computeVirial = false;
void Update::Impl::finish_update(const t_inputrec& inputRecord,
const t_mdatoms* md,
t_state* state,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
const bool haveConstraints)
{
/* NOTE: Currently we always integrate to a temporary buffer and
* then copy the results back here.
*/
- wallcycle_start_nocount(wcycle, ewcUPDATE);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Update);
const int homenr = md->homenr;
auto xp = makeConstArrayRef(xp_).subArray(0, homenr);
}
}
- wallcycle_stop(wcycle, ewcUPDATE);
+ wallcycle_stop(wcycle, WallCycleCounter::Update);
}
void Update::Impl::update_coords(const t_inputrec& inputRecord,
void finish_update(const t_inputrec& inputRecord,
const t_mdatoms* md,
t_state* state,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
bool haveConstraints);
/*! \brief Secong part of the SD integrator.
t_state* state,
const t_commrec* cr,
t_nrnb* nrnb,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
gmx::Constraints* constr,
bool do_log,
bool do_ene);
const float dtPressureCouple,
const matrix prVelocityScalingMatrix)
{
- wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
- wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+ wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
// Clearing virial matrix
// TODO There is no point in having separate virial matrix for constraints
coordinatesReady_->markEvent(deviceStream_);
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
return;
}
void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix)
{
- wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
- wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+ wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
ScalingMatrix mu(scalingMatrix);
// can affect the performance if nstpcouple is small.
deviceStream_.synchronize();
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
void UpdateConstrainGpu::Impl::scaleVelocities(const matrix scalingMatrix)
{
- wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
- wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+ wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
ScalingMatrix mu(scalingMatrix);
// can affect the performance if nstpcouple is small.
deviceStream_.synchronize();
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
UpdateConstrainGpu::Impl::Impl(const t_inputrec& ir,
const t_mdatoms& md)
{
// TODO wallcycle
- wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
- wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+ wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
GMX_ASSERT(d_x != nullptr, "Coordinates device buffer should not be null.");
GMX_ASSERT(d_v != nullptr, "Velocities device buffer should not be null.");
coordinateScalingKernelLaunchConfig_.gridSize[0] =
(numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
void UpdateConstrainGpu::Impl::setPbc(const PbcType pbcType, const matrix box)
/* ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
rvec* vbuf = nullptr;
- wallcycle_start(wcycle, ewcUPDATE);
+ wallcycle_start(wcycle, WallCycleCounter::Update);
if (ir->eI == IntegrationAlgorithm::VV && bInitStep)
{
/* if using velocity verlet with full time step Ekin,
upd->update_coords(
*ir, step, mdatoms, state, f->view().forceWithPadding(), fcdata, ekind, M, etrtVELOCITY1, cr, constr != nullptr);
- wallcycle_stop(wcycle, ewcUPDATE);
+ wallcycle_stop(wcycle, WallCycleCounter::Update);
constrain_velocities(constr, do_log, do_ene, step, state, nullptr, bCalcVir, shake_vir);
- wallcycle_start(wcycle, ewcUPDATE);
+ wallcycle_start(wcycle, WallCycleCounter::Update);
/* if VV, compute the pressure and constraints */
/* For VV2, we strictly only need this if using pressure
* control, but we really would like to have accurate pressures
So we need information from the last step in the first half of the integration */
if (bGStat || do_per_step(step - 1, nstglobalcomm))
{
- wallcycle_stop(wcycle, ewcUPDATE);
+ wallcycle_stop(wcycle, WallCycleCounter::Update);
int cglo_flags =
((bGStat ? CGLO_GSTAT : 0) | (bCalcEner ? CGLO_ENERGY : 0)
| (bTemp ? CGLO_TEMPERATURE : 0) | (bPres ? CGLO_PRESSURE : 0)
fplog, vcm, *mdatoms, makeArrayRef(state->x), makeArrayRef(state->v));
inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
}
- wallcycle_start(wcycle, ewcUPDATE);
+ wallcycle_start(wcycle, WallCycleCounter::Update);
}
/* temperature scaling and pressure scaling to produce the extended variables at t+dt */
if (!bInitStep)
}
else if (bExchanged)
{
- wallcycle_stop(wcycle, ewcUPDATE);
+ wallcycle_stop(wcycle, WallCycleCounter::Update);
/* We need the kinetic energy at minus the half step for determining
* the full step kinetic energy and possibly for T-coupling.*/
/* This may not be quite working correctly yet . . . . */
state->box,
bSumEkinhOld,
CGLO_GSTAT | CGLO_TEMPERATURE);
- wallcycle_start(wcycle, ewcUPDATE);
+ wallcycle_start(wcycle, WallCycleCounter::Update);
}
}
/* if it's the initial step, we performed this first step just to get the constraint virial */
copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms);
sfree(vbuf);
}
- wallcycle_stop(wcycle, ewcUPDATE);
+ wallcycle_stop(wcycle, WallCycleCounter::Update);
}
/* compute the conserved quantity */
upd->update_coords(
*ir, step, mdatoms, state, f->view().forceWithPadding(), fcdata, ekind, M, etrtPOSITION, cr, constr != nullptr);
- wallcycle_stop(wcycle, ewcUPDATE);
+ wallcycle_stop(wcycle, WallCycleCounter::Update);
constrain_coordinates(
constr, do_log, do_ene, step, state, upd->xp()->arrayRefWithPadding(), dvdl_constr, bCalcVir, shake_vir);
lastbox,
bSumEkinhOld,
(bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE);
- wallcycle_start(wcycle, ewcUPDATE);
+ wallcycle_start(wcycle, WallCycleCounter::Update);
trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, MassQ, trotter_seq, ettTSEQ4);
/* now we know the scaling, we can compute the positions again */
std::copy(cbuf->begin(), cbuf->end(), state->x.begin());
upd->update_coords(
*ir, step, mdatoms, state, f->view().forceWithPadding(), fcdata, ekind, M, etrtPOSITION, cr, constr != nullptr);
- wallcycle_stop(wcycle, ewcUPDATE);
+ wallcycle_stop(wcycle, WallCycleCounter::Update);
/* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */
/* are the small terms in the shake_vir here due
const matrix box,
gmx_wallcycle* wcycle)
{
- wallcycle_start(wcycle, ewcVSITESPREAD);
+ wallcycle_start(wcycle, WallCycleCounter::VsiteSpread);
const bool useDomdec = domainInfo_.useDomdec();
inc_nrnb(nrnb, eNR_VSITE4FDN, vsite_count(ilists_, F_VSITE4FDN));
inc_nrnb(nrnb, eNR_VSITEN, vsite_count(ilists_, F_VSITEN));
- wallcycle_stop(wcycle, ewcVSITESPREAD);
+ wallcycle_stop(wcycle, WallCycleCounter::VsiteSpread);
}
/*! \brief Returns the an array with group indices for each atom
}
walltime_accounting_start_time(walltime_accounting);
- wallcycle_start(wcycle, ewcRUN);
+ wallcycle_start(wcycle, WallCycleCounter::Run);
print_start(fplog, cr, walltime_accounting, "mdrun");
/***********************************************************
simulationWork.useGpuPmePpCommunication);
}
- wallcycle_start(wcycle, ewcSTEP);
+ wallcycle_start(wcycle, WallCycleCounter::Step);
bLastStep = (step_rel == ir->nsteps);
t = t0 + step * ir->delta_t;
if (vsite != nullptr)
{
// Virtual sites need to be updated before domain decomposition and forces are calculated
- wallcycle_start(wcycle, ewcVSITECONSTR);
+ wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
// md-vv calculates virtual velocities once it has full-step real velocities
vsite->construct(state->x,
state->v,
(!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
? VSiteOperation::PositionsAndVelocities
: VSiteOperation::Positions);
- wallcycle_stop(wcycle, ewcVSITECONSTR);
+ wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
}
if (bNS && !(bFirstStep && ir->bContinuation))
if (vsite != nullptr && needVirtualVelocitiesThisStep)
{
// Positions were calculated earlier
- wallcycle_start(wcycle, ewcVSITECONSTR);
+ wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
- wallcycle_stop(wcycle, ewcVSITECONSTR);
+ wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
}
}
if (!useGpuForUpdate)
{
- wallcycle_start(wcycle, ewcUPDATE);
+ wallcycle_start(wcycle, WallCycleCounter::Update);
}
/* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
if (bTrotter)
upd.update_coords(
*ir, step, mdatoms, state, forceCombined, fcdata, ekind, M, etrtPOSITION, cr, constr != nullptr);
- wallcycle_stop(wcycle, ewcUPDATE);
+ wallcycle_stop(wcycle, WallCycleCounter::Update);
constrain_coordinates(constr,
do_log,
rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
}
- cycles = wallcycle_stop(wcycle, ewcSTEP);
+ cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
if (DOMAINDECOMP(cr) && wcycle)
{
dd_cycles_add(cr->dd, cycles, ddCyclStep);
}
walltime_accounting_start_time(walltime_accounting);
- wallcycle_start(wcycle, ewcRUN);
+ wallcycle_start(wcycle, WallCycleCounter::Run);
print_start(fplog, cr, walltime_accounting, "mdrun");
/***********************************************************
while (!isLastStep)
{
isLastStep = (isLastStep || (ir->nsteps >= 0 && step_rel == ir->nsteps));
- wallcycle_start(wcycle, ewcSTEP);
+ wallcycle_start(wcycle, WallCycleCounter::Step);
t = step;
}
if (constructVsites)
{
- wallcycle_start(wcycle, ewcVSITECONSTR);
+ wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
vsite->construct(state->x, state->v, state->box, VSiteOperation::PositionsAndVelocities);
- wallcycle_stop(wcycle, ewcVSITECONSTR);
+ wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
}
}
print_time(stderr, walltime_accounting, step, ir, cr);
}
- cycles = wallcycle_stop(wcycle, ewcSTEP);
+ cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
if (DOMAINDECOMP(cr) && wcycle)
{
dd_cycles_add(cr->dd, cycles, ddCyclStep);
static void print_em_start(FILE* fplog,
const t_commrec* cr,
gmx_walltime_accounting_t walltime_accounting,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
const char* name)
{
walltime_accounting_start_time(walltime_accounting);
- wallcycle_start(wcycle, ewcRUN);
+ wallcycle_start(wcycle, WallCycleCounter::Run);
print_start(fplog, cr, walltime_accounting, name);
}
//! Stop counting time for EM
-static void em_time_end(gmx_walltime_accounting_t walltime_accounting, gmx_wallcycle_t wcycle)
+static void em_time_end(gmx_walltime_accounting_t walltime_accounting, gmx_wallcycle* wcycle)
{
- wallcycle_stop(wcycle, ewcRUN);
+ wallcycle_stop(wcycle, WallCycleCounter::Run);
walltime_accounting_end_time(walltime_accounting);
}
static void finish_em(const t_commrec* cr,
gmx_mdoutf_t outf,
gmx_walltime_accounting_t walltime_accounting,
- gmx_wallcycle_t wcycle)
+ gmx_wallcycle* wcycle)
{
if (!thisRankHasDuty(cr, DUTY_PME))
{
VirtualSitesHandler* vsite,
gmx::Constraints* constr,
t_nrnb* nrnb,
- gmx_wallcycle_t wcycle)
+ gmx_wallcycle* wcycle)
{
/* Repartition the domain decomposition */
dd_partition_system(fplog,
//! Manages flop accounting.
t_nrnb* nrnb;
//! Manages wall cycle accounting.
- gmx_wallcycle_t wcycle;
+ gmx_wallcycle* wcycle;
//! Coordinates global reduction.
gmx_global_stat_t gstat;
//! Handles virtual sites.
/* Communicate stuff when parallel */
if (PAR(cr) && inputrec->eI != IntegrationAlgorithm::NM)
{
- wallcycle_start(wcycle, ewcMoveE);
+ wallcycle_start(wcycle, WallCycleCounter::MoveE);
global_stat(*gstat,
cr,
FALSE,
CGLO_ENERGY | CGLO_PRESSURE | CGLO_CONSTRAINT);
- wallcycle_stop(wcycle, ewcMoveE);
+ wallcycle_stop(wcycle, WallCycleCounter::MoveE);
}
if (fr->dispersionCorrection)
}
walltime_accounting_start_time(walltime_accounting);
- wallcycle_start(wcycle, ewcRUN);
+ wallcycle_start(wcycle, WallCycleCounter::Run);
print_start(fplog, cr, walltime_accounting, "mdrun");
/***********************************************************
isLastStep = (isLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
while (!isLastStep)
{
- wallcycle_start(wcycle, ewcSTEP);
+ wallcycle_start(wcycle, WallCycleCounter::Step);
if (rerun_fr.bStep)
{
rerun_parallel_comm(cr, &rerun_fr, &isLastStep);
}
- cycles = wallcycle_stop(wcycle, ewcSTEP);
+ cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
if (DOMAINDECOMP(cr) && wcycle)
{
dd_cycles_add(cr->dd, cycles, ddCyclStep);
const t_commrec* cr,
const t_inputrec& inputrec,
t_nrnb nrnb[],
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
gmx_walltime_accounting_t walltime_accounting,
nonbonded_verlet_t* nbv,
const gmx_pme_t* pme,
real ewaldcoeff_q = 0;
real ewaldcoeff_lj = 0;
int nChargePerturbed = -1, nTypePerturbed = 0;
- gmx_wallcycle_t wcycle;
gmx_walltime_accounting_t walltime_accounting = nullptr;
MembedHolder membedHolder(filenames.size(), filenames.data());
"The -resetstep functionality is deprecated, and may be removed in a "
"future version.");
}
- wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
+ std::unique_ptr<gmx_wallcycle> wcycle =
+ wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
if (PAR(cr))
{
/* Master synchronizes its value of reset_counters with all nodes
* including PME only nodes */
- int64_t reset_counters = wcycle_get_reset_counters(wcycle);
+ int64_t reset_counters = wcycle_get_reset_counters(wcycle.get());
gmx_bcast(sizeof(reset_counters), &reset_counters, cr->mpi_comm_mysim);
- wcycle_set_reset_counters(wcycle, reset_counters);
+ wcycle_set_reset_counters(wcycle.get(), reset_counters);
}
// Membrane embedding must be initialized before we call init_forcerec()
deviceStreamManager.get(),
mtop,
box,
- wcycle);
+ wcycle.get());
// TODO: Move the logic below to a GPU bonded builder
if (runScheduleWork.simulationWork.useGpuBonded)
{
fr->ic->epsfac * fr->fudgeQQ,
deviceStreamManager->context(),
deviceStreamManager->bondedStream(havePPDomainDecomposition(cr)),
- wcycle);
+ wcycle.get());
fr->gpuBonded = gpuBonded.get();
}
/* Let makeConstraints know whether we have essential dynamics constraints. */
auto constr = makeConstraints(
- mtop, *inputrec, pull_work, doEssentialDynamics, fplog, cr, ms, &nrnb, wcycle, fr->bMolPBC);
+ mtop, *inputrec, pull_work, doEssentialDynamics, fplog, cr, ms, &nrnb, wcycle.get(), fr->bMolPBC);
/* Energy terms and groups */
gmx_enerdata_t enerd(mtop.groups.groups[SimulationAtomGroupType::EnergyOutput].size(),
/* Set up interactive MD (IMD) */
auto imdSession = makeImdSession(inputrec.get(),
cr,
- wcycle,
+ wcycle.get(),
&enerd,
ms,
mtop,
fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique<gmx::GpuForceReduction>(
deviceStreamManager->context(),
deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal),
- wcycle);
+ wcycle.get());
fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique<gmx::GpuForceReduction>(
deviceStreamManager->context(),
deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal),
- wcycle);
+ wcycle.get());
}
std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
"GPU device stream manager should be initialized to use GPU.");
stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
- *deviceStreamManager, transferKind, pme_gpu_get_block_size(fr->pmedata), wcycle);
+ *deviceStreamManager, transferKind, pme_gpu_get_block_size(fr->pmedata), wcycle.get());
fr->stateGpu = stateGpu.get();
}
simulatorBuilder.add(SimulatorEnv(fplog, cr, ms, mdlog, oenv));
- simulatorBuilder.add(Profiling(&nrnb, walltime_accounting, wcycle));
+ simulatorBuilder.add(Profiling(&nrnb, walltime_accounting, wcycle.get()));
simulatorBuilder.add(ConstraintsParam(
constr.get(), enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr, vsite.get()));
// TODO: Separate `fr` to a separate add, and make the `build` handle the coupling sensibly.
gmx_pmeonly(pmedata,
cr,
&nrnb,
- wcycle,
+ wcycle.get(),
walltime_accounting,
inputrec.get(),
pmeRunMode,
deviceStreamManager.get());
}
- wallcycle_stop(wcycle, ewcRUN);
+ wallcycle_stop(wcycle.get(), WallCycleCounter::Run);
/* Finish up, write some stuff
* if rerunMD, don't write last frame again
cr,
*inputrec,
&nrnb,
- wcycle,
+ wcycle.get(),
walltime_accounting,
fr ? fr->nbv.get() : nullptr,
pmedata,
EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
- // clean up cycle counter
- wallcycle_destroy(wcycle);
deviceStreamManager.reset(nullptr);
// Free PME data
tensor force_vir,
const t_mdatoms& md,
t_nrnb* nrnb,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
gmx_shellfc_t* shfc,
t_forcerec* fr,
gmx::MdrunScheduleWorkload* runScheduleWork,
tensor force_vir,
const t_mdatoms& md,
t_nrnb* nrnb,
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
gmx_shellfc_t* shfc,
t_forcerec* fr,
gmx::MdrunScheduleWorkload* runScheduleWork,
/* Print to log file */
walltime_accounting_start_time(walltime_accounting);
- wallcycle_start(wcycle, ewcRUN);
+ wallcycle_start(wcycle, WallCycleCounter::Run);
print_start(fplog, cr, walltime_accounting, "Test Particle Insertion");
/* The last charge group is the group to be inserted */
void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
{
- wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
- wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+ wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start_nocount(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
numAtomsLocal_ = numAtomsLocal;
numAtomsAll_ = numAtomsAll;
clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, *localStream_);
}
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
GMX_ASSERT(deviceStream != nullptr,
"No stream is valid for copying positions with given atom locality.");
- wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
- wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+ wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
copyToDevice(d_x_, h_x, d_xSize_, atomLocality, *deviceStream);
xReadyOnDevice_[atomLocality].markEvent(*deviceStream);
}
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
GpuEventSynchronizer*
void StatePropagatorDataGpu::Impl::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
{
- wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
+ wallcycle_start(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
xReadyOnDevice_[atomLocality].waitForEvent();
- wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
+ wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
}
GpuEventSynchronizer* StatePropagatorDataGpu::Impl::xUpdatedOnDevice()
GMX_ASSERT(deviceStream != nullptr,
"No stream is valid for copying positions with given atom locality.");
- wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
- wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+ wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, *deviceStream);
// Note: unlike copyCoordinatesToGpu this is not used in OpenCL, and the conditional is not needed.
xReadyOnHost_[atomLocality].markEvent(*deviceStream);
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
void StatePropagatorDataGpu::Impl::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
{
- wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
+ wallcycle_start(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
xReadyOnHost_[atomLocality].waitForEvent();
- wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
+ wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
}
GMX_ASSERT(deviceStream != nullptr,
"No stream is valid for copying velocities with given atom locality.");
- wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
- wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+ wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
copyToDevice(d_v_, h_v, d_vSize_, atomLocality, *deviceStream);
vReadyOnDevice_[atomLocality].markEvent(*deviceStream);
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getVelocitiesReadyOnDeviceEvent(AtomLocality atomLocality)
GMX_ASSERT(deviceStream != nullptr,
"No stream is valid for copying velocities with given atom locality.");
- wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
- wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+ wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, *deviceStream);
vReadyOnHost_[atomLocality].markEvent(*deviceStream);
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
void StatePropagatorDataGpu::Impl::waitVelocitiesReadyOnHost(AtomLocality atomLocality)
{
- wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
+ wallcycle_start(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
vReadyOnHost_[atomLocality].waitForEvent();
- wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
+ wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
}
GMX_ASSERT(deviceStream != nullptr,
"No stream is valid for copying forces with given atom locality.");
- wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
- wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+ wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
copyToDevice(d_f_, h_f, d_fSize_, atomLocality, *deviceStream);
fReadyOnDevice_[atomLocality].markEvent(*deviceStream);
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
void StatePropagatorDataGpu::Impl::clearForcesOnGpu(AtomLocality atomLocality)
GMX_ASSERT(deviceStream != nullptr,
"No stream is valid for clearing forces with given atom locality.");
- wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
- wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+ wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
clearOnDevice(d_f_, d_fSize_, atomLocality, *deviceStream);
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
GMX_ASSERT(deviceStream != nullptr,
"No stream is valid for copying forces with given atom locality.");
- wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
- wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+ wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, *deviceStream);
fReadyOnHost_[atomLocality].markEvent(*deviceStream);
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
void StatePropagatorDataGpu::Impl::waitForcesReadyOnHost(AtomLocality atomLocality)
{
- wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
+ wallcycle_start(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
fReadyOnHost_[atomLocality].waitForEvent();
- wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
+ wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
}
const DeviceStream* StatePropagatorDataGpu::Impl::getUpdateStream()
NumVelocityScalingValues numEndVelocityScalingValues>
void Propagator<IntegrationStep::PositionsOnly>::run()
{
- wallcycle_start(wcycle_, ewcUPDATE);
+ wallcycle_start(wcycle_, WallCycleCounter::Update);
auto xp = as_rvec_array(statePropagatorData_->positionsView().paddedArrayRef().data());
auto x = as_rvec_array(statePropagatorData_->constPositionsView().paddedArrayRef().data());
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
}
- wallcycle_stop(wcycle_, ewcUPDATE);
+ wallcycle_stop(wcycle_, WallCycleCounter::Update);
}
//! Propagation (velocity only)
NumVelocityScalingValues numEndVelocityScalingValues>
void Propagator<IntegrationStep::VelocitiesOnly>::run()
{
- wallcycle_start(wcycle_, ewcUPDATE);
+ wallcycle_start(wcycle_, WallCycleCounter::Update);
auto v = as_rvec_array(statePropagatorData_->velocitiesView().paddedArrayRef().data());
auto f = as_rvec_array(statePropagatorData_->constForcesView().force().data());
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
}
- wallcycle_stop(wcycle_, ewcUPDATE);
+ wallcycle_stop(wcycle_, WallCycleCounter::Update);
}
//! Propagation (leapfrog case - position and velocity)
NumVelocityScalingValues numEndVelocityScalingValues>
void Propagator<IntegrationStep::LeapFrog>::run()
{
- wallcycle_start(wcycle_, ewcUPDATE);
+ wallcycle_start(wcycle_, WallCycleCounter::Update);
auto xp = as_rvec_array(statePropagatorData_->positionsView().paddedArrayRef().data());
auto x = as_rvec_array(statePropagatorData_->constPositionsView().paddedArrayRef().data());
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
}
- wallcycle_stop(wcycle_, ewcUPDATE);
+ wallcycle_stop(wcycle_, WallCycleCounter::Update);
}
//! Propagation (velocity verlet stage 2 - velocity and position)
NumVelocityScalingValues numEndVelocityScalingValues>
void Propagator<IntegrationStep::VelocityVerletPositionsAndVelocities>::run()
{
- wallcycle_start(wcycle_, ewcUPDATE);
+ wallcycle_start(wcycle_, WallCycleCounter::Update);
auto xp = as_rvec_array(statePropagatorData_->positionsView().paddedArrayRef().data());
auto x = as_rvec_array(statePropagatorData_->constPositionsView().paddedArrayRef().data());
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
}
- wallcycle_stop(wcycle_, ewcUPDATE);
+ wallcycle_stop(wcycle_, WallCycleCounter::Update);
}
template<IntegrationStep algorithm>
}
walltime_accounting_start_time(walltime_accounting);
- wallcycle_start(wcycle, ewcRUN);
+ wallcycle_start(wcycle, WallCycleCounter::Run);
print_start(fplog, cr, walltime_accounting, "mdrun");
step_ = inputrec->init_step;
stophandlerCurrentStep_ = step;
stopHandler_->setSignal();
- wallcycle_start(wcycle, ewcSTEP);
+ wallcycle_start(wcycle, WallCycleCounter::Step);
}
void ModularSimulatorAlgorithm::postStep(Step step, Time gmx_unused time)
print_time(stderr, walltime_accounting, step, inputrec, cr);
}
- double cycles = wallcycle_stop(wcycle, ewcSTEP);
+ double cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
if (DOMAINDECOMP(cr) && wcycle)
{
dd_cycles_add(cr->dd, static_cast<float>(cycles), ddCyclStep);
void StatePropagatorData::Element::write(gmx_mdoutf_t outf, Step currentStep, Time currentTime)
{
- wallcycle_start(mdoutf_get_wcycle(outf), ewcTRAJ);
+ wallcycle_start(mdoutf_get_wcycle(outf), WallCycleCounter::Traj);
unsigned int mdof_flags = 0;
if (do_per_step(currentStep, nstxout_))
{
if (mdof_flags == 0)
{
- wallcycle_stop(mdoutf_get_wcycle(outf), ewcTRAJ);
+ wallcycle_stop(mdoutf_get_wcycle(outf), WallCycleCounter::Traj);
return;
}
GMX_ASSERT(localStateBackup_, "Trajectory writing called, but no state saved.");
{
localStateBackup_.reset();
}
- wallcycle_stop(mdoutf_get_wcycle(outf), ewcTRAJ);
+ wallcycle_stop(mdoutf_get_wcycle(outf), WallCycleCounter::Traj);
}
void StatePropagatorData::Element::elementSetup()
GMX_ASSERT(localStateBackup_, "Final trajectory writing called, but no state saved.");
- wallcycle_start(mdoutf_get_wcycle(outf), ewcTRAJ);
+ wallcycle_start(mdoutf_get_wcycle(outf), WallCycleCounter::Traj);
if (DOMAINDECOMP(cr_))
{
auto globalXRef =
pbcType_,
localStateBackup_->box);
}
- wallcycle_stop(mdoutf_get_wcycle(outf), ewcTRAJ);
+ wallcycle_stop(mdoutf_get_wcycle(outf), WallCycleCounter::Traj);
}
std::optional<SignallerCallback> StatePropagatorData::Element::registerLastStepCallback()
// we start without counting and only when the task finished we issue a
// start/stop to increment.
// GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
- wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
+ wallcycle_start_nocount(wcycle, WallCycleCounter::WaitGpuNbL);
if (!haveStreamTasksCompleted(*nb->deviceStreams[iLocality]))
{
- wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+ wallcycle_stop(wcycle, WallCycleCounter::WaitGpuNbL);
// Early return to skip the steps below that we have to do only
// after the NB task completed
return false;
}
- wallcycle_increment_event_count(wcycle, ewcWAIT_GPU_NB_L);
+ wallcycle_increment_event_count(wcycle, WallCycleCounter::WaitGpuNbL);
}
else if (haveResultToWaitFor)
{
gmx_wallcycle* wcycle)
{
auto cycleCounter = (atomToInteractionLocality(aloc) == InteractionLocality::Local)
- ? ewcWAIT_GPU_NB_L
- : ewcWAIT_GPU_NB_NL;
+ ? WallCycleCounter::WaitGpuNbL
+ : WallCycleCounter::WaitGpuNbNL;
wallcycle_start(wcycle, cycleCounter);
gpu_try_finish_task(nb, stepWork, aloc, e_lj, e_el, shiftForces, GpuTaskCompletion::Wait, wcycle);
auto shiftVecPointer = as_rvec_array(shiftVectors.data());
int gmx_unused nthreads = gmx_omp_nthreads_get(emntNonbonded);
- wallcycle_sub_start(wcycle, ewcsNONBONDED_CLEAR);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedClear);
#pragma omp parallel for schedule(static) num_threads(nthreads)
for (gmx::index nb = 0; nb < pairlists.ssize(); nb++)
{
if (nb == 0)
{
- wallcycle_sub_stop(wcycle, ewcsNONBONDED_CLEAR);
- wallcycle_sub_start(wcycle, ewcsNONBONDED_KERNEL);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedClear);
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedKernel);
}
// TODO: Change to reference
}
}
}
- wallcycle_sub_stop(wcycle, ewcsNONBONDED_KERNEL);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedKernel);
if (stepWork.computeEnergy)
{
GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_fep.ssize(),
"Number of lists should be same as number of NB threads");
- wallcycle_sub_start(wcycle_, ewcsNONBONDED_FEP);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::NonbondedFep);
#pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize())
for (gmx::index th = 0; th < nbl_fep.ssize(); th++)
{
+ dvdl_nb[FreeEnergyPerturbationCouplingType::Coul]);
}
}
- wallcycle_sub_stop(wcycle_, ewcsNONBONDED_FEP);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::NonbondedFep);
}
void nonbonded_verlet_t::convertCoordinates(const gmx::AtomLocality locality,
gmx::ArrayRef<const gmx::RVec> coordinates)
{
- wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
- wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);
+ wallcycle_start(wcycle_, WallCycleCounter::NbXFBufOps);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::NBXBufOps);
nbnxn_atomdata_copy_x_to_nbat_x(
pairSearch_->gridSet(), locality, as_rvec_array(coordinates.data()), nbat.get());
- wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
- wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::NBXBufOps);
+ wallcycle_stop(wcycle_, WallCycleCounter::NbXFBufOps);
}
void nonbonded_verlet_t::convertCoordinatesGpu(const gmx::AtomLocality locality,
DeviceBuffer<gmx::RVec> d_x,
GpuEventSynchronizer* xReadyOnDevice)
{
- wallcycle_start(wcycle_, ewcLAUNCH_GPU);
- wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_X_BUF_OPS);
+ wallcycle_start(wcycle_, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuNBXBufOps);
nbnxn_atomdata_x_to_nbat_x_gpu(pairSearch_->gridSet(), locality, gpu_nbv, d_x, xReadyOnDevice);
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_X_BUF_OPS);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuNBXBufOps);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
gmx::ArrayRef<const int> nonbonded_verlet_t::getGridIndices() const
return;
}
- wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
- wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
+ wallcycle_start(wcycle_, WallCycleCounter::NbXFBufOps);
+ wallcycle_sub_start(wcycle_, WallCycleSubCounter::NBFBufOps);
reduceForces(nbat.get(), locality, pairSearch_->gridSet(), as_rvec_array(force.data()));
- wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
- wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::NBFBufOps);
+ wallcycle_stop(wcycle_, WallCycleCounter::NbXFBufOps);
}
int nonbonded_verlet_t::getNumAtoms(const gmx::AtomLocality locality) const
void nonbonded_verlet_t::dispatchPruneKernelGpu(int64_t step)
{
- wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
- wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+ wallcycle_sub_start_nocount(wcycle_, WallCycleSubCounter::LaunchGpuNonBonded);
const bool stepIsEven =
(pairlistSets().numStepsWithPairlist(step) % (2 * pairlistSets().params().mtsFactor) == 0);
stepIsEven ? gmx::InteractionLocality::Local : gmx::InteractionLocality::NonLocal,
pairlistSets().params().numRollingPruningParts);
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NONBONDED);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuNonBonded);
+ wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
rvec com_solvent, com_particle; /* solvent and swap molecule's center of mass */
- wallcycle_start(wcycle, ewcSWAP);
+ wallcycle_start(wcycle, WallCycleCounter::Swap);
set_pbc(s->pbc, ir->pbcType, box);
} /* end of if(bSwap) */
- wallcycle_stop(wcycle, ewcSWAP);
+ wallcycle_stop(wcycle, WallCycleCounter::Swap);
return bSwap;
}
#include "config.h"
#include <chrono>
+#include <memory>
#include <thread>
#include "gromacs/timing/cyclecounter.h"
{
public:
TimingTest() : wcycle(wallcycle_init(nullptr, 0, nullptr)) {}
- ~TimingTest() override { wallcycle_destroy(wcycle); }
protected:
- const int delayInMilliseconds = 1;
- gmx_wallcycle_t wcycle;
+ const int delayInMilliseconds = 1;
+ std::unique_ptr<gmx_wallcycle> wcycle;
};
//! Test whether the we can run the cycle counter.
TEST_F(TimingTest, RunWallCycle)
{
- int probe = 0, ref = 1;
- int n1, n2;
- double c1, c2;
+ WallCycleCounter probe = WallCycleCounter::Run, ref = WallCycleCounter::Step;
+ int n1, n2;
+ double c1, c2;
//! credit cycles from enclosing call to the ref field of wcycle
- wallcycle_start(wcycle, ref);
+ wallcycle_start(wcycle.get(), ref);
//! cycles from the probe call
- wallcycle_start(wcycle, probe);
+ wallcycle_start(wcycle.get(), probe);
sleepForMilliseconds(delayInMilliseconds);
- wallcycle_stop(wcycle, probe);
- wallcycle_stop(wcycle, ref);
+ wallcycle_stop(wcycle.get(), probe);
+ wallcycle_stop(wcycle.get(), ref);
//! extract both
- wallcycle_get(wcycle, probe, &n1, &c1);
- wallcycle_get(wcycle, ref, &n2, &c2);
+ wallcycle_get(wcycle.get(), probe, &n1, &c1);
+ wallcycle_get(wcycle.get(), ref, &n2, &c2);
EXPECT_EQ(n1, n2);
EXPECT_DOUBLE_EQ_TOL(c1, c2, relativeToleranceAsFloatingPoint(c1, 5e-3));
{
if (useCycleSubcounters)
{
- int probe = 0;
- int ref = 1;
- int n1, n2;
- double c1, c2;
- wallcycle_sub_start(wcycle, ref);
- wallcycle_sub_start(wcycle, probe);
+ WallCycleSubCounter probe = WallCycleSubCounter::DDRedist;
+ WallCycleSubCounter ref = WallCycleSubCounter::DDGrid;
+ int n1, n2;
+ double c1, c2;
+ wallcycle_sub_start(wcycle.get(), ref);
+ wallcycle_sub_start(wcycle.get(), probe);
sleepForMilliseconds(delayInMilliseconds);
- wallcycle_sub_stop(wcycle, probe);
- wallcycle_sub_stop(wcycle, ref);
- wallcycle_sub_get(wcycle, probe, &n1, &c1);
- wallcycle_sub_get(wcycle, ref, &n2, &c2);
+ wallcycle_sub_stop(wcycle.get(), probe);
+ wallcycle_sub_stop(wcycle.get(), ref);
+ wallcycle_sub_get(wcycle.get(), probe, &n1, &c1);
+ wallcycle_sub_get(wcycle.get(), ref, &n2, &c2);
EXPECT_EQ(n1, n2);
EXPECT_DOUBLE_EQ_TOL(c1, c2, relativeToleranceAsFloatingPoint(c1, 5e-3));
#include <cstdlib>
#include <array>
+#include <memory>
#include <vector>
#include "gromacs/math/functions.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/timing/wallcyclereporting.h"
+#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/enumerationhelpers.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/snprintf.h"
+#include "gromacs/utility/stringutil.h"
//! Whether wallcycle debugging is enabled
constexpr bool gmx_unused enableWallcycleDebug = (DEBUG_WCYCLE != 0);
/* Each name should not exceed 19 printing characters
(ie. terminating null can be twentieth) */
-static const char* wcn[ewcNR] = { "Run",
- "Step",
- "PP during PME",
- "Domain decomp.",
- "DD comm. load",
- "DD comm. bounds",
- "Vsite constr.",
- "Send X to PME",
- "Neighbor search",
- "Launch GPU ops.",
- "Comm. coord.",
- "Force",
- "Wait + Comm. F",
- "PME mesh",
- "PME redist. X/F",
- "PME spread",
- "PME gather",
- "PME 3D-FFT",
- "PME 3D-FFT Comm.",
- "PME solve LJ",
- "PME solve Elec",
- "PME wait for PP",
- "Wait + Recv. PME F",
- "Wait PME GPU spread",
- "PME 3D-FFT",
- "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
- "Wait PME GPU gather",
- "Wait Bonded GPU",
- "Reduce GPU PME F",
- "Wait GPU NB nonloc.",
- "Wait GPU NB local",
- "Wait GPU state copy",
- "NB X/F buffer ops.",
- "Vsite spread",
- "COM pull force",
- "AWH",
- "Write traj.",
- "Update",
- "Constraints",
- "Comm. energies",
- "Enforced rotation",
- "Add rot. forces",
- "Position swapping",
- "IMD",
- "Test" };
-
-static const char* wcsn[ewcsNR] = {
- "DD redist.",
- "DD NS grid + sort",
- "DD setup comm.",
- "DD make top.",
- "DD make constr.",
- "DD top. other",
- "DD GPU ops.",
- "NS grid local",
- "NS grid non-loc.",
- "NS search local",
- "NS search non-loc.",
- "Bonded F",
- "Bonded-FEP F",
- "Restraints F",
- "Listed buffer ops.",
- "Nonbonded pruning",
- "Nonbonded F kernel",
- "Nonbonded F clear",
- "Nonbonded FEP",
- "Launch NB GPU tasks",
- "Launch Bonded GPU tasks",
- "Launch PME GPU tasks",
- "Launch state copy",
- "Ewald F correction",
- "NB X buffer ops.",
- "NB F buffer ops.",
- "Clear force buffer",
- "Launch GPU NB X buffer ops.",
- "Launch GPU NB F buffer ops.",
- "Launch GPU Comm. coord.",
- "Launch GPU Comm. force.",
- "Launch GPU update",
- "Test subcounter",
-};
+static const char* enumValuetoString(WallCycleCounter enumValue)
+{
+ constexpr gmx::EnumerationArray<WallCycleCounter, const char*> wallCycleCounterNames = {
+ "Run",
+ "Step",
+ "PP during PME",
+ "Domain decomp.",
+ "DD comm. load",
+ "DD comm. bounds",
+ "Vsite constr.",
+ "Send X to PME",
+ "Neighbor search",
+ "Launch GPU ops.",
+ "Comm. coord.",
+ "Force",
+ "Wait + Comm. F",
+ "PME mesh",
+ "PME redist. X/F",
+ "PME spread",
+ "PME gather",
+ "PME 3D-FFT",
+ "PME 3D-FFT Comm.",
+ "PME solve LJ",
+ "PME solve Elec",
+ "PME wait for PP",
+ "Wait + Recv. PME F",
+ "Wait PME GPU spread",
+ "PME 3D-FFT",
+ "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
+ "Wait PME GPU gather",
+ "Wait Bonded GPU",
+ "Reduce GPU PME F",
+ "Wait GPU NB nonloc.",
+ "Wait GPU NB local",
+ "Wait GPU state copy",
+ "NB X/F buffer ops.",
+ "Vsite spread",
+ "COM pull force",
+ "AWH",
+ "Write traj.",
+ "Update",
+ "Constraints",
+ "Comm. energies",
+ "Enforced rotation",
+ "Add rot. forces",
+ "Position swapping",
+ "IMD",
+ "Test"
+ };
+ return wallCycleCounterNames[enumValue];
+}
+
+static const char* enumValuetoString(WallCycleSubCounter enumValue)
+{
+ constexpr gmx::EnumerationArray<WallCycleSubCounter, const char*> wallCycleSubCounterNames = {
+ "DD redist.",
+ "DD NS grid + sort",
+ "DD setup comm.",
+ "DD make top.",
+ "DD make constr.",
+ "DD top. other",
+ "DD GPU ops.",
+ "NS grid local",
+ "NS grid non-loc.",
+ "NS search local",
+ "NS search non-loc.",
+ "Bonded F",
+ "Bonded-FEP F",
+ "Restraints F",
+ "Listed buffer ops.",
+ "Nonbonded pruning",
+ "Nonbonded F kernel",
+ "Nonbonded F clear",
+ "Nonbonded FEP",
+ "Launch NB GPU tasks",
+ "Launch Bonded GPU tasks",
+ "Launch PME GPU tasks",
+ "Launch state copy",
+ "Ewald F correction",
+ "NB X buffer ops.",
+ "NB F buffer ops.",
+ "Clear force buffer",
+ "Launch GPU NB X buffer ops.",
+ "Launch GPU NB F buffer ops.",
+ "Launch GPU Comm. coord.",
+ "Launch GPU Comm. force.",
+ "Launch GPU update",
+ "Test subcounter"
+ };
+ return wallCycleSubCounterNames[enumValue];
+}
/* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
static const char* enumValuetoString(PmeStage enumValue)
return gmx_cycles_have_counter();
}
-gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr)
+std::unique_ptr<gmx_wallcycle> wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr)
{
- gmx_wallcycle_t wc;
+ std::unique_ptr<gmx_wallcycle> wc;
if (!wallcycle_have_counter())
{
- return nullptr;
+ return wc;
}
- snew(wc, 1);
+ wc = std::make_unique<gmx_wallcycle>();
- wc->haveInvalidCount = FALSE;
- wc->wc_barrier = FALSE;
- wc->wcc_all = nullptr;
+ wc->haveInvalidCount = false;
+ wc->wc_barrier = false;
wc->wc_depth = 0;
- wc->ewc_prev = -1;
+ wc->ewc_prev = WallCycleCounter::Count;
wc->reset_counters = resetstep;
wc->cr = cr;
{
fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
}
- wc->wc_barrier = TRUE;
+ wc->wc_barrier = true;
}
#endif
- snew(wc->wcc, ewcNR);
if (getenv("GMX_CYCLE_ALL") != nullptr)
{
if (fplog)
{
fprintf(fplog, "\nWill time all the code during the run\n\n");
}
- snew(wc->wcc_all, ewcNR * ewcNR);
- }
-
- if (sc_useCycleSubcounters)
- {
- snew(wc->wcsc, ewcsNR);
+ wc->wcc_all.resize(sc_numWallCycleCountersSquared);
}
#if DEBUG_WCYCLE
return wc;
}
-void wallcycle_destroy(gmx_wallcycle_t wc)
-{
- if (wc == nullptr)
- {
- return;
- }
-
- if (wc->wcc != nullptr)
- {
- sfree(wc->wcc);
- }
- if (wc->wcc_all != nullptr)
- {
- sfree(wc->wcc_all);
- }
- if (wc->wcsc != nullptr)
- {
- sfree(wc->wcsc);
- }
- sfree(wc);
-}
-
#if DEBUG_WCYCLE
-static void debug_start_check(gmx_wallcycle_t wc, int ewc)
+static void debug_start_check(gmx_wallcycle* wc, WallCycleCounter ewc)
{
- if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
+ if (wc->count_depth < 0 || wc->count_depth >= c_MaxWallCycleDepth)
{
gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth + 1);
}
if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
{
std::string indentStr(4 * wc->count_depth, ' ');
- fprintf(stderr, "%swcycle_start depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
+ fprintf(stderr, "%swcycle_start depth %d, %s\n", indentStr.c_str(), wc->count_depth, enumValuetoString(ewc));
}
}
-static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
+static void debug_stop_check(gmx_wallcycle* wc, WallCycleCounter ewc)
{
if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
{
std::string indentStr(4 * wc->count_depth, ' ');
- fprintf(stderr, "%swcycle_stop depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
+ fprintf(stderr, "%swcycle_stop depth %d, %s\n", indentStr.c_str(), wc->count_depth, enumValuetoString(ewc));
}
wc->count_depth--;
if (wc->count_depth < 0)
{
- gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc], wc->count_depth);
+ gmx_fatal(FARGS,
+ "wallcycle counter depth out of range when stopping %s: %d",
+ enumValuetoString(ewc),
+ wc->count_depth);
}
if (wc->counterlist[wc->count_depth] != ewc)
{
gmx_fatal(FARGS,
"wallcycle mismatch at stop, start %s, stop %s",
- wcn[wc->counterlist[wc->count_depth]],
- wcn[ewc]);
+ enumValuetoString(wc->counterlist[wc->count_depth]),
+ enumValuetoString(ewc));
}
}
#endif
-void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c)
+void wallcycle_get(gmx_wallcycle* wc, WallCycleCounter ewc, int* n, double* c)
{
*n = wc->wcc[ewc].n;
*c = static_cast<double>(wc->wcc[ewc].c);
}
-void wallcycle_sub_get(gmx_wallcycle_t wc, int ewcs, int* n, double* c)
+void wallcycle_sub_get(gmx_wallcycle* wc, WallCycleSubCounter ewcs, int* n, double* c)
{
if (sc_useCycleSubcounters && wc != nullptr)
{
}
}
-void wallcycle_reset_all(gmx_wallcycle_t wc)
+void wallcycle_reset_all(gmx_wallcycle* wc)
{
- int i;
-
if (wc == nullptr)
{
return;
}
- for (i = 0; i < ewcNR; i++)
+ for (auto& counter : wc->wcc)
{
- wc->wcc[i].n = 0;
- wc->wcc[i].c = 0;
+ counter.n = 0;
+ counter.c = 0;
}
- wc->haveInvalidCount = FALSE;
+ wc->haveInvalidCount = false;
- if (wc->wcc_all)
+ if (!wc->wcc_all.empty())
{
- for (i = 0; i < ewcNR * ewcNR; i++)
+ for (int i = 0; i < sc_numWallCycleCountersSquared; i++)
{
wc->wcc_all[i].n = 0;
wc->wcc_all[i].c = 0;
}
}
- if (wc->wcsc)
+ for (auto& counter : wc->wcsc)
{
- for (i = 0; i < ewcsNR; i++)
- {
- wc->wcsc[i].n = 0;
- wc->wcsc[i].c = 0;
- }
+ counter.n = 0;
+ counter.c = 0;
}
}
-static gmx_bool is_pme_counter(int ewc)
+static bool is_pme_counter(WallCycleCounter ewc)
{
- return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
+ return (ewc >= WallCycleCounter::PmeMesh && ewc <= WallCycleCounter::PmeWaitComm);
}
-static gmx_bool is_pme_subcounter(int ewc)
+static bool is_pme_subcounter(WallCycleCounter ewc)
{
- return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
+ return (ewc >= WallCycleCounter::PmeRedistXF && ewc < WallCycleCounter::PmeWaitComm);
}
void wallcycleBarrier(gmx_wallcycle* wc)
}
/* Subtract counter ewc_sub timed inside a timing block for ewc_main */
-static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub)
+// NOLINTNEXTLINE(google-runtime-references)
+static void subtract_cycles(gmx::EnumerationArray<WallCycleCounter, wallcc_t>& wcc,
+ WallCycleCounter ewc_main,
+ WallCycleCounter ewc_sub)
{
if (wcc[ewc_sub].n > 0)
{
}
}
-void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
+void wallcycle_scale_by_num_threads(gmx_wallcycle* wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
{
if (wc == nullptr)
{
return;
}
- for (int i = 0; i < ewcNR; i++)
+ for (auto key : keysOf(wc->wcc))
{
- if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
+ if (is_pme_counter(key) || (key == WallCycleCounter::Run && isPmeRank))
{
- wc->wcc[i].c *= nthreads_pme;
+ wc->wcc[key].c *= nthreads_pme;
- if (wc->wcc_all)
+ if (!wc->wcc_all.empty())
{
- for (int j = 0; j < ewcNR; j++)
+ const int current = static_cast<int>(key);
+ for (int j = 0; j < sc_numWallCycleCounters; j++)
{
- wc->wcc_all[i * ewcNR + j].c *= nthreads_pme;
+ wc->wcc_all[current * sc_numWallCycleCounters + j].c *= nthreads_pme;
}
}
}
else
{
- wc->wcc[i].c *= nthreads_pp;
+ wc->wcc[key].c *= nthreads_pp;
- if (wc->wcc_all)
+ if (!wc->wcc_all.empty())
{
- for (int j = 0; j < ewcNR; j++)
+ const int current = static_cast<int>(key);
+ for (int j = 0; j < sc_numWallCycleCounters; j++)
{
- wc->wcc_all[i * ewcNR + j].c *= nthreads_pp;
+ wc->wcc_all[current * sc_numWallCycleCounters + j].c *= nthreads_pp;
}
}
}
}
- if (sc_useCycleSubcounters && wc->wcsc && !isPmeRank)
+ if (sc_useCycleSubcounters && !isPmeRank)
{
- for (int i = 0; i < ewcsNR; i++)
+ for (auto counter : wc->wcsc)
{
- wc->wcsc[i].c *= nthreads_pp;
+ counter.c *= nthreads_pp;
}
}
}
* wcc_all are unused by the GPU reporting, but it is not satisfactory
* for the future. Also, there's no need for MPI_Allreduce, since
* only MASTERRANK uses any of the results. */
-WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc)
+WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle* wc)
{
- WallcycleCounts cycles_sum;
- wallcc_t* wcc;
- double cycles[int(ewcNR) + int(ewcsNR)];
+ WallcycleCounts cycles_sum;
+ gmx::EnumerationArray<WallCycleCounter, double> cyclesMain;
+ gmx::EnumerationArray<WallCycleSubCounter, double> cyclesSub;
#if GMX_MPI
- double cycles_n[int(ewcNR) + int(ewcsNR) + 1];
+ gmx::EnumerationArray<WallCycleCounter, double> cyclesMainOnNode;
+ gmx::EnumerationArray<WallCycleSubCounter, double> cyclesSubOnNode;
#endif
- int i;
- int nsum;
if (wc == nullptr)
{
return cycles_sum;
}
- wcc = wc->wcc;
+ auto& wcc = wc->wcc;
- subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
- subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
+ subtract_cycles(wcc, WallCycleCounter::Domdec, WallCycleCounter::DDCommLoad);
+ subtract_cycles(wcc, WallCycleCounter::Domdec, WallCycleCounter::DDCommBound);
- subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
+ subtract_cycles(wcc, WallCycleCounter::PmeFft, WallCycleCounter::PmeFftComm);
if (cr->npmenodes == 0)
{
/* All nodes do PME (or no PME at all) */
- subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
+ subtract_cycles(wcc, WallCycleCounter::Force, WallCycleCounter::PmeMesh);
}
else
{
/* The are PME-only nodes */
- if (wcc[ewcPMEMESH].n > 0)
+ if (wcc[WallCycleCounter::PmeMesh].n > 0)
{
/* This must be a PME only node, calculate the Wait + Comm. time */
- GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c,
+ GMX_ASSERT(wcc[WallCycleCounter::Run].c >= wcc[WallCycleCounter::PmeMesh].c,
"Total run ticks must be greater than PME-only ticks");
- wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
+ wcc[WallCycleCounter::PmeWaitComm].c =
+ wcc[WallCycleCounter::Run].c - wcc[WallCycleCounter::PmeMesh].c;
}
}
/* Store the cycles in a double buffer for summing */
- for (i = 0; i < ewcNR; i++)
+ for (auto key : keysOf(wcc))
{
#if GMX_MPI
- cycles_n[i] = static_cast<double>(wcc[i].n);
+ cyclesMainOnNode[key] = static_cast<double>(wcc[key].n);
#endif
- cycles[i] = static_cast<double>(wcc[i].c);
+ cyclesMain[key] = static_cast<double>(wcc[key].c);
}
- nsum = ewcNR;
- if (wc->wcsc)
+ if (sc_useCycleSubcounters)
{
- for (i = 0; i < ewcsNR; i++)
+ for (auto key : keysOf(wc->wcsc))
{
#if GMX_MPI
- cycles_n[ewcNR + i] = static_cast<double>(wc->wcsc[i].n);
+ cyclesSubOnNode[key] = static_cast<double>(wc->wcsc[key].n);
#endif
- cycles[ewcNR + i] = static_cast<double>(wc->wcsc[i].c);
+ cyclesSub[key] = static_cast<double>(wc->wcsc[key].c);
}
- nsum += ewcsNR;
}
#if GMX_MPI
if (cr->nnodes > 1)
{
- double buf[int(ewcNR) + int(ewcsNR) + 1];
+ gmx::EnumerationArray<WallCycleCounter, double> bufMain;
+ gmx::EnumerationArray<WallCycleSubCounter, double> bufSub;
// TODO this code is used only at the end of the run, so we
// can just do a simple reduce of haveInvalidCount in
// wallcycle_print, and avoid bugs
- cycles_n[nsum] = (wc->haveInvalidCount ? 1 : 0);
+ double haveInvalidCount = (wc->haveInvalidCount ? 1 : 0);
// TODO Use MPI_Reduce
- MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
- for (i = 0; i < ewcNR; i++)
+ MPI_Allreduce(cyclesMainOnNode.data(), bufMain.data(), bufMain.size(), MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
+ if (sc_useCycleSubcounters)
+ {
+ MPI_Allreduce(cyclesSubOnNode.data(), bufSub.data(), bufSub.size(), MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
+ }
+ MPI_Allreduce(MPI_IN_PLACE, &haveInvalidCount, 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
+ for (auto key : keysOf(wcc))
{
- wcc[i].n = gmx::roundToInt(buf[i]);
+ wcc[key].n = gmx::roundToInt(bufMain[key]);
}
- wc->haveInvalidCount = (buf[nsum] > 0);
- if (wc->wcsc)
+ wc->haveInvalidCount = (haveInvalidCount > 0);
+ if (sc_useCycleSubcounters)
{
- for (i = 0; i < ewcsNR; i++)
+ for (auto key : keysOf(wc->wcsc))
{
- wc->wcsc[i].n = gmx::roundToInt(buf[ewcNR + i]);
+ wc->wcsc[key].n = gmx::roundToInt(bufSub[key]);
}
}
// TODO Use MPI_Reduce
- MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
+ MPI_Allreduce(cyclesMain.data(), cycles_sum.data(), cyclesMain.size(), MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
+ if (sc_useCycleSubcounters)
+ {
+ MPI_Allreduce(cyclesSub.data(),
+ cycles_sum.data() + sc_numWallCycleCounters,
+ cyclesSub.size(),
+ MPI_DOUBLE,
+ MPI_SUM,
+ cr->mpi_comm_mysim);
+ }
- if (wc->wcc_all != nullptr)
+ if (!wc->wcc_all.empty())
{
- double *buf_all, *cyc_all;
+ std::array<double, sc_numWallCycleCountersSquared> cyc_all;
+ std::array<double, sc_numWallCycleCountersSquared> buf_all;
- snew(cyc_all, ewcNR * ewcNR);
- snew(buf_all, ewcNR * ewcNR);
- for (i = 0; i < ewcNR * ewcNR; i++)
+ for (int i = 0; i < sc_numWallCycleCountersSquared; i++)
{
cyc_all[i] = wc->wcc_all[i].c;
}
// TODO Use MPI_Reduce
- MPI_Allreduce(cyc_all, buf_all, ewcNR * ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
- for (i = 0; i < ewcNR * ewcNR; i++)
+ MPI_Allreduce(cyc_all.data(),
+ buf_all.data(),
+ sc_numWallCycleCountersSquared,
+ MPI_DOUBLE,
+ MPI_SUM,
+ cr->mpi_comm_mysim);
+ for (int i = 0; i < sc_numWallCycleCountersSquared; i++)
{
wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
}
- sfree(buf_all);
- sfree(cyc_all);
}
}
else
#endif
{
- for (i = 0; i < nsum; i++)
+ for (auto key : keysOf(cyclesMain))
{
- cycles_sum[i] = cycles[i];
+ cycles_sum[static_cast<int>(key)] = cyclesMain[key];
+ }
+ if (sc_useCycleSubcounters)
+ {
+ for (auto key : keysOf(cyclesSub))
+ {
+ const int offset = static_cast<int>(key) + sc_numWallCycleCounters;
+ cycles_sum[offset] = cyclesSub[key];
+ }
}
}
int nth_pp,
int nth_pme,
double realtime,
- gmx_wallcycle_t wc,
+ gmx_wallcycle* wc,
const WallcycleCounts& cyc_sum,
const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
const gmx_wallclock_gpu_pme_t* gpu_pme_t)
{
double tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
double c2t, c2t_pp, c2t_pme = 0;
- int i, j, npp, nth_tot;
+ int npp, nth_tot;
char buf[STRLEN];
const char* hline =
"-----------------------------------------------------------------------------";
/* When using PME-only nodes, the next line is valid for both
PP-only and PME-only nodes because they started ewcRUN at the
same time. */
- tot = cyc_sum[ewcRUN];
+ tot = cyc_sum[static_cast<int>(WallCycleCounter::Run)];
tot_for_pp = 0;
if (tot <= 0.0)
print_header(fplog, npp, nth_pp, npme, nth_pme);
fprintf(fplog, "%s\n", hline);
- for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
+ gmx::EnumerationWrapper<WallCycleCounter> iter;
+ for (auto key = gmx::EnumerationIterator<WallCycleCounter>(WallCycleCounter::Domdec);
+ key != iter.end();
+ ++key)
{
- if (is_pme_subcounter(i))
+
+ if (is_pme_subcounter(*key))
{
/* Do not count these at all */
}
- else if (npme > 0 && is_pme_counter(i))
+ else if (npme > 0 && is_pme_counter(*key))
{
/* Print timing information for PME-only nodes, but add an
* asterisk so the reader of the table can know that the
* walltimes are not meant to add up. The asterisk still
* fits in the required maximum of 19 characters. */
- char buffer[STRLEN];
- snprintf(buffer, STRLEN, "%s *", wcn[i]);
- print_cycles(fplog, c2t_pme, buffer, npme, nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
+ std::string message = gmx::formatString("%s *", enumValuetoString(*key));
+ print_cycles(fplog,
+ c2t_pme,
+ message.c_str(),
+ npme,
+ nth_pme,
+ wc->wcc[*key].n,
+ cyc_sum[static_cast<int>(*key)],
+ tot);
}
else
{
/* Print timing information when it is for a PP or PP+PME
node */
- print_cycles(fplog, c2t_pp, wcn[i], npp, nth_pp, wc->wcc[i].n, cyc_sum[i], tot);
- tot_for_pp += cyc_sum[i];
+ print_cycles(fplog,
+ c2t_pp,
+ enumValuetoString(*key),
+ npp,
+ nth_pp,
+ wc->wcc[*key].n,
+ cyc_sum[static_cast<int>(*key)],
+ tot);
+ tot_for_pp += cyc_sum[static_cast<int>(*key)];
}
}
- if (wc->wcc_all != nullptr)
+ if (!wc->wcc_all.empty())
{
- for (i = 0; i < ewcNR; i++)
+ for (auto i : keysOf(wc->wcc))
{
- for (j = 0; j < ewcNR; j++)
+ const int countI = static_cast<int>(i);
+ for (auto j : keysOf(wc->wcc))
{
- snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
+ const int countJ = static_cast<int>(j);
+ snprintf(buf, 20, "%-9.9s %-9.9s", enumValuetoString(i), enumValuetoString(j));
print_cycles(fplog,
c2t_pp,
buf,
npp,
nth_pp,
- wc->wcc_all[i * ewcNR + j].n,
- wc->wcc_all[i * ewcNR + j].c,
+ wc->wcc_all[countI * sc_numWallCycleCounters + countJ].n,
+ wc->wcc_all[countI * sc_numWallCycleCounters + countJ].c,
tot);
}
}
hline);
}
- if (wc->wcc[ewcPMEMESH].n > 0)
+ if (wc->wcc[WallCycleCounter::PmeMesh].n > 0)
{
// A workaround to not print breakdown when no subcounters were recorded.
// TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
- std::vector<int> validPmeSubcounterIndices;
- for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
+ std::vector<WallCycleCounter> validPmeSubcounterIndices;
+ for (auto key = gmx::EnumerationIterator<WallCycleCounter>(WallCycleCounter::Domdec);
+ key != iter.end();
+ key++)
{
- if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
+ if (is_pme_subcounter(*key) && wc->wcc[*key].n > 0)
{
- validPmeSubcounterIndices.push_back(i);
+ validPmeSubcounterIndices.push_back(*key);
}
}
{
print_cycles(fplog,
npme > 0 ? c2t_pme : c2t_pp,
- wcn[i],
+ enumValuetoString(i),
npme > 0 ? npme : npp,
nth_pme,
wc->wcc[i].n,
- cyc_sum[i],
+ cyc_sum[static_cast<int>(i)],
tot);
}
fprintf(fplog, "%s\n", hline);
}
}
- if (sc_useCycleSubcounters && wc->wcsc)
+ if (sc_useCycleSubcounters)
{
fprintf(fplog, " Breakdown of PP computation\n");
fprintf(fplog, "%s\n", hline);
- for (i = 0; i < ewcsNR; i++)
+ for (auto key : keysOf(wc->wcsc))
{
- print_cycles(fplog, c2t_pp, wcsn[i], npp, nth_pp, wc->wcsc[i].n, cyc_sum[ewcNR + i], tot);
+ print_cycles(fplog,
+ c2t_pp,
+ enumValuetoString(key),
+ npp,
+ nth_pp,
+ wc->wcsc[key].n,
+ cyc_sum[sc_numWallCycleCounters + static_cast<int>(key)],
+ tot);
}
fprintf(fplog, "%s\n", hline);
}
tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
/* add up the kernel timings */
- for (i = 0; i < 2; i++)
+ for (int i = 0; i < 2; i++)
{
- for (j = 0; j < 2; j++)
+ for (int j = 0; j < 2; j++)
{
tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
}
}
tot_gpu += gpu_nbnxn_t->pruneTime.t;
- tot_cpu_overlap = wc->wcc[ewcFORCE].c;
- if (wc->wcc[ewcPMEMESH].n > 0)
+ tot_cpu_overlap = wc->wcc[WallCycleCounter::Force].c;
+ if (wc->wcc[WallCycleCounter::PmeMesh].n > 0)
{
- tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
+ tot_cpu_overlap += wc->wcc[WallCycleCounter::PmeMesh].c;
}
tot_cpu_overlap *= realtime * 1000 / tot; /* convert s to ms */
print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
- for (i = 0; i < 2; i++)
+ for (int i = 0; i < 2; i++)
{
- for (j = 0; j < 2; j++)
+ for (int j = 0; j < 2; j++)
{
if (gpu_nbnxn_t->ktime[i][j].c)
{
fprintf(fplog, "%s\n", hline);
}
gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;
- if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
+ if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[WallCycleCounter::Force].n > 0)
{
fprintf(fplog,
"\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = "
"%.3f\n",
tot_gpu / gpu_nbnxn_t->nb_c,
- tot_cpu_overlap / wc->wcc[ewcFORCE].n,
+ tot_cpu_overlap / wc->wcc[WallCycleCounter::Force].n,
gpu_cpu_ratio);
}
/* only print notes related to CPU-GPU load balance with PME */
- if (wc->wcc[ewcPMEMESH].n > 0)
+ if (wc->wcc[WallCycleCounter::PmeMesh].n > 0)
{
fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
"call, so timings are not those of real runs.");
}
- if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cyc_sum[ewcDOMDEC] > tot * 0.1 || cyc_sum[ewcNS] > tot * 0.1))
+ if (wc->wcc[WallCycleCounter::NbXFBufOps].n > 0
+ && (cyc_sum[static_cast<int>(WallCycleCounter::Domdec)] > tot * 0.1
+ || cyc_sum[static_cast<int>(WallCycleCounter::NS)] > tot * 0.1))
{
/* Only the sim master calls this function, so always print to stderr */
- if (wc->wcc[ewcDOMDEC].n == 0)
+ if (wc->wcc[WallCycleCounter::Domdec].n == 0)
{
GMX_LOG(mdlog.warning)
.asParagraph()
"NOTE: %d %% of the run time was spent in pair search,\n"
" you might want to increase nstlist (this has no effect on "
"accuracy)\n",
- gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
+ gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::NS)] / tot));
}
else
{
" %d %% of the run time was spent in pair search,\n"
" you might want to increase nstlist (this has no effect on "
"accuracy)\n",
- gmx::roundToInt(100 * cyc_sum[ewcDOMDEC] / tot),
- gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
+ gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::Domdec)] / tot),
+ gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::NS)] / tot));
}
}
- if (cyc_sum[ewcMoveE] > tot * 0.05)
+ if (cyc_sum[static_cast<int>(WallCycleCounter::MoveE)] > tot * 0.05)
{
GMX_LOG(mdlog.warning)
.asParagraph()
.appendTextFormatted(
"NOTE: %d %% of the run time was spent communicating energies,\n"
" you might want to increase some nst* mdp options\n",
- gmx::roundToInt(100 * cyc_sum[ewcMoveE] / tot));
+ gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::MoveE)] / tot));
}
}
-extern int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
+int64_t wcycle_get_reset_counters(gmx_wallcycle* wc)
{
if (wc == nullptr)
{
return -1;
}
-
return wc->reset_counters;
}
-extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters)
+void wcycle_set_reset_counters(gmx_wallcycle* wc, int64_t reset_counters)
{
if (wc == nullptr)
{
return;
}
-
wc->reset_counters = reset_counters;
}
#include <stdio.h>
+#include <array>
+#include <memory>
+#include <vector>
+
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/enumerationhelpers.h"
+
+#ifndef DEBUG_WCYCLE
+/*! \brief Enables consistency checking for the counters.
+ *
+ * If the macro is set to 1, code checks if you stop a counter different from the last
+ * one that was opened and if you do nest too deep.
+ */
+# define DEBUG_WCYCLE 0
+#endif
-typedef struct gmx_wallcycle* gmx_wallcycle_t;
struct t_commrec;
-static constexpr gmx_wallcycle* nullWallcycle = nullptr;
#ifndef DEBUG_WCYCLE
/*! \brief Enables consistency checking for the counters.
# define DEBUG_WCYCLE 0
#endif
-enum
+enum class WallCycleCounter : int
{
- ewcRUN,
- ewcSTEP,
- ewcPPDURINGPME,
- ewcDOMDEC,
- ewcDDCOMMLOAD,
- ewcDDCOMMBOUND,
- ewcVSITECONSTR,
- ewcPP_PMESENDX,
- ewcNS,
- ewcLAUNCH_GPU,
- ewcMOVEX,
- ewcFORCE,
- ewcMOVEF,
- ewcPMEMESH,
- ewcPME_REDISTXF,
- ewcPME_SPREAD,
- ewcPME_GATHER,
- ewcPME_FFT,
- ewcPME_FFTCOMM,
- ewcLJPME,
- ewcPME_SOLVE,
- ewcPMEWAITCOMM,
- ewcPP_PMEWAITRECVF,
- ewcWAIT_GPU_PME_SPREAD,
- ewcPME_FFT_MIXED_MODE,
- ewcPME_SOLVE_MIXED_MODE,
- ewcWAIT_GPU_PME_GATHER,
- ewcWAIT_GPU_BONDED,
- ewcPME_GPU_F_REDUCTION,
- ewcWAIT_GPU_NB_NL,
- ewcWAIT_GPU_NB_L,
- ewcWAIT_GPU_STATE_PROPAGATOR_DATA,
- ewcNB_XF_BUF_OPS,
- ewcVSITESPREAD,
- ewcPULLPOT,
- ewcAWH,
- ewcTRAJ,
- ewcUPDATE,
- ewcCONSTR,
- ewcMoveE,
- ewcROT,
- ewcROTadd,
- ewcSWAP,
- ewcIMD,
- ewcTEST,
- ewcNR
+ Run,
+ Step,
+ PpDuringPme,
+ Domdec,
+ DDCommLoad,
+ DDCommBound,
+ VsiteConstr,
+ PpPmeSendX,
+ NS,
+ LaunchGpu,
+ MoveX,
+ Force,
+ MoveF,
+ PmeMesh,
+ PmeRedistXF,
+ PmeSpread,
+ PmeGather,
+ PmeFft,
+ PmeFftComm,
+ LJPme,
+ PmeSolve,
+ PmeWaitComm,
+ PpPmeWaitRecvF,
+ WaitGpuPmeSpread,
+ PmeFftMixedMode,
+ PmeSolveMixedMode,
+ WaitGpuPmeGather,
+ WaitGpuBonded,
+ PmeGpuFReduction,
+ WaitGpuNbNL,
+ WaitGpuNbL,
+ WaitGpuStatePropagatorData,
+ NbXFBufOps,
+ VsiteSpread,
+ PullPot,
+ Awh,
+ Traj,
+ Update,
+ Constr,
+ MoveE,
+ Rot,
+ RotAdd,
+ Swap,
+ Imd,
+ Test,
+ Count
};
-enum
+enum class WallCycleSubCounter : int
{
- ewcsDD_REDIST,
- ewcsDD_GRID,
- ewcsDD_SETUPCOMM,
- ewcsDD_MAKETOP,
- ewcsDD_MAKECONSTR,
- ewcsDD_TOPOTHER,
- ewcsDD_GPU,
- ewcsNBS_GRID_LOCAL,
- ewcsNBS_GRID_NONLOCAL,
- ewcsNBS_SEARCH_LOCAL,
- ewcsNBS_SEARCH_NONLOCAL,
- ewcsLISTED,
- ewcsLISTED_FEP,
- ewcsRESTRAINTS,
- ewcsLISTED_BUF_OPS,
- ewcsNONBONDED_PRUNING,
- ewcsNONBONDED_KERNEL,
- ewcsNONBONDED_CLEAR,
- ewcsNONBONDED_FEP,
- ewcsLAUNCH_GPU_NONBONDED,
- ewcsLAUNCH_GPU_BONDED,
- ewcsLAUNCH_GPU_PME,
- ewcsLAUNCH_STATE_PROPAGATOR_DATA,
- ewcsEWALD_CORRECTION,
- ewcsNB_X_BUF_OPS,
- ewcsNB_F_BUF_OPS,
- ewcsCLEAR_FORCE_BUFFER,
- ewcsLAUNCH_GPU_NB_X_BUF_OPS,
- ewcsLAUNCH_GPU_NB_F_BUF_OPS,
- ewcsLAUNCH_GPU_MOVEX,
- ewcsLAUNCH_GPU_MOVEF,
- ewcsLAUNCH_GPU_UPDATE_CONSTRAIN,
- ewcsTEST,
- ewcsNR
+ DDRedist,
+ DDGrid,
+ DDSetupComm,
+ DDMakeTop,
+ DDMakeConstr,
+ DDTopOther,
+ DDGpu,
+ NBSGridLocal,
+ NBSGridNonLocal,
+ NBSSearchLocal,
+ NBSSearchNonLocal,
+ Listed,
+ ListedFep,
+ Restraints,
+ ListedBufOps,
+ NonbondedPruning,
+ NonbondedKernel,
+ NonbondedClear,
+ NonbondedFep,
+ LaunchGpuNonBonded,
+ LaunchGpuBonded,
+ LaunchGpuPme,
+ LaunchStatePropagatorData,
+ EwaldCorrection,
+ NBXBufOps,
+ NBFBufOps,
+ ClearForceBuffer,
+ LaunchGpuNBXBufOps,
+ LaunchGpuNBFBufOps,
+ LaunchGpuMoveX,
+ LaunchGpuMoveF,
+ LaunchGpuUpdateConstrain,
+ Test,
+ Count
};
-static constexpr const bool sc_useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
+static constexpr int sc_numWallCycleCounters = static_cast<int>(WallCycleCounter::Count);
+static constexpr int sc_numWallCycleSubCounters = static_cast<int>(WallCycleSubCounter::Count);
+static constexpr int sc_numWallCycleCountersSquared = sc_numWallCycleCounters * sc_numWallCycleCounters;
+static constexpr bool sc_useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
struct wallcc_t
{
struct gmx_wallcycle
{
- wallcc_t* wcc;
+ gmx::EnumerationArray<WallCycleCounter, wallcc_t> wcc;
/* did we detect one or more invalid cycle counts */
bool haveInvalidCount;
/* variables for testing/debugging */
- bool wc_barrier;
- wallcc_t* wcc_all;
- int wc_depth;
+ bool wc_barrier;
+ std::vector<wallcc_t> wcc_all;
+ int wc_depth;
#if DEBUG_WCYCLE
- int* counterlist;
- int count_depth;
- bool isMasterRank;
+ std::array<WallCycleCounter, c_MaxWallCycleDepth> counterlist;
+ int count_depth;
+ bool isMasterRank;
#endif
- int ewc_prev;
- gmx_cycles_t cycle_prev;
- int64_t reset_counters;
- const t_commrec* cr;
- wallcc_t* wcsc;
+ WallCycleCounter ewc_prev;
+ gmx_cycles_t cycle_prev;
+ int64_t reset_counters;
+ const t_commrec* cr;
+ gmx::EnumerationArray<WallCycleSubCounter, wallcc_t> wcsc;
};
-//! Returns whether cycle counting is supported.
+//! Returns if cycle counting is supported
bool wallcycle_have_counter();
-/*! \brief
- * Returns a wallcycle datastructure.
- *
- * If cycle counting is not supported, returns nullptr instead.
- */
-gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr);
-
-//! Cleans up wallcycle structure.
-void wallcycle_destroy(gmx_wallcycle_t wc);
+//! Returns the wall cycle structure.
+std::unique_ptr<gmx_wallcycle> wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr);
//! Adds custom barrier for wallcycle counting.
void wallcycleBarrier(gmx_wallcycle* wc);
-inline void wallcycle_all_start(gmx_wallcycle* wc, int ewc, gmx_cycles_t cycle)
+void wallcycle_sub_get(gmx_wallcycle* wc, WallCycleSubCounter ewcs, int* n, double* c);
+/* Returns the cumulative count and sub cycle count for ewcs */
+
+inline void wallcycle_all_start(gmx_wallcycle* wc, WallCycleCounter ewc, gmx_cycles_t cycle)
{
wc->ewc_prev = ewc;
wc->cycle_prev = cycle;
}
-inline void wallcycle_all_stop(gmx_wallcycle* wc, int ewc, gmx_cycles_t cycle)
+inline void wallcycle_all_stop(gmx_wallcycle* wc, WallCycleCounter ewc, gmx_cycles_t cycle)
{
- const int prev = wc->ewc_prev;
- const int current = ewc;
- wc->wcc_all[prev * ewcNR + current].n += 1;
- wc->wcc_all[prev * ewcNR + current].c += cycle - wc->cycle_prev;
+ const int prev = static_cast<int>(wc->ewc_prev);
+ const int current = static_cast<int>(ewc);
+ wc->wcc_all[prev * sc_numWallCycleCounters + current].n += 1;
+ wc->wcc_all[prev * sc_numWallCycleCounters + current].c += cycle - wc->cycle_prev;
}
-//! Starts the cycle counter for \c ewc (and increases the call count).
-inline void wallcycle_start(gmx_wallcycle_t wc, int ewc)
+//! Starts the cycle counter (and increases the call count)
+inline void wallcycle_start(gmx_wallcycle* wc, WallCycleCounter ewc)
{
if (wc == nullptr)
{
#endif
gmx_cycles_t cycle = gmx_cycles_read();
wc->wcc[ewc].start = cycle;
- if (wc->wcc_all)
+ if (!wc->wcc_all.empty())
{
wc->wc_depth++;
- if (ewc == ewcRUN)
+ if (ewc == WallCycleCounter::Run)
{
wallcycle_all_start(wc, ewc, cycle);
}
}
}
-//! Starts the cycle counter for \c ewc without increasing the call count.
-inline void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
+//! Starts the cycle counter without increasing the call count
+inline void wallcycle_start_nocount(gmx_wallcycle* wc, WallCycleCounter ewc)
{
if (wc == nullptr)
{
wc->wcc[ewc].n++;
}
-//! Stop the cycle count for \c ewc, returns the last cycle count.
-inline double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
+//! Stop the cycle count for ewc , returns the last cycle count
+inline double wallcycle_stop(gmx_wallcycle* wc, WallCycleCounter ewc)
{
gmx_cycles_t cycle, last;
}
wc->wcc[ewc].c += last;
wc->wcc[ewc].n++;
- if (wc->wcc_all)
+ if (!wc->wcc_all.empty())
{
wc->wc_depth--;
- if (ewc == ewcRUN)
+ if (ewc == WallCycleCounter::Run)
{
wallcycle_all_stop(wc, ewc, cycle);
}
return last;
}
-//! Only increment call count for \c ewc by one.
-inline void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc)
+//! Only increment call count for ewc by one
+inline void wallcycle_increment_event_count(gmx_wallcycle* wc, WallCycleCounter ewc)
{
if (wc == nullptr)
{
wc->wcc[ewc].n++;
}
-//! Returns the cumulative count and cycle count for \c ewc.
-void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c);
-
-//! Returns the cumulative count and sub cycle count for \c ewcs.
-void wallcycle_sub_get(gmx_wallcycle_t wc, int ewcs, int* n, double* c);
+//! Returns the cumulative count and cycle count for ewc
+void wallcycle_get(gmx_wallcycle* wc, WallCycleCounter ewc, int* n, double* c);
-//! Resets all cycle counters to zero.
-void wallcycle_reset_all(gmx_wallcycle_t wc);
+//! Resets all cycle counters to zero
+void wallcycle_reset_all(gmx_wallcycle* wc);
-//! Scale the cycle counts to reflect how many threads run for that number of cycles.
-void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme);
+//! Scale the cycle counts to reflect how many threads run for that number of cycles
+void wallcycle_scale_by_num_threads(gmx_wallcycle* wc, bool isPmeRank, int nthreads_pp, int nthreads_pme);
-//! Return reset_counters.
-int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc);
+//! Return reset_counters from wc struct
+int64_t wcycle_get_reset_counters(gmx_wallcycle* wc);
-//! Set reset_counters.
-void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters);
+//! Set reset_counters
+void wcycle_set_reset_counters(gmx_wallcycle* wc, int64_t reset_counters);
-//! Set the start sub cycle count for \c ewcs.
-inline void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
+//! Set the start sub cycle count for ewcs
+inline void wallcycle_sub_start(gmx_wallcycle* wc, WallCycleSubCounter ewcs)
{
if (sc_useCycleSubcounters && wc != nullptr)
{
}
}
-//! Set the start sub cycle count for \c ewcs without increasing the call count.
-inline void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
+//! Set the start sub cycle count for ewcs without increasing the call count
+inline void wallcycle_sub_start_nocount(gmx_wallcycle* wc, WallCycleSubCounter ewcs)
{
if (sc_useCycleSubcounters && wc != nullptr)
{
}
}
-//! Stop the sub cycle count for \c ewcs.
-inline void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
+//! Stop the sub cycle count for ewcs
+inline void wallcycle_sub_stop(gmx_wallcycle* wc, WallCycleSubCounter ewcs)
{
if (sc_useCycleSubcounters && wc != nullptr)
{
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2008, The GROMACS development team.
* Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include <array>
+#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/basedefinitions.h"
struct t_commrec;
class MDLogger;
}
-typedef struct gmx_wallcycle* gmx_wallcycle_t;
struct gmx_wallclock_gpu_nbnxn_t;
struct gmx_wallclock_gpu_pme_t;
-typedef std::array<double, int(ewcNR) + int(ewcsNR)> WallcycleCounts;
+using WallcycleCounts = std::array<double, sc_numWallCycleCounters + sc_numWallCycleSubCounters>;
/* Convenience typedef */
-WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc);
+WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle* wc);
/* Return a vector of the sum of cycle counts over the nodes in
cr->mpi_comm_mysim. */
int nth_pp,
int nth_pme,
double realtime,
- gmx_wallcycle_t wc,
+ gmx_wallcycle* wc,
const WallcycleCounts& cyc_sum,
const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
const gmx_wallclock_gpu_pme_t* gpu_pme_t);