From acb7e888496456d23f0d94f0d53f50f29713f098 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= Date: Thu, 27 Jun 2019 17:26:50 +0200 Subject: [PATCH] Create dedicated subcounter for nonbonded FEP Now all nonbonded work has their own separate subcoutners which allows measuring the performance of each task separately. Refs #2997 Change-Id: I601445364592923d08087a858da4629b0b58ae76 --- src/gromacs/mdlib/sim_util.cpp | 13 ++----------- src/gromacs/nbnxm/kerneldispatch.cpp | 13 ++++++++----- src/gromacs/nbnxm/nbnxm.h | 3 ++- src/gromacs/timing/wallcycle.cpp | 5 +++-- src/gromacs/timing/wallcycle.h | 5 +++-- 5 files changed, 18 insertions(+), 21 deletions(-) diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 8cb7e5a2c2..baecb96ba1 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -346,16 +346,9 @@ static void do_nb_verlet(t_forcerec *fr, nbv->dispatchPruneKernelCpu(ilocality, fr->shift_vec); wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING); } - - wallcycle_sub_start(wcycle, ewcsNONBONDED); } nbv->dispatchNonbondedKernel(ilocality, *ic, flags, clearF, *fr, enerd, nrnb, wcycle); - - if (!nbv->useGpu()) - { - wallcycle_sub_stop(wcycle, ewcsNONBONDED); - } } static inline void clear_rvecs_omp(int n, rvec v[]) @@ -1259,20 +1252,18 @@ void do_force(FILE *fplog, /* Calculate the local and non-local free energy interactions here. * Happens here on the CPU both with and without GPU. */ - wallcycle_sub_start(wcycle, ewcsNONBONDED); nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::Local, fr, as_rvec_array(x.unpaddedArrayRef().data()), forceOut.f, *mdatoms, inputrec->fepvals, lambda.data(), - enerd, flags, nrnb); + enerd, flags, nrnb, wcycle); if (havePPDomainDecomposition(cr)) { nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::NonLocal, fr, as_rvec_array(x.unpaddedArrayRef().data()), forceOut.f, *mdatoms, inputrec->fepvals, lambda.data(), - enerd, flags, nrnb); + enerd, flags, nrnb, wcycle); } - wallcycle_sub_stop(wcycle, ewcsNONBONDED); } if (!bUseOrEmulGPU) diff --git a/src/gromacs/nbnxm/kerneldispatch.cpp b/src/gromacs/nbnxm/kerneldispatch.cpp index 52bb7c2e82..99b96d11b6 100644 --- a/src/gromacs/nbnxm/kerneldispatch.cpp +++ b/src/gromacs/nbnxm/kerneldispatch.cpp @@ -241,7 +241,7 @@ nbnxn_kernel_cpu(const PairlistSet &pairlistSet, gmx::ArrayRef pairlists = pairlistSet.cpuLists(); int gmx_unused nthreads = gmx_omp_nthreads_get(emntNonbonded); - wallcycle_sub_start(wcycle, ewcsNBFCLEARBUF); + wallcycle_sub_start(wcycle, ewcsNONBONDED_CLEAR); #pragma omp parallel for schedule(static) num_threads(nthreads) for (int nb = 0; nb < pairlists.ssize(); nb++) { @@ -258,8 +258,8 @@ nbnxn_kernel_cpu(const PairlistSet &pairlistSet, if (nb == 0) { - wallcycle_sub_stop(wcycle, ewcsNBFCLEARBUF); - wallcycle_sub_start(wcycle, ewcsNBFKERNEL); + wallcycle_sub_stop(wcycle, ewcsNONBONDED_CLEAR); + wallcycle_sub_start(wcycle, ewcsNONBONDED_KERNEL); } // TODO: Change to reference @@ -393,7 +393,7 @@ nbnxn_kernel_cpu(const PairlistSet &pairlistSet, } } } - wallcycle_sub_stop(wcycle, ewcsNBFKERNEL); + wallcycle_sub_stop(wcycle, ewcsNONBONDED_KERNEL); if (forceFlags & GMX_FORCE_ENERGY) { @@ -527,7 +527,8 @@ nonbonded_verlet_t::dispatchFreeEnergyKernel(Nbnxm::InteractionLocality iLocali real *lambda, gmx_enerdata_t *enerd, const int forceFlags, - t_nrnb *nrnb) + t_nrnb *nrnb, + gmx_wallcycle *wcycle) { const auto nbl_fep = pairlistSets().pairlistSet(iLocality).fepLists(); @@ -566,6 +567,7 @@ nonbonded_verlet_t::dispatchFreeEnergyKernel(Nbnxm::InteractionLocality iLocali GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_fep.ssize(), "Number of lists should be same as number of NB threads"); + wallcycle_sub_start(wcycle, ewcsNONBONDED_FEP); #pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize()) for (int th = 0; th < nbl_fep.ssize(); th++) { @@ -622,4 +624,5 @@ nonbonded_verlet_t::dispatchFreeEnergyKernel(Nbnxm::InteractionLocality iLocali enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT]; } } + wallcycle_sub_stop(wcycle, ewcsNONBONDED_FEP); } diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index 3c2d1f6af1..b8bdb853cb 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -291,7 +291,8 @@ struct nonbonded_verlet_t real *lambda, gmx_enerdata_t *enerd, int forceFlags, - t_nrnb *nrnb); + t_nrnb *nrnb, + gmx_wallcycle *wcycle); //! Add the forces stored in nbat to f, zeros the forces in nbat */ void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality locality, diff --git a/src/gromacs/timing/wallcycle.cpp b/src/gromacs/timing/wallcycle.cpp index ffe8b1eaa2..e558a4b445 100644 --- a/src/gromacs/timing/wallcycle.cpp +++ b/src/gromacs/timing/wallcycle.cpp @@ -126,8 +126,9 @@ static const char *wcsn[ewcsNR] = "Restraints F", "Listed buffer ops.", "Nonbonded pruning", - "Nonbonded F", - "NB F kernel", "NB F clear buf", + "Nonbonded F kernel", + "Nonbonded F clear", + "Nonbonded FEP", "Launch NB GPU tasks", "Launch Bonded GPU tasks", "Launch PME GPU tasks", diff --git a/src/gromacs/timing/wallcycle.h b/src/gromacs/timing/wallcycle.h index 58d1e50033..f2377ebe5c 100644 --- a/src/gromacs/timing/wallcycle.h +++ b/src/gromacs/timing/wallcycle.h @@ -72,8 +72,9 @@ enum { ewcsRESTRAINTS, ewcsLISTED_BUF_OPS, ewcsNONBONDED_PRUNING, - ewcsNONBONDED, - ewcsNBFKERNEL, ewcsNBFCLEARBUF, + ewcsNONBONDED_KERNEL, + ewcsNONBONDED_CLEAR, + ewcsNONBONDED_FEP, ewcsLAUNCH_GPU_NONBONDED, ewcsLAUNCH_GPU_BONDED, ewcsLAUNCH_GPU_PME, -- 2.22.0