From 472216d5a7b07347b5abb8dde0e6e4f1ce5ecb99 Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Tue, 19 Feb 2019 15:02:27 +0100 Subject: [PATCH] Hide internals of nbnxm pairlist Introduced the PairlistSets class which holds all data related to the nbnxm pair lists. The actual details of the lists are no longer available outside the nbnxm module. Change-Id: If4f36a379f2a6a133435b8ee82de8abfed5c63fd --- src/gromacs/ewald/pme_load_balancing.cpp | 11 +- src/gromacs/mdlib/sim_util.cpp | 41 ++--- src/gromacs/mdrun/md.cpp | 2 +- src/gromacs/nbnxm/atomdata.cpp | 30 +++- src/gromacs/nbnxm/atomdata.h | 8 +- src/gromacs/nbnxm/grid.cpp | 4 +- src/gromacs/nbnxm/kerneldispatch.cpp | 41 +++-- src/gromacs/nbnxm/nbnxm.h | 190 ++++++++++++++++------ src/gromacs/nbnxm/nbnxm_setup.cpp | 102 +++++++----- src/gromacs/nbnxm/pairlist.cpp | 57 +++++-- src/gromacs/nbnxm/pairlist.h | 25 +-- src/gromacs/nbnxm/pairlistset.cpp | 14 -- src/gromacs/nbnxm/pairlistset.h | 4 +- src/gromacs/nbnxm/prunekerneldispatch.cpp | 20 ++- 14 files changed, 327 insertions(+), 222 deletions(-) diff --git a/src/gromacs/ewald/pme_load_balancing.cpp b/src/gromacs/ewald/pme_load_balancing.cpp index 778c7c2c48..2960abf33f 100644 --- a/src/gromacs/ewald/pme_load_balancing.cpp +++ b/src/gromacs/ewald/pme_load_balancing.cpp @@ -793,11 +793,8 @@ pme_load_balance(pme_load_balancing_t *pme_lb, set = &pme_lb->setup[pme_lb->cur]; - NbnxnListParameters *listParams = nbv->listParams.get(); - ic->rcoulomb = set->rcut_coulomb; - listParams->rlistOuter = set->rlistOuter; - listParams->rlistInner = set->rlistInner; + nbv->pairlistSets_->changeRadii(set->rlistOuter, set->rlistInner); ic->ewaldcoeff_q = set->ewaldcoeff_q; /* TODO: centralize the code that sets the potentials shifts */ if (ic->coulomb_modifier == eintmodPOTSHIFT) @@ -825,7 +822,7 @@ pme_load_balance(pme_load_balancing_t *pme_lb, /* We always re-initialize the tables whether they are used or not */ init_interaction_const_tables(nullptr, ic, rtab); - 
Nbnxm::gpu_pme_loadbal_update_param(nbv, ic, listParams); + Nbnxm::gpu_pme_loadbal_update_param(nbv, ic, &nbv->pairlistSets().params()); if (!pme_lb->bSepPMERanks) { @@ -997,7 +994,7 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb, * This also ensures that we won't disable the currently * optimal setting during a second round of PME balancing. */ - set_dd_dlb_max_cutoff(cr, fr->nbv->listParams->rlistOuter); + set_dd_dlb_max_cutoff(cr, fr->nbv->pairlistSets().params().rlistOuter); } } @@ -1014,7 +1011,7 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb, step); /* Update deprecated rlist in forcerec to stay in sync with fr->nbv */ - fr->rlist = fr->nbv->listParams->rlistOuter; + fr->rlist = fr->nbv->pairlistSets().params().rlistOuter; if (ir.eDispCorr != edispcNO) { diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 6f3f2e3150..f8f7c388ae 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -416,8 +416,7 @@ static void do_nb_verlet(t_forcerec *fr, /* When dynamic pair-list pruning is requested, we need to prune * at nstlistPrune steps. */ - if (nbv->listParams->useDynamicPruning && - nbnxnIsDynamicPairlistPruningStep(*nbv, ilocality, step)) + if (nbv->pairlistSets().isDynamicPairlistPruningStep(step)) { /* Prune the pair-list beyond fr->ic->rlistPrune using * the current coordinates of the atoms. 
@@ -430,7 +429,7 @@ static void do_nb_verlet(t_forcerec *fr, wallcycle_sub_start(wcycle, ewcsNONBONDED); } - NbnxnDispatchKernel(nbv, ilocality, *ic, flags, clearF, fr, enerd, nrnb); + nbv->dispatchNonbondedKernel(ilocality, *ic, flags, clearF, fr, enerd, nrnb); if (!nbv->useGpu()) { @@ -761,8 +760,8 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv wallcycle_start(wcycle, ewcWAIT_GPU_NB_L); wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); - nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local, - nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle); + nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, + as_rvec_array(force->unpaddedArrayRef().data()), wcycle); } } } @@ -790,9 +789,11 @@ static inline void launchGpuRollingPruning(const t_commrec *cr, * With domain decomposition we alternate local and non-local * pruning at even and odd steps. */ - int numRollingParts = nbv->listParams->numRollingParts; - GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency"); - int stepWithCurrentList = nbnxnNumStepsWithPairlist(*nbv, Nbnxm::InteractionLocality::Local, step); + int numRollingParts = nbv->pairlistSets().params().numRollingParts; + GMX_ASSERT(numRollingParts == nbv->pairlistSets().params().nstlistPrune/2, + "Since we alternate local/non-local at even/odd steps, " + "we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency"); + int stepWithCurrentList = nbv->pairlistSets().numStepsWithPairlist(step); bool stepIsEven = ((stepWithCurrentList & 1) == 0); if (stepWithCurrentList > 0 && stepWithCurrentList < inputrec->nstlist - 1 && @@ -1274,22 +1275,18 @@ static void do_force_cutsVERLET(FILE *fplog, step, nrnb, wcycle); } - const Nbnxm::InteractionLocality iloc = - (!bUseOrEmulGPU ? 
Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal); - /* Add all the non-bonded force to the normal force array. * This can be split into a local and a non-local part when overlapping * communication with calculation with domain decomposition. */ wallcycle_stop(wcycle, ewcFORCE); - nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::All, nbv->nbat, f, wcycle); + nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, f, wcycle); wallcycle_start_nocount(wcycle, ewcFORCE); - /* if there are multiple fshift output buffers reduce them */ - if ((flags & GMX_FORCE_VIRIAL) && - nbv->pairlistSet(iloc).nnbl > 1) + /* If there are multiple fshift output buffers we need to reduce them */ + if (flags & GMX_FORCE_VIRIAL) { /* This is not in a subcounter because it takes a negligible and constant-sized amount of time */ @@ -1343,12 +1340,8 @@ static void do_force_cutsVERLET(FILE *fplog, wallcycle_stop(wcycle, ewcFORCE); } - /* skip the reduction if there was no non-local work to do */ - if (!nbv->pairlistSet(Nbnxm::InteractionLocality::NonLocal).nblGpu[0]->sci.empty()) - { - nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal, - nbv->nbat, f, wcycle); - } + nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal, + f, wcycle); } } @@ -1438,7 +1431,7 @@ static void do_force_cutsVERLET(FILE *fplog, Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags); /* Is dynamic pair-list pruning activated? */ - if (nbv->listParams->useDynamicPruning) + if (nbv->pairlistSets().params().useDynamicPruning) { launchGpuRollingPruning(cr, nbv, inputrec, step); } @@ -1466,8 +1459,8 @@ static void do_force_cutsVERLET(FILE *fplog, * on the non-alternating path. 
*/ if (bUseOrEmulGPU && !alternateGpuWait) { - nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local, - nbv->nbat, f, wcycle); + nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, + f, wcycle); } if (DOMAINDECOMP(cr)) { diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index 09dd5ace38..1d3113172f 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp @@ -412,7 +412,7 @@ void gmx::Integrator::do_md() if (bPMETune) { pme_loadbal_init(&pme_loadbal, cr, mdlog, *ir, state->box, - *fr->ic, *fr->nbv->listParams, fr->pmedata, use_GPU(fr->nbv), + *fr->ic, fr->nbv->pairlistSets().params(), fr->pmedata, use_GPU(fr->nbv), &bPMETunePrinting); } diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp index 0fad12d1f8..1de1af8f11 100644 --- a/src/gromacs/nbnxm/atomdata.cpp +++ b/src/gromacs/nbnxm/atomdata.cpp @@ -54,6 +54,7 @@ #include "gromacs/mdtypes/mdatom.h" #include "gromacs/nbnxm/nbnxm.h" #include "gromacs/nbnxm/nbnxm_geometry.h" +#include "gromacs/nbnxm/pairlist.h" #include "gromacs/pbcutil/ishift.h" #include "gromacs/simd/simd.h" #include "gromacs/timing/wallcycle.h" @@ -1171,7 +1172,7 @@ nbnxn_atomdata_reduce_reals_simd(real gmx_unused * gmx_restrict dest, static void nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search *nbs, const nbnxn_atomdata_t *nbat, - gmx::ArrayRef out, + gmx::ArrayRef out, int nfa, int a0, int a1, rvec *f) @@ -1465,12 +1466,18 @@ static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbnxn_atomdata_t *nbat, } /* Add the force array(s) from nbnxn_atomdata_t to f */ -void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs, - const Nbnxm::AtomLocality locality, - nbnxn_atomdata_t *nbat, - rvec *f, - gmx_wallcycle *wcycle) +void +nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality locality, + rvec *f, + gmx_wallcycle *wcycle) { + /* Skip the non-local reduction if there was no non-local work to do */ + if (locality == Nbnxm::AtomLocality::NonLocal && + 
pairlistSets().pairlistSet(Nbnxm::InteractionLocality::NonLocal).nblGpu[0]->sci.empty()) + { + return; + } + wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS); @@ -1521,7 +1528,7 @@ void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs, { try { - nbnxn_atomdata_add_nbat_f_to_f_part(nbs, nbat, + nbnxn_atomdata_add_nbat_f_to_f_part(nbs.get(), nbat, nbat->out, 1, a0+((th+0)*na)/nth, @@ -1543,6 +1550,15 @@ void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat, { gmx::ArrayRef outputBuffers = nbat->out; + if (outputBuffers.size() == 1) + { + /* When there is a single output object, with CPU or GPU, shift forces + * have been written directly to the main buffer instead of to the + * (single) thread local output object. There is nothing to reduce. + */ + return; + } + for (int s = 0; s < SHIFTS; s++) { rvec sum; diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h index fbd4dd3c00..7deeb2c181 100644 --- a/src/gromacs/nbnxm/atomdata.h +++ b/src/gromacs/nbnxm/atomdata.h @@ -51,6 +51,7 @@ class MDLogger; struct nbnxn_atomdata_t; struct nbnxn_search; +struct nonbonded_verlet_t; struct t_mdatoms; struct gmx_wallcycle; @@ -107,13 +108,6 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search *nbs, nbnxn_atomdata_t *nbat, gmx_wallcycle *wcycle); -/* Add the forces stored in nbat to f, zeros the forces in nbat */ -void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs, - Nbnxm::AtomLocality locality, - nbnxn_atomdata_t *nbat, - rvec *f, - gmx_wallcycle *wcycle); - /* Add the fshift force stored in nbat to fshift */ void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat, rvec *fshift); diff --git a/src/gromacs/nbnxm/grid.cpp b/src/gromacs/nbnxm/grid.cpp index 4b98d53a3c..19ed3d1021 100644 --- a/src/gromacs/nbnxm/grid.cpp +++ b/src/gromacs/nbnxm/grid.cpp @@ -1428,8 +1428,8 @@ void nbnxn_put_on_grid(nonbonded_verlet_t *nbv, grid->bSimple = nbv->pairlistIsSimple(); - grid->na_c = 
IClusterSizePerListType[nbv->listParams->pairlistType]; - grid->na_cj = JClusterSizePerListType[nbv->listParams->pairlistType]; + grid->na_c = IClusterSizePerListType[nbv->pairlistSets().params().pairlistType]; + grid->na_cj = JClusterSizePerListType[nbv->pairlistSets().params().pairlistType]; grid->na_sc = (grid->bSimple ? 1 : c_gpuNumClusterPerCell)*grid->na_c; grid->na_c_2log = get_2log(grid->na_c); diff --git a/src/gromacs/nbnxm/kerneldispatch.cpp b/src/gromacs/nbnxm/kerneldispatch.cpp index fcac007c02..b4b79b0b11 100644 --- a/src/gromacs/nbnxm/kerneldispatch.cpp +++ b/src/gromacs/nbnxm/kerneldispatch.cpp @@ -422,15 +422,14 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t &pairlistSet, } static void accountFlops(t_nrnb *nrnb, + const nbnxn_pairlist_set_t &pairlistSet, const nonbonded_verlet_t &nbv, - const Nbnxm::InteractionLocality iLocality, const interaction_const_t &ic, const int forceFlags) { - const nbnxn_pairlist_set_t &pairlistSet = nbv.pairlistSet(iLocality); - const bool usingGpuKernels = nbv.useGpu(); + const bool usingGpuKernels = nbv.useGpu(); - int enr_nbnxn_kernel_ljc; + int enr_nbnxn_kernel_ljc; if (EEL_RF(ic.eeltype) || ic.eeltype == eelCUT) { enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF; @@ -481,25 +480,25 @@ static void accountFlops(t_nrnb *nrnb, } } -void NbnxnDispatchKernel(nonbonded_verlet_t *nbv, - Nbnxm::InteractionLocality iLocality, - const interaction_const_t &ic, - int forceFlags, - int clearF, - t_forcerec *fr, - gmx_enerdata_t *enerd, - t_nrnb *nrnb) +void +nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality, + const interaction_const_t &ic, + int forceFlags, + int clearF, + t_forcerec *fr, + gmx_enerdata_t *enerd, + t_nrnb *nrnb) { - const nbnxn_pairlist_set_t &pairlistSet = nbv->pairlistSet(iLocality); + const nbnxn_pairlist_set_t &pairlistSet = pairlistSets().pairlistSet(iLocality); - switch (nbv->kernelSetup().kernelType) + switch (kernelSetup().kernelType) { case Nbnxm::KernelType::Cpu4x4_PlainC: case 
Nbnxm::KernelType::Cpu4xN_Simd_4xN: case Nbnxm::KernelType::Cpu4xN_Simd_2xNN: nbnxn_kernel_cpu(pairlistSet, - nbv->kernelSetup(), - nbv->nbat, + kernelSetup(), + nbat, ic, fr->shift_vec, forceFlags, @@ -512,16 +511,16 @@ void NbnxnDispatchKernel(nonbonded_verlet_t *nbv, break; case Nbnxm::KernelType::Gpu8x8x8: - Nbnxm::gpu_launch_kernel(nbv->gpu_nbv, forceFlags, iLocality); + Nbnxm::gpu_launch_kernel(gpu_nbv, forceFlags, iLocality); break; case Nbnxm::KernelType::Cpu8x8x8_PlainC: nbnxn_kernel_gpu_ref(pairlistSet.nblGpu[0], - nbv->nbat, &ic, + nbat, &ic, fr->shift_vec, forceFlags, clearF, - nbv->nbat->out[0].f, + nbat->out[0].f, fr->fshift[0], enerd->grpp.ener[egCOULSR], fr->bBHAM ? @@ -534,7 +533,7 @@ void NbnxnDispatchKernel(nonbonded_verlet_t *nbv, } - accountFlops(nrnb, *nbv, iLocality, ic, forceFlags); + accountFlops(nrnb, pairlistSet, *this, ic, forceFlags); } void @@ -549,7 +548,7 @@ nonbonded_verlet_t::dispatchFreeEnergyKernel(Nbnxm::InteractionLocality iLocali const int forceFlags, t_nrnb *nrnb) { - const gmx::ArrayRef nbl_fep = pairlistSet(iLocality).nbl_fep; + const gmx::ArrayRef nbl_fep = pairlistSets().pairlistSet(iLocality).nbl_fep; /* When the first list is empty, all are empty and there is nothing to do */ if (nbl_fep[0]->nrj == 0) diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index 1b020d5916..c15c0270c6 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -101,7 +101,6 @@ #include #include "gromacs/math/vectypes.h" -#include "gromacs/nbnxm/pairlist.h" #include "gromacs/utility/arrayref.h" #include "gromacs/utility/enumerationhelpers.h" #include "gromacs/utility/real.h" @@ -116,10 +115,12 @@ struct gmx_domdec_zones_t; struct gmx_enerdata_t; struct gmx_hw_info_t; struct gmx_mtop_t; +struct gmx_wallcycle; struct interaction_const_t; struct nbnxn_pairlist_set_t; struct nbnxn_search; struct nonbonded_verlet_t; +enum class PairlistType; struct t_blocka; struct t_commrec; struct t_lambda; @@ -134,6 +135,31 @@ 
class MDLogger; class UpdateGroupsCog; } +namespace Nbnxm +{ +enum class KernelType; +} + +/*! \libinternal + * \brief The setup for generating and pruning the nbnxn pair list. + * + * Without dynamic pruning rlistOuter=rlistInner. + */ +struct NbnxnListParameters +{ + /*! \brief Constructor producing a struct with dynamic pruning disabled + */ + NbnxnListParameters(Nbnxm::KernelType kernelType, + real rlist); + + PairlistType pairlistType; //!< The type of cluster-pair list + bool useDynamicPruning; //!< Are we using dynamic pair-list pruning + int nstlistPrune; //!< Pair-list dynamic pruning interval + real rlistOuter; //!< Cut-off of the larger, outer pair-list + real rlistInner; //!< Cut-off of the smaller, inner pair-list + int numRollingParts; //!< The number of parts to divide the pair-list into for rolling pruning, a value of 1 gives no rolling pruning +}; + /*! \brief Resources that can be used to execute non-bonded kernels on */ enum class NonbondedResource : int { @@ -216,6 +242,96 @@ void NbnxnDispatchPruneKernel(nbnxn_pairlist_set_t *pairlistSet, struct nonbonded_verlet_t { public: + class PairlistSets + { + public: + PairlistSets(const NbnxnListParameters &listParams, + bool haveMultipleDomains, + int minimumIlistCountForGpuBalancing); + + //! Construct the pairlist set for the given locality + void construct(Nbnxm::InteractionLocality iLocality, + nbnxn_search *nbs, + nbnxn_atomdata_t *nbat, + const t_blocka *excl, + Nbnxm::KernelType kernelbType, + int64_t step, + t_nrnb *nrnb); + + //! Dispatches the dynamic pruning kernel for the given locality + void dispatchPruneKernel(Nbnxm::InteractionLocality iLocality, + const nbnxn_atomdata_t *nbat, + const rvec *shift_vec, + Nbnxm::KernelType kernelbType); + + //! Returns the pair list parameters + const NbnxnListParameters &params() const + { + return params_; + } + + //! 
Returns the number of steps performed with the current pair list + int numStepsWithPairlist(int64_t step) const + { + return step - outerListCreationStep_; + } + + //! Returns whether step is a dynamic list pruning step, for CPU lists only + bool isDynamicPairlistPruningStep(int64_t step) const + { + return (params_.useDynamicPruning && + numStepsWithPairlist(step) % params_.nstlistPrune == 0); + } + + //! Changes the pair-list outer and inner radius + void changeRadii(real rlistOuter, + real rlistInner) + { + params_.rlistOuter = rlistOuter; + params_.rlistInner = rlistInner; + } + + //! Returns the pair-list set for the given locality + const nbnxn_pairlist_set_t &pairlistSet(Nbnxm::InteractionLocality iLocality) const + { + if (iLocality == Nbnxm::InteractionLocality::Local) + { + return *localSet_; + } + else + { + GMX_ASSERT(nonlocalSet_, "Need a non-local set when requesting access"); + return *nonlocalSet_; + } + } + + private: + //! Returns the pair-list set for the given locality + nbnxn_pairlist_set_t &pairlistSet(Nbnxm::InteractionLocality iLocality) + { + if (iLocality == Nbnxm::InteractionLocality::Local) + { + return *localSet_; + } + else + { + GMX_ASSERT(nonlocalSet_, "Need a non-local set when requesting access"); + return *nonlocalSet_; + } + } + + //! Parameters for the search and list pruning setup + NbnxnListParameters params_; + //! Pair list balancing parameter for use with GPU + int minimumIlistCountForGpuBalancing_; + //! Local pairlist set + std::unique_ptr localSet_; + //! Non-local pairlist set + std::unique_ptr nonlocalSet_; + //! MD step at which the outer lists in pairlistSets_ were created + int64_t outerListCreationStep_; + }; + //! Returns whether a GPU is use for the non-bonded calculations bool useGpu() const { @@ -237,34 +353,32 @@ struct nonbonded_verlet_t //! Initialize the pair list sets, TODO this should be private void initPairlistSets(bool haveMultipleDomains); - //! 
Returns a reference to the pairlist set for the requested locality - const nbnxn_pairlist_set_t &pairlistSet(Nbnxm::InteractionLocality iLocality) const - { - GMX_ASSERT(static_cast(iLocality) < pairlistSets_.size(), - "The requested locality should be in the list"); - return pairlistSets_[static_cast(iLocality)]; - } - //! Constructs the pairlist for the given locality void constructPairlist(Nbnxm::InteractionLocality iLocality, const t_blocka *excl, int64_t step, - t_nrnb *nrnb) + t_nrnb *nrnb); + + //! Returns a reference to the pairlist sets + const PairlistSets &pairlistSets() const { - nbnxn_make_pairlist(this, iLocality, &pairlistSets_[static_cast(iLocality)], excl, step, nrnb); + return *pairlistSets_; } //! Dispatches the dynamic pruning kernel for the given locality void dispatchPruneKernel(Nbnxm::InteractionLocality iLocality, - const rvec *shift_vec) - { - GMX_ASSERT(static_cast(iLocality) < pairlistSets_.size(), - "The requested locality should be in the list"); - NbnxnDispatchPruneKernel(&pairlistSets_[static_cast(iLocality)], - kernelSetup_.kernelType, nbat, shift_vec); - } - - //! Dispatches the non-bonded free-energy kernel, always runs on the CPU + const rvec *shift_vec); + + //! \brief Executes the non-bonded kernel on the CPU or launches it on the GPU + void dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality, + const interaction_const_t &ic, + int forceFlags, + int clearF, + t_forcerec *fr, + gmx_enerdata_t *enerd, + t_nrnb *nrnb); + + //! Executes the non-bonded free-energy kernel, always runs on the CPU void dispatchFreeEnergyKernel(Nbnxm::InteractionLocality iLocality, t_forcerec *fr, rvec x[], @@ -276,6 +390,11 @@ struct nonbonded_verlet_t int forceFlags, t_nrnb *nrnb); + //! Add the forces stored in nbat to f, zeros the forces in nbat + void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality locality, + rvec *f, + gmx_wallcycle *wcycle); + //! 
Return the kernel setup const Nbnxm::KernelSetup &kernelSetup() const { @@ -288,14 +407,12 @@ struct nonbonded_verlet_t kernelSetup_ = kernelSetup; } - //! Parameters for the search and list pruning setup - std::unique_ptr listParams; + // TODO: Make all data members private + public: + //! All data related to the pair lists + std::unique_ptr pairlistSets_; //! Working data for constructing the pairlists std::unique_ptr nbs; - private: - //! Local and, optionally, non-local pairlist sets - std::vector pairlistSets_; - public: //! Atom data nbnxn_atomdata_t *nbat; @@ -305,7 +422,6 @@ struct nonbonded_verlet_t public: gmx_nbnxn_gpu_t *gpu_nbv; /**< pointer to GPU nb verlet data */ - int min_ci_balanced; /**< pair list balancing parameter used for the 8x8x8 GPU kernels */ }; namespace Nbnxm @@ -371,24 +487,4 @@ void nbnxn_set_atomorder(nbnxn_search *nbs); /*! \brief Returns the index position of the atoms on the pairlist search grid */ gmx::ArrayRef nbnxn_get_gridindices(const nbnxn_search* nbs); -/*! \brief Returns the number of steps performed with the current pair list */ -int nbnxnNumStepsWithPairlist(const nonbonded_verlet_t &nbv, - Nbnxm::InteractionLocality ilocality, - int64_t step); - -/*! \brief Returns whether step is a dynamic list pruning step */ -bool nbnxnIsDynamicPairlistPruningStep(const nonbonded_verlet_t &nbv, - Nbnxm::InteractionLocality ilocality, - int64_t step); - -/*! 
\brief Executes the non-bonded kernel of the GPU or launches it on the GPU */ -void NbnxnDispatchKernel(nonbonded_verlet_t *nbv, - Nbnxm::InteractionLocality iLocality, - const interaction_const_t &ic, - int forceFlags, - int clearF, - t_forcerec *fr, - gmx_enerdata_t *enerd, - t_nrnb *nrnb); - #endif // GMX_NBNXN_NBNXN_H diff --git a/src/gromacs/nbnxm/nbnxm_setup.cpp b/src/gromacs/nbnxm/nbnxm_setup.cpp index eef7c9a3ed..7b14954447 100644 --- a/src/gromacs/nbnxm/nbnxm_setup.cpp +++ b/src/gromacs/nbnxm/nbnxm_setup.cpp @@ -292,18 +292,57 @@ pick_nbnxn_kernel(const gmx::MDLogger &mdlog, } // namespace Nbnxm -void nonbonded_verlet_t::initPairlistSets(const bool haveMultipleDomains) +nonbonded_verlet_t::PairlistSets::PairlistSets(const NbnxnListParameters &listParams, + const bool haveMultipleDomains, + const int minimumIlistCountForGpuBalancing) : + params_(listParams), + minimumIlistCountForGpuBalancing_(minimumIlistCountForGpuBalancing) { - pairlistSets_.emplace_back(*listParams); + localSet_ = std::make_unique(params_); + if (haveMultipleDomains) { - pairlistSets_.emplace_back(*listParams); + nonlocalSet_ = std::make_unique(params_); } } namespace Nbnxm { +/*! \brief Gets and returns the minimum i-list count for balancing based on the GPU used or env.var. when set */ +static int getMinimumIlistCountForGpuBalancing(gmx_nbnxn_gpu_t *nbnxmGpu) +{ + int minimumIlistCount; + + if (const char *env = getenv("GMX_NB_MIN_CI")) + { + char *end; + + minimumIlistCount = strtol(env, &end, 10); + if (!end || (*end != 0) || minimumIlistCount < 0) + { + gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, non-negative integer required", env); + } + + if (debug) + { + fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. 
var.)\n", + minimumIlistCount); + } + } + else + { + minimumIlistCount = gpu_min_ci_balanced(nbnxmGpu); + if (debug) + { + fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n", + minimumIlistCount); + } + } + + return minimumIlistCount; +} + void init_nb_verlet(const gmx::MDLogger &mdlog, nonbonded_verlet_t **nb_verlet, gmx_bool bFEP_NonBonded, @@ -342,22 +381,12 @@ void init_nb_verlet(const gmx::MDLogger &mdlog, nonbondedResource, ir, fr->bNonbonded)); - const bool haveMultipleDomains = (DOMAINDECOMP(cr) && cr->dd->nnodes > 1); - - nbv->listParams = std::make_unique(nbv->kernelSetup().kernelType, - ir->rlist); - nbv->initPairlistSets(haveMultipleDomains); + const bool haveMultipleDomains = (DOMAINDECOMP(cr) && cr->dd->nnodes > 1); - nbv->min_ci_balanced = 0; + NbnxnListParameters listParams(nbv->kernelSetup().kernelType, ir->rlist); setupDynamicPairlistPruning(mdlog, ir, mtop, box, fr->ic, - nbv->listParams.get()); - - nbv->nbs = std::make_unique(ir->ePBC, - DOMAINDECOMP(cr) ? &cr->dd->nc : nullptr, - DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : nullptr, - bFEP_NonBonded, - gmx_omp_nthreads_get(emntPairsearch)); + &listParams); int enbnxninitcombrule; if (fr->ic->vdwtype == evdwCUT && @@ -404,6 +433,7 @@ void init_nb_verlet(const gmx::MDLogger &mdlog, mimimumNumEnergyGroupNonbonded, nbv->pairlistIsSimple() ? 
gmx_omp_nthreads_get(emntNonbonded) : 1); + int minimumIlistCountForGpuBalancing = 0; if (useGpu) { /* init the NxN GPU data; the last argument tells whether we'll have @@ -411,39 +441,25 @@ void init_nb_verlet(const gmx::MDLogger &mdlog, gpu_init(&nbv->gpu_nbv, deviceInfo, fr->ic, - nbv->listParams.get(), + &listParams, nbv->nbat, cr->nodeid, haveMultipleDomains); - if (const char *env = getenv("GMX_NB_MIN_CI")) - { - char *end; - - nbv->min_ci_balanced = strtol(env, &end, 10); - if (!end || (*end != 0) || nbv->min_ci_balanced < 0) - { - gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, non-negative integer required", env); - } - - if (debug) - { - fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n", - nbv->min_ci_balanced); - } - } - else - { - nbv->min_ci_balanced = gpu_min_ci_balanced(nbv->gpu_nbv); - if (debug) - { - fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n", - nbv->min_ci_balanced); - } - } - + minimumIlistCountForGpuBalancing = getMinimumIlistCountForGpuBalancing(nbv->gpu_nbv); } + nbv->pairlistSets_ = + std::make_unique(listParams, + haveMultipleDomains, + minimumIlistCountForGpuBalancing); + + nbv->nbs = std::make_unique(ir->ePBC, + DOMAINDECOMP(cr) ? &cr->dd->nc : nullptr, + DOMAINDECOMP(cr) ? 
domdec_zones(cr->dd) : nullptr, + bFEP_NonBonded, + gmx_omp_nthreads_get(emntPairsearch)); + *nb_verlet = nbv; } diff --git a/src/gromacs/nbnxm/pairlist.cpp b/src/gromacs/nbnxm/pairlist.cpp index 8365163e24..d2327f94bd 100644 --- a/src/gromacs/nbnxm/pairlist.cpp +++ b/src/gromacs/nbnxm/pairlist.cpp @@ -4026,16 +4026,18 @@ static void sort_sci(NbnxnPairlistGpu *nbl) std::swap(nbl->sci, work.sci_sort); } -void nbnxn_make_pairlist(nonbonded_verlet_t *nbv, - const InteractionLocality iLocality, - nbnxn_pairlist_set_t *nbl_list, - const t_blocka *excl, - const int64_t step, - t_nrnb *nrnb) +void +nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality iLocality, + nbnxn_search *nbs, + nbnxn_atomdata_t *nbat, + const t_blocka *excl, + const Nbnxm::KernelType kernelType, + const int64_t step, + t_nrnb *nrnb) { - nbnxn_search *nbs = nbv->nbs.get(); - nbnxn_atomdata_t *nbat = nbv->nbat; - const real rlist = nbv->listParams->rlistOuter; + nbnxn_pairlist_set_t *nbl_list = &pairlistSet(iLocality); + + const real rlist = nbl_list->params.rlistOuter; int nsubpair_target; float nsubpair_tot_est; @@ -4071,9 +4073,9 @@ void nbnxn_make_pairlist(nonbonded_verlet_t *nbv, nzi = nbs->zones->nizone; } - if (!nbl_list->bSimple && nbv->min_ci_balanced > 0) + if (!nbl_list->bSimple && minimumIlistCountForGpuBalancing_ > 0) { - get_nsubpair_target(nbs, iLocality, rlist, nbv->min_ci_balanced, + get_nsubpair_target(nbs, iLocality, rlist, minimumIlistCountForGpuBalancing_, &nsubpair_target, &nsubpair_tot_est); } else @@ -4164,7 +4166,7 @@ void nbnxn_make_pairlist(nonbonded_verlet_t *nbv, nbnxn_make_pairlist_part(nbs, iGrid, jGrid, &nbs->work[th], nbat, *excl, rlist, - nbv->kernelSetup().kernelType, + kernelType, ci_block, nbat->bUseBufferFlags, nsubpair_target, @@ -4178,7 +4180,7 @@ void nbnxn_make_pairlist(nonbonded_verlet_t *nbv, nbnxn_make_pairlist_part(nbs, iGrid, jGrid, &nbs->work[th], nbat, *excl, rlist, - nbv->kernelSetup().kernelType, + kernelType, ci_block, 
nbat->bUseBufferFlags, nsubpair_target, @@ -4291,7 +4293,15 @@ void nbnxn_make_pairlist(nonbonded_verlet_t *nbv, GMX_ASSERT(nbl_list->nbl[0]->ciOuter.empty(), "ciOuter is invalid so it should be empty"); } - nbl_list->outerListCreationStep = step; + if (iLocality == Nbnxm::InteractionLocality::Local) + { + outerListCreationStep_ = step; + } + else + { + GMX_RELEASE_ASSERT(outerListCreationStep_ == step, + "Outer list should be created at the same step as the inner list"); + } /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */ if (iLocality == InteractionLocality::Local) @@ -4346,19 +4356,30 @@ void nbnxn_make_pairlist(nonbonded_verlet_t *nbv, } } - if (nbv->listParams->useDynamicPruning && !nbv->useGpu()) + if (params_.useDynamicPruning && nbl_list->bSimple) { nbnxnPrepareListForDynamicPruning(nbl_list); } +} + +void +nonbonded_verlet_t::constructPairlist(const Nbnxm::InteractionLocality iLocality, + const t_blocka *excl, + int64_t step, + t_nrnb *nrnb) +{ + pairlistSets_->construct(iLocality, nbs.get(), nbat, excl, + kernelSetup_.kernelType, + step, nrnb); - if (nbv->useGpu()) + if (useGpu()) { /* Launch the transfer of the pairlist to the GPU. 
* * NOTE: The launch overhead is currently not timed separately */ - Nbnxm::gpu_init_pairlist(nbv->gpu_nbv, - nbl_list->nblGpu[0], + Nbnxm::gpu_init_pairlist(gpu_nbv, + pairlistSets().pairlistSet(iLocality).nblGpu[0], iLocality); } } diff --git a/src/gromacs/nbnxm/pairlist.h b/src/gromacs/nbnxm/pairlist.h index d54708150e..3f3bde3ad1 100644 --- a/src/gromacs/nbnxm/pairlist.h +++ b/src/gromacs/nbnxm/pairlist.h @@ -53,6 +53,7 @@ // to include it during OpenCL jitting without including config.h #include "gromacs/nbnxm/constants.h" +struct NbnxnListParameters; struct NbnxnPairlistCpuWork; struct NbnxnPairlistGpuWork; struct tMPI_Atomic; @@ -82,29 +83,6 @@ enum class PairlistType : int static constexpr gmx::EnumerationArray IClusterSizePerListType = { 4, 4, 4, 8 }; static constexpr gmx::EnumerationArray JClusterSizePerListType = { 2, 4, 8, 8 }; -/*! \cond INTERNAL */ - -/*! \brief The setup for generating and pruning the nbnxn pair list. - * - * Without dynamic pruning rlistOuter=rlistInner. - */ -struct NbnxnListParameters -{ - /*! \brief Constructor producing a struct with dynamic pruning disabled - */ - NbnxnListParameters(Nbnxm::KernelType kernelType, - real rlist); - - PairlistType pairlistType; //!< The type of cluster-pair list - bool useDynamicPruning; //!< Are we using dynamic pair-list pruning - int nstlistPrune; //!< Pair-list dynamic pruning interval - real rlistOuter; //!< Cut-off of the larger, outer pair-list - real rlistInner; //!< Cut-off of the smaller, inner pair-list - int numRollingParts; //!< The number parts to divide the pair-list into for rolling pruning, a value of 1 gives no rolling pruning -}; - -/*! \endcond */ - /* With CPU kernels the i-cluster size is always 4 atoms. 
*/ static constexpr int c_nbnxnCpuIClusterSize = 4; @@ -307,7 +285,6 @@ struct nbnxn_pairlist_set_t int natpair_lj; /* Total number of atom pairs for LJ kernel */ int natpair_q; /* Total number of atom pairs for Q kernel */ std::vector nbl_fep; /* List of free-energy atom pair interactions */ - int64_t outerListCreationStep; /* Step at which the outer list was created */ }; enum { diff --git a/src/gromacs/nbnxm/pairlistset.cpp b/src/gromacs/nbnxm/pairlistset.cpp index 18b0b12a5c..11473517df 100644 --- a/src/gromacs/nbnxm/pairlistset.cpp +++ b/src/gromacs/nbnxm/pairlistset.cpp @@ -90,18 +90,4 @@ nbnxn_pairlist_set_t::nbnxn_pairlist_set_t(const NbnxnListParameters &listParams nbnxn_init_pairlist_set(this); } -int nbnxnNumStepsWithPairlist(const nonbonded_verlet_t &nbv, - const Nbnxm::InteractionLocality iLocality, - const int64_t step) -{ - return step - nbv.pairlistSet(iLocality).outerListCreationStep; -} - -bool nbnxnIsDynamicPairlistPruningStep(const nonbonded_verlet_t &nbv, - const Nbnxm::InteractionLocality iLocality, - const int64_t step) -{ - return nbnxnNumStepsWithPairlist(nbv, iLocality, step) % nbv.listParams->nstlistPrune == 0; -} - /*! 
\endcond */ diff --git a/src/gromacs/nbnxm/pairlistset.h b/src/gromacs/nbnxm/pairlistset.h index 82bcdef6e1..d550537f5b 100644 --- a/src/gromacs/nbnxm/pairlistset.h +++ b/src/gromacs/nbnxm/pairlistset.h @@ -37,13 +37,13 @@ #define GMX_NBNXM_PAIRLISTSET_H #include "gromacs/math/vectypes.h" +#include "gromacs/nbnxm/nbnxm.h" +#include "gromacs/nbnxm/pairlist.h" #include "gromacs/utility/basedefinitions.h" #include "gromacs/utility/real.h" #include "locality.h" -struct nbnxn_pairlist_set_t; - /* Initializes a set of pair lists stored in nbnxn_pairlist_set_t * * TODO: Merge into the constructor diff --git a/src/gromacs/nbnxm/prunekerneldispatch.cpp b/src/gromacs/nbnxm/prunekerneldispatch.cpp index 8caddc494e..6ae794fbff 100644 --- a/src/gromacs/nbnxm/prunekerneldispatch.cpp +++ b/src/gromacs/nbnxm/prunekerneldispatch.cpp @@ -45,12 +45,15 @@ #include "kernels_simd_4xm/kernel_prune.h" -void NbnxnDispatchPruneKernel(nbnxn_pairlist_set_t *nbl_lists, - const Nbnxm::KernelType kernelType, - const nbnxn_atomdata_t *nbat, - const rvec *shift_vec) +void +nonbonded_verlet_t::PairlistSets::dispatchPruneKernel(const Nbnxm::InteractionLocality iLocality, + const nbnxn_atomdata_t *nbat, + const rvec *shift_vec, + const Nbnxm::KernelType kernelType) { - const real rlistInner = nbl_lists->params.rlistInner; + nbnxn_pairlist_set_t *nbl_lists = &pairlistSet(iLocality); + + const real rlistInner = nbl_lists->params.rlistInner; GMX_ASSERT(nbl_lists->nbl[0]->ciOuter.size() >= nbl_lists->nbl[0]->ci.size(), "Here we should either have an empty ci list or ciOuter should be >= ci"); @@ -77,3 +80,10 @@ void NbnxnDispatchPruneKernel(nbnxn_pairlist_set_t *nbl_lists, } } } + +void +nonbonded_verlet_t::dispatchPruneKernel(const Nbnxm::InteractionLocality iLocality, + const rvec *shift_vec) +{ + pairlistSets_->dispatchPruneKernel(iLocality, nbat, shift_vec, kernelSetup_.kernelType); +} -- 2.22.0