From: Berk Hess
Date: Tue, 15 Jan 2019 08:35:04 +0000 (+0100)
Subject: Use enum class for nbnxm locality
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=3377895771dee21f7ef2056708bd5af9cc02db26;p=alexxy%2Fgromacs.git

Use enum class for nbnxm locality

Converted the interaction and atom locality enums to enum class. This
exposed and fixed a few mixings of the two enums. Moved some files into
the Nbnxm namespace. Added local/non-local organization to the GPU
timers.

Note that the use of the Nbnxm namespace outside the nbnxm module is
only temporary. This should all be replaced by methods.

Change-Id: I3c891b3b2b3e14d8175b63f4191f365a5cd64b18
---
diff --git a/src/gromacs/domdec/partition.cpp b/src/gromacs/domdec/partition.cpp index be726af141..83ffe74855 100644 --- a/src/gromacs/domdec/partition.cpp +++ b/src/gromacs/domdec/partition.cpp @@ -3422,7 +3422,7 @@ void dd_partition_system(FILE *fplog, fr->cginfo, state_local->x, ncg_moved, bRedist ? comm->movedBuffer.data() : nullptr, - fr->nbv->grp[eintLocal].kernel_type, + fr->nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type, fr->nbv->nbat); nbnxn_get_ncells(fr->nbv->nbs.get(), &ncells_new[XX], &ncells_new[YY]); diff --git a/src/gromacs/ewald/pme_load_balancing.cpp b/src/gromacs/ewald/pme_load_balancing.cpp index 579fc3c7f9..778c7c2c48 100644 --- a/src/gromacs/ewald/pme_load_balancing.cpp +++ b/src/gromacs/ewald/pme_load_balancing.cpp @@ -825,7 +825,7 @@ pme_load_balance(pme_load_balancing_t *pme_lb, /* We always re-initialize the tables whether they are used or not */ init_interaction_const_tables(nullptr, ic, rtab); - nbnxn_gpu_pme_loadbal_update_param(nbv, ic, listParams); + Nbnxm::gpu_pme_loadbal_update_param(nbv, ic, listParams); if (!pme_lb->bSepPMERanks) { diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp index 26a6a50a5c..1dda3a2c67 100644 --- a/src/gromacs/mdlib/forcerec.cpp +++ b/src/gromacs/mdlib/forcerec.cpp @@ -2635,15 +2635,15 @@ void init_forcerec(FILE *fp, GMX_RELEASE_ASSERT(ir->rcoulomb == ir->rvdw, "With Verlet lists and no PME rcoulomb and rvdw should be identical"); } - init_nb_verlet(mdlog, &fr->nbv, bFEP_NonBonded, ir, fr, - cr, hardwareInfo, deviceInfo, - mtop, box); + Nbnxm::init_nb_verlet(mdlog, &fr->nbv, bFEP_NonBonded, ir, fr, + cr, hardwareInfo, deviceInfo, + mtop, box); if (useGpuForBonded) { auto stream = DOMAINDECOMP(cr) ? - nbnxn_gpu_get_command_stream(fr->nbv->gpu_nbv, eintNonlocal) : - nbnxn_gpu_get_command_stream(fr->nbv->gpu_nbv, eintLocal); + Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal) : + Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::Local); // TODO the heap allocation is only needed while // t_forcerec lacks a constructor.
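// [Editor's note] The Nbnxm::InteractionLocality and Nbnxm::AtomLocality values
// used from here on come from the new nbnxm/locality.h header, which this
// excerpt does not show. A minimal sketch of what it presumably declares,
// inferred from the call sites in this patch and from the Count sentinel that
// gmx::EnumerationArray requires (the comments are assumptions; the enumerator
// names are taken from the patch):
namespace Nbnxm
{

//! Locality of the interactions computed by a pair list or kernel
enum class InteractionLocality : int
{
    Local,    //!< interactions within the local domain only
    NonLocal, //!< interactions between the local and non-local domains
    Count     //!< sentinel required by gmx::EnumerationArray
};

//! Locality of atoms, used to select x/q and f transfer ranges
enum class AtomLocality : int
{
    Local,    //!< atoms owned by the local domain
    NonLocal, //!< halo atoms received from other domains
    All,      //!< all atoms, local and non-local
    Count     //!< sentinel required by gmx::EnumerationArray
};

} // namespace Nbnxm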
fr->gpuBonded = new gmx::GpuBonded(mtop->ffparams, @@ -2685,7 +2685,7 @@ void free_gpu_resources(t_forcerec *fr, if (isPPrankUsingGPU) { /* free nbnxn data in GPU memory */ - nbnxn_gpu_free(fr->nbv->gpu_nbv); + Nbnxm::gpu_free(fr->nbv->gpu_nbv); delete fr->gpuBonded; fr->gpuBonded = nullptr; } diff --git a/src/gromacs/mdlib/resethandler.cpp b/src/gromacs/mdlib/resethandler.cpp index 5269dfde48..5697378346 100644 --- a/src/gromacs/mdlib/resethandler.cpp +++ b/src/gromacs/mdlib/resethandler.cpp @@ -175,7 +175,7 @@ bool ResetHandler::resetCountersImpl( if (use_GPU(nbv)) { - nbnxn_gpu_reset_timings(nbv); + Nbnxm::gpu_reset_timings(nbv); } if (pme_gpu_task_enabled(pme)) diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 2fd4038c55..6dedbb795b 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -387,14 +387,15 @@ static void post_process_forces(const t_commrec *cr, } } -static void do_nb_verlet(t_forcerec *fr, - const interaction_const_t *ic, - gmx_enerdata_t *enerd, - int flags, int ilocality, - int clearF, - int64_t step, - t_nrnb *nrnb, - gmx_wallcycle_t wcycle) +static void do_nb_verlet(t_forcerec *fr, + const interaction_const_t *ic, + gmx_enerdata_t *enerd, + const int flags, + const Nbnxm::InteractionLocality ilocality, + const int clearF, + const int64_t step, + t_nrnb *nrnb, + gmx_wallcycle_t wcycle) { if (!(flags & GMX_FORCE_NONBONDED)) { @@ -859,11 +860,12 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv { GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check; wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L); - isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv, - flags, eatLocal, - haveOtherWork, - enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR], - fshift, completionType); + isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv, + flags, + Nbnxm::AtomLocality::Local, + haveOtherWork, + enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR], + fshift, completionType); wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); // To get the call count right, when the task finished we // issue a start/stop. @@ -874,7 +876,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv wallcycle_start(wcycle, ewcWAIT_GPU_NB_L); wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); - nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal, + nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local, nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle); } } @@ -905,15 +907,15 @@ static inline void launchGpuRollingPruning(const t_commrec *cr, */ int numRollingParts = nbv->listParams->numRollingParts; GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency"); - int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep; + int stepWithCurrentList = step - nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.outerListCreationStep; bool stepIsEven = ((stepWithCurrentList & 1) == 0); if (stepWithCurrentList > 0 && stepWithCurrentList < inputrec->nstlist - 1 && (stepIsEven || DOMAINDECOMP(cr))) { - nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv, - stepIsEven ? eintLocal : eintNonlocal, - numRollingParts); + Nbnxm::gpu_launch_kernel_pruneonly(nbv->gpu_nbv, + stepIsEven ? 
Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal, + numRollingParts); } } @@ -1084,7 +1086,7 @@ static void do_force_cutsVERLET(FILE *fplog, nullptr, 0, mdatoms->homenr, -1, fr->cginfo, x.unpaddedArrayRef(), 0, nullptr, - nbv->grp[eintLocal].kernel_type, + nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type, nbv->nbat); wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL); } @@ -1093,7 +1095,7 @@ static void do_force_cutsVERLET(FILE *fplog, wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL); nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd), fr->cginfo, x.unpaddedArrayRef(), - nbv->grp[eintNonlocal].kernel_type, + nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type, nbv->nbat); wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL); } @@ -1111,10 +1113,10 @@ static void do_force_cutsVERLET(FILE *fplog, if (bNS) { - nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat); + Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat); } - nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat); + Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); @@ -1130,9 +1132,9 @@ static void do_force_cutsVERLET(FILE *fplog, // higher-level object than the nb module. fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()), top->idef, - nbnxn_gpu_get_xq(nbv->gpu_nbv), - nbnxn_gpu_get_f(nbv->gpu_nbv), - nbnxn_gpu_get_fshift(nbv->gpu_nbv)); + Nbnxm::gpu_get_xq(nbv->gpu_nbv), + Nbnxm::gpu_get_f(nbv->gpu_nbv), + Nbnxm::gpu_get_fshift(nbv->gpu_nbv)); ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions(); } @@ -1142,35 +1144,38 @@ static void do_force_cutsVERLET(FILE *fplog, /* do local pair search */ if (bNS) { + nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists; + wallcycle_start_nocount(wcycle, ewcNS); wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL); nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat, &top->excls, nbv->listParams->rlistOuter, nbv->min_ci_balanced, - &nbv->grp[eintLocal].nbl_lists, - eintLocal, - nbv->grp[eintLocal].kernel_type, + &pairlistSet, + Nbnxm::InteractionLocality::Local, + nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type, nrnb); - nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step; + pairlistSet.outerListCreationStep = step; if (nbv->listParams->useDynamicPruning && !bUseGPU) { - nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists); + nbnxnPrepareListForDynamicPruning(&pairlistSet); } wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL); if (bUseGPU) { /* initialize local pair-list on the GPU */ - nbnxn_gpu_init_pairlist(nbv->gpu_nbv, - nbv->grp[eintLocal].nbl_lists.nblGpu[0], - eintLocal); + Nbnxm::gpu_init_pairlist(nbv->gpu_nbv, + pairlistSet.nblGpu[0], + Nbnxm::InteractionLocality::Local); } wallcycle_stop(wcycle, ewcNS); } else { - nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()), + nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::Local, + FALSE, as_rvec_array(x.unpaddedArrayRef().data()), nbv->nbat, wcycle); } @@ -1184,7 +1189,7 @@ static void do_force_cutsVERLET(FILE *fplog, wallcycle_start(wcycle, ewcLAUNCH_GPU); wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED); - nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatLocal, ppForceWorkload->haveGpuBondedWork); + Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork); wallcycle_sub_stop(wcycle, 
ewcsLAUNCH_GPU_NONBONDED); // bonded work not split into separate local and non-local, so with DD @@ -1198,7 +1203,7 @@ static void do_force_cutsVERLET(FILE *fplog, /* launch local nonbonded work on GPU */ wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED); - do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo, + do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo, step, nrnb, wcycle); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); wallcycle_stop(wcycle, ewcLAUNCH_GPU); @@ -1217,6 +1222,8 @@ static void do_force_cutsVERLET(FILE *fplog, do non-local pair search */ if (DOMAINDECOMP(cr)) { + nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists; + if (bNS) { wallcycle_start_nocount(wcycle, ewcNS); @@ -1226,23 +1233,23 @@ static void do_force_cutsVERLET(FILE *fplog, &top->excls, nbv->listParams->rlistOuter, nbv->min_ci_balanced, - &nbv->grp[eintNonlocal].nbl_lists, - eintNonlocal, - nbv->grp[eintNonlocal].kernel_type, + &pairlistSet, + Nbnxm::InteractionLocality::NonLocal, + nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type, nrnb); - nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step; + pairlistSet.outerListCreationStep = step; if (nbv->listParams->useDynamicPruning && !bUseGPU) { - nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists); + nbnxnPrepareListForDynamicPruning(&pairlistSet); } wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL); - if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU) + if (nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type == nbnxnk8x8x8_GPU) { /* initialize non-local pair-list on the GPU */ - nbnxn_gpu_init_pairlist(nbv->gpu_nbv, - nbv->grp[eintNonlocal].nbl_lists.nblGpu[0], - eintNonlocal); + Nbnxm::gpu_init_pairlist(nbv->gpu_nbv, + pairlistSet.nblGpu[0], + Nbnxm::InteractionLocality::NonLocal); } wallcycle_stop(wcycle, ewcNS); } @@ -1250,7 +1257,8 @@ static void do_force_cutsVERLET(FILE *fplog, { dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle); - nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()), + nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal, + FALSE, as_rvec_array(x.unpaddedArrayRef().data()), nbv->nbat, wcycle); } @@ -1260,7 +1268,7 @@ static void do_force_cutsVERLET(FILE *fplog, /* launch non-local nonbonded tasks on GPU */ wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED); - nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatNonlocal, ppForceWorkload->haveGpuBondedWork); + Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); if (ppForceWorkload->haveGpuBondedWork) @@ -1271,7 +1279,7 @@ static void do_force_cutsVERLET(FILE *fplog, } wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED); - do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo, + do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); @@ -1286,11 +1294,11 @@ static void do_force_cutsVERLET(FILE *fplog, wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED); if (DOMAINDECOMP(cr)) { - nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat, - flags, eatNonlocal, ppForceWorkload->haveGpuBondedWork); + Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat, + flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork); } - 
nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat, - flags, eatLocal, ppForceWorkload->haveGpuBondedWork); + Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat, + flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY)) @@ -1397,7 +1405,7 @@ static void do_force_cutsVERLET(FILE *fplog, if (!bUseOrEmulGPU) { - do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes, + do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes, step, nrnb, wcycle); } @@ -1406,18 +1414,18 @@ static void do_force_cutsVERLET(FILE *fplog, /* Calculate the local and non-local free energy interactions here. * Happens here on the CPU both with and without GPU. */ - if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0) + if (fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.nbl_fep[0]->nrj > 0) { - do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists, + do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists, fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms, inputrec->fepvals, lambda, enerd, flags, nrnb, wcycle); } if (DOMAINDECOMP(cr) && - fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0) + fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nbl_fep[0]->nrj > 0) { - do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists, + do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists, fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms, inputrec->fepvals, lambda, enerd, flags, nrnb, wcycle); @@ -1426,22 +1434,14 @@ static void do_force_cutsVERLET(FILE *fplog, if (!bUseOrEmulGPU) { - int aloc; - if (DOMAINDECOMP(cr)) { - do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo, + do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle); } - if (!bUseOrEmulGPU) - { - aloc = eintLocal; - } - else - { - aloc = eintNonlocal; - } + const Nbnxm::InteractionLocality iloc = + (!bUseOrEmulGPU ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal); /* Add all the non-bonded force to the normal force array. 
* This can be split into a local and a non-local part when overlapping @@ -1449,13 +1449,13 @@ static void do_force_cutsVERLET(FILE *fplog, */ wallcycle_stop(wcycle, ewcFORCE); - nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle); + nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::All, nbv->nbat, f, wcycle); wallcycle_start_nocount(wcycle, ewcFORCE); /* if there are multiple fshift output buffers reduce them */ if ((flags & GMX_FORCE_VIRIAL) && - nbv->grp[aloc].nbl_lists.nnbl > 1) + nbv->grp[iloc].nbl_lists.nnbl > 1) { /* This is not in a subcounter because it takes a negligible and constant-sized amount of time */ @@ -1493,25 +1493,25 @@ static void do_force_cutsVERLET(FILE *fplog, if (bUseGPU) { wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL); - nbnxn_gpu_wait_finish_task(nbv->gpu_nbv, - flags, eatNonlocal, - ppForceWorkload->haveGpuBondedWork, - enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR], - fr->fshift); + Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv, + flags, Nbnxm::AtomLocality::NonLocal, + ppForceWorkload->haveGpuBondedWork, + enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR], + fr->fshift); cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL); } else { wallcycle_start_nocount(wcycle, ewcFORCE); - do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes, + do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes, step, nrnb, wcycle); wallcycle_stop(wcycle, ewcFORCE); } /* skip the reduction if there was no non-local work to do */ - if (!nbv->grp[eintNonlocal].nbl_lists.nblGpu[0]->sci.empty()) + if (!nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nblGpu[0]->sci.empty()) { - nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal, + nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal, nbv->nbat, f, wcycle); } } @@ -1558,10 +1558,10 @@ static void do_force_cutsVERLET(FILE *fplog, const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */ wallcycle_start(wcycle, ewcWAIT_GPU_NB_L); - nbnxn_gpu_wait_finish_task(nbv->gpu_nbv, - flags, eatLocal, ppForceWorkload->haveGpuBondedWork, - enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR], - fr->fshift); + Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv, + flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork, + enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR], + fr->fshift); float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes) @@ -1586,7 +1586,7 @@ static void do_force_cutsVERLET(FILE *fplog, // NOTE: emulation kernel is not included in the balancing region, // but emulation mode does not target performance anyway wallcycle_start_nocount(wcycle, ewcFORCE); - do_nb_verlet(fr, ic, enerd, flags, eintLocal, + do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes, step, nrnb, wcycle); wallcycle_stop(wcycle, ewcFORCE); @@ -1602,7 +1602,7 @@ static void do_force_cutsVERLET(FILE *fplog, /* now clear the GPU outputs while we finish the step on the CPU */ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED); - nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags); + Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags); /* Is dynamic pair-list pruning activated? */ if (nbv->listParams->useDynamicPruning) @@ -1633,7 +1633,7 @@ static void do_force_cutsVERLET(FILE *fplog, * on the non-alternating path. 
*/ if (bUseOrEmulGPU && !alternateGpuWait) { - nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal, + nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local, nbv->nbat, f, wcycle); } if (DOMAINDECOMP(cr)) @@ -2744,7 +2744,7 @@ void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr, if (printReport) { - auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr; + auto nbnxn_gpu_timings = use_GPU(nbv) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr; gmx_wallclock_gpu_pme_t pme_gpu_timings = {}; if (pme_gpu_task_enabled(pme)) { diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp index db4ad05dc6..62ba0c4014 100644 --- a/src/gromacs/nbnxm/atomdata.cpp +++ b/src/gromacs/nbnxm/atomdata.cpp @@ -1004,12 +1004,12 @@ void nbnxn_atomdata_copy_shiftvec(gmx_bool bDynamicBox, } /* Copies (and reorders) the coordinates to nbnxn_atomdata_t */ -void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search *nbs, - int locality, - gmx_bool FillLocal, - rvec *x, - nbnxn_atomdata_t *nbat, - gmx_wallcycle *wcycle) +void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search *nbs, + const Nbnxm::AtomLocality locality, + gmx_bool FillLocal, + rvec *x, + nbnxn_atomdata_t *nbat, + gmx_wallcycle *wcycle) { wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS); @@ -1019,15 +1019,16 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search *nbs, switch (locality) { - case eatAll: + case Nbnxm::AtomLocality::All: + case Nbnxm::AtomLocality::Count: g0 = 0; g1 = nbs->grid.size(); break; - case eatLocal: + case Nbnxm::AtomLocality::Local: g0 = 0; g1 = 1; break; - case eatNonlocal: + case Nbnxm::AtomLocality::NonLocal: g0 = 1; g1 = nbs->grid.size(); break; @@ -1467,11 +1468,11 @@ static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbnxn_atomdata_t *nbat, } /* Add the force array(s) from nbnxn_atomdata_t to f */ -void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs, - int locality, - nbnxn_atomdata_t *nbat, - rvec *f, - gmx_wallcycle *wcycle) +void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs, + const Nbnxm::AtomLocality locality, + nbnxn_atomdata_t *nbat, + rvec *f, + gmx_wallcycle *wcycle) { wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS); @@ -1482,15 +1483,16 @@ void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs, switch (locality) { - case eatAll: + case Nbnxm::AtomLocality::All: + case Nbnxm::AtomLocality::Count: a0 = 0; na = nbs->natoms_nonlocal; break; - case eatLocal: + case Nbnxm::AtomLocality::Local: a0 = 0; na = nbs->natoms_local; break; - case eatNonlocal: + case Nbnxm::AtomLocality::NonLocal: a0 = nbs->natoms_local; na = nbs->natoms_nonlocal - nbs->natoms_local; break; @@ -1500,7 +1502,7 @@ void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs, if (nbat->out.size() > 1) { - if (locality != eatAll) + if (locality != Nbnxm::AtomLocality::All) { gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll"); } diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h index ebfeee5bfb..bd210ea014 100644 --- a/src/gromacs/nbnxm/atomdata.h +++ b/src/gromacs/nbnxm/atomdata.h @@ -42,6 +42,8 @@ #include "gromacs/utility/basedefinitions.h" #include "gromacs/utility/real.h" +#include "locality.h" + namespace gmx { class MDLogger; @@ -94,7 +96,7 @@ void nbnxn_atomdata_copy_shiftvec(gmx_bool dynamic_box, * FillLocal tells if the local filler particle coordinates should be zeroed. 
*/ void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search *nbs, - int locality, + Nbnxm::AtomLocality locality, gmx_bool FillLocal, rvec *x, nbnxn_atomdata_t *nbat, @@ -102,7 +104,7 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search *nbs, /* Add the forces stored in nbat to f, zeros the forces in nbat */ void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs, - int locality, + Nbnxm::AtomLocality locality, nbnxn_atomdata_t *nbat, rvec *f, gmx_wallcycle *wcycle); diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 15eb634f28..7ce9b9fa5e 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -107,6 +107,9 @@ #endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */ +namespace Nbnxm +{ + /*! Nonbonded kernel function pointer type */ typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t, const cu_nbparam_t, @@ -268,19 +271,24 @@ static inline int calc_shmem_required_nonbonded(const int num_threads_z, const g } /*! \brief Launch asynchronously the xq buffer host to device copy. */ -void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb, - const nbnxn_atomdata_t *nbatom, - int iloc, - bool haveOtherWork) +void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb, + const nbnxn_atomdata_t *nbatom, + const AtomLocality atomLocality, + const bool haveOtherWork) { - int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ + GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal, + "Only local and non-local xq transfers are supported"); - cu_atomdata_t *adat = nb->atdat; - cu_plist_t *plist = nb->plist[iloc]; - cu_timers_t *t = nb->timers; - cudaStream_t stream = nb->stream[iloc]; + const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); - bool bDoTime = nb->bDoTime; + int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ + + cu_atomdata_t *adat = nb->atdat; + cu_plist_t *plist = nb->plist[iloc]; + cu_timers_t *t = nb->timers; + cudaStream_t stream = nb->stream[iloc]; + + bool bDoTime = nb->bDoTime; /* Don't launch the non-local H2D copy if there is no dependent work to do: neither non-local nor other (e.g. bonded) work @@ -291,7 +299,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb, we always call the local local x+q copy (and the rest of the local work in nbnxn_gpu_launch_kernel(). 
*/ - if (!haveOtherWork && canSkipWork(nb, iloc)) + if (!haveOtherWork && canSkipWork(*nb, iloc)) { plist->haveFreshList = false; @@ -299,7 +307,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb, } /* calculate the atom data index range based on locality */ - if (LOCAL_I(iloc)) + if (atomLocality == AtomLocality::Local) { adat_begin = 0; adat_len = adat->natoms_local; @@ -313,7 +321,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb, /* beginning of timed HtoD section */ if (bDoTime) { - t->nb_h2d[iloc].openTimingRegion(stream); + t->xf[atomLocality].nb_h2d.openTimingRegion(stream); } /* HtoD x, q */ @@ -323,7 +331,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb, if (bDoTime) { - t->nb_h2d[iloc].closeTimingRegion(stream); + t->xf[atomLocality].nb_h2d.closeTimingRegion(stream); } /* When we get here all misc operations issued in the local stream as well as @@ -334,7 +342,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb, */ if (nb->bUseTwoStreams) { - if (iloc == eintLocal) + if (iloc == InteractionLocality::Local) { cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream); CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed"); @@ -364,9 +372,9 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb, the local x+q H2D (and all preceding) tasks are complete and synchronize with this event in the non-local stream before launching the non-bonded kernel. */ -void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, - int flags, - int iloc) +void gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, + const int flags, + const InteractionLocality iloc) { /* CUDA kernel launch-related stuff */ int nblock; @@ -392,7 +400,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, clearing. All these operations, except for the local interaction kernel, are needed for the non-local interactions. The skip of the local kernel call is taken care of later in this function. */ - if (canSkipWork(nb, iloc)) + if (canSkipWork(*nb, iloc)) { plist->haveFreshList = false; @@ -405,7 +413,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, (TODO: ATM that's the way the timing accounting can distinguish between separate prune kernel and combined force+prune, maybe we need a better way?). */ - nbnxn_gpu_launch_kernel_pruneonly(nb, iloc, 1); + gpu_launch_kernel_pruneonly(nb, iloc, 1); } if (plist->nsci == 0) @@ -417,14 +425,14 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, /* beginning of timed nonbonded calculation section */ if (bDoTime) { - t->nb_k[iloc].openTimingRegion(stream); + t->interaction[iloc].nb_k.openTimingRegion(stream); } /* get the pointer to the kernel flavor we need to use */ nb_kernel = select_nbnxn_kernel(nbp->eeltype, nbp->vdwtype, bCalcEner, - (plist->haveFreshList && !nb->timers->didPrune[iloc]), + (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune), nb->dev_info); /* Kernel launch config: @@ -458,13 +466,13 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, config.sharedMemorySize); } - auto *timingEvent = bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr; + auto *timingEvent = bDoTime ? 
t->interaction[iloc].nb_k.fetchNextEvent() : nullptr; const auto kernelArgs = prepareGpuKernelArguments(nb_kernel, config, adat, nbp, plist, &bCalcFshift); launchGpuKernel(nb_kernel, config, timingEvent, "k_calc_nb", kernelArgs); if (bDoTime) { - t->nb_k[iloc].closeTimingRegion(stream); + t->interaction[iloc].nb_k.closeTimingRegion(stream); } if (GMX_NATIVE_WINDOWS) @@ -487,9 +495,9 @@ static inline int calc_shmem_required_prune(const int num_threads_z) return shmem; } -void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, - int iloc, - int numParts) +void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, + const InteractionLocality iloc, + const int numParts) { cu_atomdata_t *adat = nb->atdat; cu_nbparam_t *nbp = nb->nbparam; @@ -544,7 +552,7 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, GpuRegionTimer *timer = nullptr; if (bDoTime) { - timer = &(plist->haveFreshList ? t->prune_k[iloc] : t->rollingPrune_k[iloc]); + timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k); } /* beginning of timed prune calculation section */ @@ -589,14 +597,14 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, (combined or separate 1st pass prune, rolling prune). */ if (plist->haveFreshList) { - plist->haveFreshList = false; + plist->haveFreshList = false; /* Mark that pruning has been done */ - nb->timers->didPrune[iloc] = true; + nb->timers->interaction[iloc].didPrune = true; } else { /* Mark that rolling pruning has been done */ - nb->timers->didRollingPrune[iloc] = true; + nb->timers->interaction[iloc].didRollingPrune = true; } if (bDoTime) @@ -611,18 +619,19 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, } } -void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, - nbnxn_atomdata_t *nbatom, - int flags, - int aloc, - bool haveOtherWork) +void gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, + nbnxn_atomdata_t *nbatom, + const int flags, + const AtomLocality atomLocality, + const bool haveOtherWork) { cudaError_t stat; int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ /* determine interaction locality from atom locality */ - int iloc = gpuAtomToInteractionLocality(aloc); + const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); + /* extract the data */ cu_atomdata_t *adat = nb->atdat; cu_timers_t *t = nb->timers; bool bDoTime = nb->bDoTime; @@ -632,22 +641,22 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, bool bCalcFshift = flags & GMX_FORCE_VIRIAL; /* don't launch non-local copy-back if there was no non-local work to do */ - if (!haveOtherWork && canSkipWork(nb, iloc)) + if (!haveOtherWork && canSkipWork(*nb, iloc)) { return; } - getGpuAtomRange(adat, aloc, &adat_begin, &adat_len); + getGpuAtomRange(adat, atomLocality, &adat_begin, &adat_len); /* beginning of timed D2H section */ if (bDoTime) { - t->nb_d2h[iloc].openTimingRegion(stream); + t->xf[atomLocality].nb_d2h.openTimingRegion(stream); } /* With DD the local D2H transfer can only start after the non-local kernel has finished. */ - if (iloc == eintLocal && nb->bUseTwoStreams) + if (iloc == InteractionLocality::Local && nb->bUseTwoStreams) { stat = cudaStreamWaitEvent(stream, nb->nonlocal_done, 0); CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed"); @@ -661,14 +670,14 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, recorded which signals that the local D2H can proceed. This event is not placed after the non-local kernel because we want the non-local data back first. 
*/ - if (iloc == eintNonlocal) + if (iloc == InteractionLocality::NonLocal) { stat = cudaEventRecord(nb->nonlocal_done, stream); CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed"); } /* only transfer energies in the local stream */ - if (LOCAL_I(iloc)) + if (iloc == InteractionLocality::Local) { /* DtoH fshift */ if (bCalcFshift) @@ -689,11 +698,11 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, if (bDoTime) { - t->nb_d2h[iloc].closeTimingRegion(stream); + t->xf[atomLocality].nb_d2h.closeTimingRegion(stream); } } -void nbnxn_cuda_set_cacheconfig() +void cuda_set_cacheconfig() { cudaError_t stat; @@ -710,3 +719,5 @@ void nbnxn_cuda_set_cacheconfig() } } } + +} // namespace Nbnxm diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda.h index f732da87d7..42296f62b2 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.h @@ -36,10 +36,15 @@ * \brief * Declares nbnxn cuda cache and texture helper functions */ -#ifndef GMX_MDLIB_NBNXN_CUDA_NBNXN_CUDA_H -#define GMX_MDLIB_NBNXN_CUDA_NBNXN_CUDA_H +#ifndef GMX_NBNXN_CUDA_NBNXN_CUDA_H +#define GMX_NBNXN_CUDA_NBNXN_CUDA_H + +namespace Nbnxm +{ //! Set up the cache configuration for the non-bonded kernels. -void nbnxn_cuda_set_cacheconfig(); +void cuda_set_cacheconfig(); + +} // namespace Nbnxm #endif diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index 537098d479..f75b1c080f 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -44,6 +44,11 @@ #include #include +// TODO We would like to move this down, but the way gmx_nbnxn_gpu_t +// is currently declared means this has to be before gpu_types.h +#include "nbnxm_cuda_types.h" + +// TODO Remove this comment when the above order issue is resolved #include "gromacs/gpu_utils/cudautils.cuh" #include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/gpu_utils/pmalloc_cuda.h" @@ -63,7 +68,9 @@ #include "gromacs/utility/smalloc.h" #include "nbnxm_cuda.h" -#include "nbnxm_cuda_types.h" + +namespace Nbnxm +{ /* This is a heuristically determined parameter for the Kepler * and Maxwell architectures for the minimum size of ci lists by multiplying @@ -330,11 +337,11 @@ static void init_nbparam(cu_nbparam_t *nbp, /*! Re-generate the GPU Ewald force table, resets rlist, and update the * electrostatic type switching to twin cut-off (or back) if needed. */ -void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv, - const interaction_const_t *ic, - const NbnxnListParameters *listParams) +void gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv, + const interaction_const_t *ic, + const NbnxnListParameters *listParams) { - if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU) + if (!nbv || nbv->grp[InteractionLocality::Local].kernel_type != nbnxnk8x8x8_GPU) { return; } @@ -371,18 +378,6 @@ static void init_plist(cu_plist_t *pl) pl->haveFreshList = false; } -/*! Initializes the timer data structure. */ -static void init_timers(cu_timers_t *t, bool bUseTwoStreams) -{ - /* The non-local counters/stream (second in the array) are needed only with DD. */ - for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++) - { - t->didPairlistH2D[i] = false; - t->didPrune[i] = false; - t->didRollingPrune[i] = false; - } -} - /*! Initializes the timings data structure. */ static void init_timings(gmx_wallclock_gpu_nbnxn_t *t) { @@ -408,10 +403,10 @@ static void init_timings(gmx_wallclock_gpu_nbnxn_t *t) } /*! 
Initializes simulation constant data. */ -static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t *nb, - const interaction_const_t *ic, - const NbnxnListParameters *listParams, - const nbnxn_atomdata_t::Params &nbatParams) +static void cuda_init_const(gmx_nbnxn_cuda_t *nb, + const interaction_const_t *ic, + const NbnxnListParameters *listParams, + const nbnxn_atomdata_t::Params &nbatParams) { init_atomdata_first(nb->atdat, nbatParams.numTypes); init_nbparam(nb->nbparam, ic, listParams, nbatParams); @@ -420,13 +415,13 @@ static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t *nb, nbnxn_cuda_clear_e_fshift(nb); } -void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, - const gmx_device_info_t *deviceInfo, - const interaction_const_t *ic, - const NbnxnListParameters *listParams, - const nbnxn_atomdata_t *nbat, - int /*rank*/, - gmx_bool bLocalAndNonlocal) +void gpu_init(gmx_nbnxn_cuda_t **p_nb, + const gmx_device_info_t *deviceInfo, + const interaction_const_t *ic, + const NbnxnListParameters *listParams, + const nbnxn_atomdata_t *nbat, + int /*rank*/, + gmx_bool bLocalAndNonlocal) { cudaError_t stat; gmx_nbnxn_cuda_t *nb; @@ -439,10 +434,10 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, snew(nb, 1); snew(nb->atdat, 1); snew(nb->nbparam, 1); - snew(nb->plist[eintLocal], 1); + snew(nb->plist[InteractionLocality::Local], 1); if (bLocalAndNonlocal) { - snew(nb->plist[eintNonlocal], 1); + snew(nb->plist[InteractionLocality::NonLocal], 1); } nb->bUseTwoStreams = bLocalAndNonlocal; @@ -455,17 +450,17 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el)); pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift)); - init_plist(nb->plist[eintLocal]); + init_plist(nb->plist[InteractionLocality::Local]); /* set device info, just point it to the right GPU among the detected ones */ nb->dev_info = deviceInfo; /* local/non-local GPU streams */ - stat = cudaStreamCreate(&nb->stream[eintLocal]); - CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed"); + stat = cudaStreamCreate(&nb->stream[InteractionLocality::Local]); + CU_RET_ERR(stat, "cudaStreamCreate on stream[InteractionLocality::Local] failed"); if (nb->bUseTwoStreams) { - init_plist(nb->plist[eintNonlocal]); + init_plist(nb->plist[InteractionLocality::NonLocal]); /* Note that the device we're running on does not have to support * priorities, because we are querying the priority range which in this @@ -475,10 +470,10 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, stat = cudaDeviceGetStreamPriorityRange(nullptr, &highest_priority); CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed"); - stat = cudaStreamCreateWithPriority(&nb->stream[eintNonlocal], + stat = cudaStreamCreateWithPriority(&nb->stream[InteractionLocality::NonLocal], cudaStreamDefault, highest_priority); - CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[eintNonlocal] failed"); + CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[InteractionLocality::NonLocal] failed"); } /* init events for synchronization (timing disabled for performance reasons!)
*/ @@ -495,15 +490,14 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, if (nb->bDoTime) { - init_timers(nb->timers, nb->bUseTwoStreams); init_timings(nb->timings); } /* set the kernel type for the current GPU */ /* pick L1 cache configuration */ - nbnxn_cuda_set_cacheconfig(); + cuda_set_cacheconfig(); - nbnxn_cuda_init_const(nb, ic, listParams, nbat->params()); + cuda_init_const(nb, ic, listParams, nbat->params()); *p_nb = nb; @@ -513,9 +507,9 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, } } -void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb, - const NbnxnPairlistGpu *h_plist, - int iloc) +void gpu_init_pairlist(gmx_nbnxn_cuda_t *nb, + const NbnxnPairlistGpu *h_plist, + const InteractionLocality iloc) { char sbuf[STRLEN]; bool bDoTime = (nb->bDoTime && !h_plist->sci.empty()); @@ -536,10 +530,12 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb, } } + gpu_timers_t::Interaction &iTimers = nb->timers->interaction[iloc]; + if (bDoTime) { - nb->timers->pl_h2d[iloc].openTimingRegion(stream); - nb->timers->didPairlistH2D[iloc] = true; + iTimers.pl_h2d.openTimingRegion(stream); + iTimers.didPairlistH2D = true; } Context context = nullptr; @@ -548,13 +544,13 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb, &d_plist->nsci, &d_plist->sci_nalloc, context); copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream, GpuApiCallBehavior::Async, - bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr); + bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, context); copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream, GpuApiCallBehavior::Async, - bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr); + bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size()*c_nbnxnGpuClusterpairSplit, &d_plist->nimask, &d_plist->imask_nalloc, context); @@ -563,22 +559,22 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb, &d_plist->nexcl, &d_plist->excl_nalloc, context); copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream, GpuApiCallBehavior::Async, - bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr); + bDoTime ? 
iTimers.pl_h2d.fetchNextEvent() : nullptr); if (bDoTime) { - nb->timers->pl_h2d[iloc].closeTimingRegion(stream); + iTimers.pl_h2d.closeTimingRegion(stream); } /* the next use of this list will be the first one, so we need to prune */ d_plist->haveFreshList = true; } -void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_cuda_t *nb, - const nbnxn_atomdata_t *nbatom) +void gpu_upload_shiftvec(gmx_nbnxn_cuda_t *nb, + const nbnxn_atomdata_t *nbatom) { cu_atomdata_t *adat = nb->atdat; - cudaStream_t ls = nb->stream[eintLocal]; + cudaStream_t ls = nb->stream[InteractionLocality::Local]; /* only if we have a dynamic box */ if (nbatom->bDynamicBox || !adat->bShiftVecUploaded) @@ -594,7 +590,7 @@ static void nbnxn_cuda_clear_f(gmx_nbnxn_cuda_t *nb, int natoms_clear) { cudaError_t stat; cu_atomdata_t *adat = nb->atdat; - cudaStream_t ls = nb->stream[eintLocal]; + cudaStream_t ls = nb->stream[InteractionLocality::Local]; stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls); CU_RET_ERR(stat, "cudaMemsetAsync on f failed"); @@ -605,7 +601,7 @@ static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb) { cudaError_t stat; cu_atomdata_t *adat = nb->atdat; - cudaStream_t ls = nb->stream[eintLocal]; + cudaStream_t ls = nb->stream[InteractionLocality::Local]; stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls); CU_RET_ERR(stat, "cudaMemsetAsync on fshift failed"); @@ -615,7 +611,7 @@ static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb) CU_RET_ERR(stat, "cudaMemsetAsync on e_el failed"); } -void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags) +void gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags) { nbnxn_cuda_clear_f(nb, nb->atdat->natoms); /* clear shift force array and energies if the outputs were @@ -626,8 +622,8 @@ void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags) } } -void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb, - const nbnxn_atomdata_t *nbat) +void gpu_init_atomdata(gmx_nbnxn_cuda_t *nb, + const nbnxn_atomdata_t *nbat) { cudaError_t stat; int nalloc, natoms; @@ -635,7 +631,7 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb, bool bDoTime = nb->bDoTime; cu_timers_t *timers = nb->timers; cu_atomdata_t *d_atdat = nb->atdat; - cudaStream_t ls = nb->stream[eintLocal]; + cudaStream_t ls = nb->stream[InteractionLocality::Local]; natoms = nbat->numAtoms(); realloced = false; @@ -714,7 +710,7 @@ static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam) } } -void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) +void gpu_free(gmx_nbnxn_cuda_t *nb) { cudaError_t stat; cu_atomdata_t *atdat; @@ -773,7 +769,7 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) freeDeviceBuffer(&atdat->lj_comb); /* Free plist */ - auto *plist = nb->plist[eintLocal]; + auto *plist = nb->plist[InteractionLocality::Local]; freeDeviceBuffer(&plist->sci); freeDeviceBuffer(&plist->cj4); freeDeviceBuffer(&plist->imask); @@ -781,7 +777,7 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) sfree(plist); if (nb->bUseTwoStreams) { - auto *plist_nl = nb->plist[eintNonlocal]; + auto *plist_nl = nb->plist[InteractionLocality::NonLocal]; freeDeviceBuffer(&plist_nl->sci); freeDeviceBuffer(&plist_nl->cj4); freeDeviceBuffer(&plist_nl->imask); @@ -811,12 +807,12 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) } //! This function is documented in the header file -gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb) +gmx_wallclock_gpu_nbnxn_t *gpu_get_timings(gmx_nbnxn_cuda_t *nb) { return (nb != nullptr && nb->bDoTime) ?
nb->timings : nullptr; } -void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv) +void gpu_reset_timings(nonbonded_verlet_t* nbv) { if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime) { @@ -824,44 +820,46 @@ } -int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb) +int gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb) { return nb != nullptr ? gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0; } -gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb) +gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb) { return ((nb->nbparam->eeltype == eelCuEWALD_ANA) || (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN)); } -void *nbnxn_gpu_get_command_stream(gmx_nbnxn_gpu_t *nb, - int iloc) +void *gpu_get_command_stream(gmx_nbnxn_gpu_t *nb, + const InteractionLocality iloc) { assert(nb); return static_cast<void *>(&nb->stream[iloc]); } -void *nbnxn_gpu_get_xq(gmx_nbnxn_gpu_t *nb) +void *gpu_get_xq(gmx_nbnxn_gpu_t *nb) { assert(nb); return static_cast<void *>(nb->atdat->xq); } -void *nbnxn_gpu_get_f(gmx_nbnxn_gpu_t *nb) +void *gpu_get_f(gmx_nbnxn_gpu_t *nb) { assert(nb); return static_cast<void *>(nb->atdat->f); } -rvec *nbnxn_gpu_get_fshift(gmx_nbnxn_gpu_t *nb) +rvec *gpu_get_fshift(gmx_nbnxn_gpu_t *nb) { assert(nb); return reinterpret_cast<rvec *>(nb->atdat->fshift); } + +} // namespace Nbnxm diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h index 3d490c395a..5a8380f5b5 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h @@ -52,8 +52,10 @@ #include "gromacs/gpu_utils/gputraits.cuh" #include "gromacs/mdtypes/interaction_const.h" #include "gromacs/nbnxm/gpu_types_common.h" +#include "gromacs/nbnxm/nbnxm.h" #include "gromacs/nbnxm/pairlist.h" #include "gromacs/timing/gpu_timing.h" +#include "gromacs/utility/enumerationhelpers.h" /*! \brief Macro defining default for the prune kernel's j4 processing concurrency. * @@ -197,26 +199,26 @@ struct cu_nbparam /** \internal * \brief Pair list data. */ -using cu_plist_t = gpu_plist; +using cu_plist_t = Nbnxm::gpu_plist; /** \internal * \brief Typedef of actual timer type. */ -typedef struct nbnxn_gpu_timers_t cu_timers_t; +typedef struct Nbnxm::gpu_timers_t cu_timers_t; /** \internal * \brief Main data structure for CUDA nonbonded force calculations. */ struct gmx_nbnxn_cuda_t { - const gmx_device_info_t *dev_info; /**< CUDA device information */ - bool bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU */ - cu_atomdata_t *atdat; /**< atom data */ - cu_nbparam_t *nbparam; /**< parameters required for the non-bonded calc. */ - cu_plist_t *plist[2]; /**< pair-list data structures (local and non-local) */ - nb_staging_t nbst; /**< staging area where fshift/energies get downloaded */ - - cudaStream_t stream[2]; /**< local and non-local GPU streams */ + const gmx_device_info_t *dev_info; /**< CUDA device information */ + bool bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU */ + cu_atomdata_t *atdat; /**< atom data */ + cu_nbparam_t *nbparam; /**< parameters required for the non-bonded calc.
*/ + gmx::EnumerationArray<InteractionLocality, cu_plist_t *> plist; /**< pair-list data structures (local and non-local) */ + nb_staging_t nbst; /**< staging area where fshift/energies get downloaded */ + + gmx::EnumerationArray<InteractionLocality, cudaStream_t> stream; /**< local and non-local GPU streams */ /** events used for synchronization */ cudaEvent_t nonlocal_done; /**< event triggered when the non-local non-bonded kernel diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h index cd64c8ca1a..1624c56c8c 100644 --- a/src/gromacs/nbnxm/gpu_common.h +++ b/src/gromacs/nbnxm/gpu_common.h @@ -67,6 +67,9 @@ #include "gpu_common_utils.h" #include "nbnxm_gpu.h" +namespace Nbnxm +{ + /*! \brief Check that atom locality values are valid for the GPU module. * * In the GPU module atom locality "all" is not supported, the local and @@ -74,12 +77,16 @@ * * \param[in] atomLocality atom locality specifier */ -static inline void validateGpuAtomLocality(int atomLocality) +static inline void +validateGpuAtomLocality(const AtomLocality atomLocality) { std::string str = gmx::formatString("Invalid atom locality passed (%d); valid here is only " - "local (%d) or nonlocal (%d)", atomLocality, eatLocal, eatNonlocal); + "local (%d) or nonlocal (%d)", + static_cast<int>(atomLocality), + static_cast<int>(AtomLocality::Local), + static_cast<int>(AtomLocality::NonLocal)); - GMX_ASSERT(LOCAL_OR_NONLOCAL_A(atomLocality), str.c_str()); + GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal, str.c_str()); } /*! \brief Convert atom locality to interaction locality. @@ -90,18 +97,19 @@ static inline void validateGpuAtomLocality(int atomLocality) * \param[in] atomLocality Atom locality specifier * \returns Interaction locality corresponding to the atom locality passed. */ -static inline int gpuAtomToInteractionLocality(int atomLocality) +static inline InteractionLocality +gpuAtomToInteractionLocality(const AtomLocality atomLocality) { validateGpuAtomLocality(atomLocality); /* determine interaction locality from atom locality */ - if (LOCAL_A(atomLocality)) + if (atomLocality == AtomLocality::Local) { - return eintLocal; + return InteractionLocality::Local; } - else if (NONLOCAL_A(atomLocality)) + else if (atomLocality == AtomLocality::NonLocal) { - return eintNonlocal; + return InteractionLocality::NonLocal; } else { @@ -117,16 +125,17 @@ static inline int gpuAtomToInteractionLocality(int atomLocality) * \param[out] atomRangeLen Atom range length in the atom data array. */ template <typename AtomDataT> -static inline void getGpuAtomRange(const AtomDataT *atomData, - int atomLocality, - int *atomRangeBegin, - int *atomRangeLen) +static inline void +getGpuAtomRange(const AtomDataT *atomData, + const AtomLocality atomLocality, + int *atomRangeBegin, + int *atomRangeLen) { assert(atomData); validateGpuAtomLocality(atomLocality); /* calculate the atom data index range based on locality */ - if (LOCAL_A(atomLocality)) + if (atomLocality == AtomLocality::Local) { *atomRangeBegin = 0; *atomRangeLen = atomData->natoms_local; @@ -155,24 +164,27 @@ static inline void getGpuAtomRange(const AtomDataT *atomData, template <typename GpuTimers> static void countPruneKernelTime(GpuTimers *timers, gmx_wallclock_gpu_nbnxn_t *timings, - const int iloc) + const InteractionLocality iloc) { + gpu_timers_t::Interaction &iTimers = timers->interaction[iloc]; + // We might have not done any pruning (e.g. if we skipped with empty domains).
- if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc]) + if (!iTimers.didPrune && + !iTimers.didRollingPrune) { return; } - if (timers->didPrune[iloc]) + if (iTimers.didPrune) { timings->pruneTime.c++; - timings->pruneTime.t += timers->prune_k[iloc].getLastRangeTime(); + timings->pruneTime.t += iTimers.prune_k.getLastRangeTime(); } - if (timers->didRollingPrune[iloc]) + if (iTimers.didRollingPrune) { timings->dynamicPruneTime.c++; - timings->dynamicPruneTime.t += timers->rollingPrune_k[iloc].getLastRangeTime(); + timings->dynamicPruneTime.t += iTimers.rollingPrune_k.getLastRangeTime(); } } @@ -195,16 +207,17 @@ static void countPruneKernelTime(GpuTimers *timers, * \param[out] fshift Pointer to the array of shift forces to accumulate into */ template <typename StagingData> -static inline void nbnxn_gpu_reduce_staged_outputs(const StagingData &nbst, - int iLocality, - bool reduceEnergies, - bool reduceFshift, - real *e_lj, - real *e_el, - rvec *fshift) +static inline void +gpu_reduce_staged_outputs(const StagingData &nbst, + const InteractionLocality iLocality, + const bool reduceEnergies, + const bool reduceFshift, + real *e_lj, + real *e_el, + rvec *fshift) { /* add up energies and shift forces (only once at local F wait) */ - if (LOCAL_I(iLocality)) + if (iLocality == InteractionLocality::Local) { if (reduceEnergies) { @@ -244,12 +257,13 @@ static inline void nbnxn_gpu_reduce_staged_outputs(const StagingData &nbst, * */ template <typename GpuTimers, typename GpuPairlist> -static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings, - GpuTimers *timers, - const GpuPairlist *plist, - int atomLocality, - bool didEnergyKernels, - bool doTiming) +static inline void +gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings, + GpuTimers *timers, + const GpuPairlist *plist, + AtomLocality atomLocality, + bool didEnergyKernels, + bool doTiming) { /* timing data accumulation */ if (!doTiming) { @@ -258,10 +272,10 @@ static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timin } /* determine interaction locality from atom locality */ - int iLocality = gpuAtomToInteractionLocality(atomLocality); + const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality); /* only increase counter once (at local F wait) */ - if (LOCAL_I(iLocality)) + if (iLocality == InteractionLocality::Local) { timings->nb_c++; timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1; @@ -269,11 +283,11 @@ static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timin /* kernel timings */ timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t += - timers->nb_k[iLocality].getLastRangeTime(); + timers->interaction[iLocality].nb_k.getLastRangeTime(); /* X/q H2D and F D2H timings */ - timings->nb_h2d_t += timers->nb_h2d[iLocality].getLastRangeTime(); - timings->nb_d2h_t += timers->nb_d2h[iLocality].getLastRangeTime(); + timings->nb_h2d_t += timers->xf[atomLocality].nb_h2d.getLastRangeTime(); + timings->nb_d2h_t += timers->xf[atomLocality].nb_d2h.getLastRangeTime(); /* Count the pruning kernel times for both cases: 1st pass (at search step) and rolling pruning (if called at the previous step).
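// [Editor's sketch] What the enum-class conversion buys in the helpers above:
// locality-indexed data becomes a gmx::EnumerationArray instead of a raw [2]
// array indexed with eintLocal/eintNonlocal ints, so indexing with the wrong
// locality type no longer compiles. A small, hypothetical illustration
// (PerLocalityData is a stand-in type, not a GROMACS struct; the header paths
// are inferred from the includes added elsewhere in this patch):
#include "gromacs/nbnxm/locality.h"
#include "gromacs/utility/enumerationhelpers.h"

struct PerLocalityData {};

//! Mirrors gpuAtomToInteractionLocality() above; AtomLocality::All is invalid here,
//! exactly as validateGpuAtomLocality() enforces.
static PerLocalityData &selectByLocality(
        gmx::EnumerationArray<Nbnxm::InteractionLocality, PerLocalityData> &data,
        const Nbnxm::AtomLocality atomLocality)
{
    const Nbnxm::InteractionLocality iloc =
            (atomLocality == Nbnxm::AtomLocality::Local) ? Nbnxm::InteractionLocality::Local
                                                         : Nbnxm::InteractionLocality::NonLocal;
    return data[iloc]; // data[atomLocality] would now be a compile-time error
}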
@@ -284,39 +298,41 @@ static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timin countPruneKernelTime(timers, timings, iLocality); /* only count atdat and pair-list H2D at pair-search step */ - if (timers->didPairlistH2D[iLocality]) + if (timers->interaction[iLocality].didPairlistH2D) { /* atdat transfer timing (add only once, at local F wait) */ - if (LOCAL_A(atomLocality)) + if (atomLocality == AtomLocality::Local) { timings->pl_h2d_c++; timings->pl_h2d_t += timers->atdat.getLastRangeTime(); } - timings->pl_h2d_t += timers->pl_h2d[iLocality].getLastRangeTime(); + timings->pl_h2d_t += timers->interaction[iLocality].pl_h2d.getLastRangeTime(); /* Clear the timing flag for the next step */ - timers->didPairlistH2D[iLocality] = false; + timers->interaction[iLocality].didPairlistH2D = false; } } //TODO: move into shared source file with gmx_compile_cpp_as_cuda //NOLINTNEXTLINE(misc-definitions-in-headers) -bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t *nb, - int flags, - int aloc, - bool haveOtherWork, - real *e_lj, - real *e_el, - rvec *fshift, - GpuTaskCompletion completionKind) +bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb, + const int flags, + const AtomLocality aloc, + const bool haveOtherWork, + real *e_lj, + real *e_el, + rvec *fshift, + GpuTaskCompletion completionKind) { + GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); + /* determine interaction locality from atom locality */ - int iLocality = gpuAtomToInteractionLocality(aloc); + const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc); // We skip when during the non-local phase there was actually no work to do. // This is consistent with nbnxn_gpu_launch_kernel. - if (haveOtherWork || !canSkipWork(nb, iLocality)) + if (haveOtherWork || !canSkipWork(*nb, iLocality)) { // Query the state of the GPU stream and return early if we're not done if (completionKind == GpuTaskCompletion::Check) @@ -336,14 +352,14 @@ bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t *nb, bool calcEner = (flags & GMX_FORCE_ENERGY) != 0; bool calcFshift = (flags & GMX_FORCE_VIRIAL) != 0; - nbnxn_gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner, - nb->bDoTime != 0); + gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner, + nb->bDoTime != 0); - nbnxn_gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift); + gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift); } /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */ - nb->timers->didPrune[iLocality] = nb->timers->didRollingPrune[iLocality] = false; + nb->timers->interaction[iLocality].didPrune = nb->timers->interaction[iLocality].didRollingPrune = false; /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). 
*/ nb->plist[iLocality]->haveFreshList = false; @@ -368,16 +384,18 @@ bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t *nb, * \param[out] fshift Pointer to the shift force buffer to accumulate into */ //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file -void nbnxn_gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb, - int flags, - int aloc, - bool haveOtherWork, - real *e_lj, - real *e_el, - rvec *fshift) +void gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb, + int flags, + AtomLocality aloc, + bool haveOtherWork, + real *e_lj, + real *e_el, + rvec *fshift) { - nbnxn_gpu_try_finish_task(nb, flags, aloc, haveOtherWork, e_lj, e_el, fshift, - GpuTaskCompletion::Wait); + gpu_try_finish_task(nb, flags, aloc, haveOtherWork, e_lj, e_el, fshift, + GpuTaskCompletion::Wait); } +} // namespace Nbnxm + #endif diff --git a/src/gromacs/nbnxm/gpu_common_utils.h b/src/gromacs/nbnxm/gpu_common_utils.h index 40bffbb7b3..02febb47ab 100644 --- a/src/gromacs/nbnxm/gpu_common_utils.h +++ b/src/gromacs/nbnxm/gpu_common_utils.h @@ -54,6 +54,9 @@ #include "opencl/nbnxm_ocl_types.h" #endif +namespace Nbnxm +{ + /*! \brief An early return condition for empty NB GPU workloads * * This is currently used for non-local kernels/transfers only. @@ -61,10 +64,14 @@ * local part of the force array also depends on the non-local kernel. * The skip of the local kernel is taken care of separately. */ -static inline bool canSkipWork(const gmx_nbnxn_gpu_t *nb, int iloc) +static inline bool canSkipWork(const gmx_nbnxn_gpu_t &nb, + InteractionLocality iloc) { - assert(nb && nb->plist[iloc]); - return (iloc == eintNonlocal) && (nb->plist[iloc]->nsci == 0); + assert(nb.plist[iloc]); + return (iloc == InteractionLocality::NonLocal && + nb.plist[iloc]->nsci == 0); } +} // namespace Nbnxm + #endif diff --git a/src/gromacs/nbnxm/gpu_data_mgmt.h b/src/gromacs/nbnxm/gpu_data_mgmt.h index 94d92a77e3..f578cf5cb9 100644 --- a/src/gromacs/nbnxm/gpu_data_mgmt.h +++ b/src/gromacs/nbnxm/gpu_data_mgmt.h @@ -48,6 +48,7 @@ #include "gromacs/mdtypes/interaction_const.h" #include "gpu_types.h" +#include "locality.h" struct nonbonded_verlet_group_t; struct NbnxnPairlistGpu; @@ -57,90 +58,95 @@ struct gmx_wallclock_gpu_nbnxn_t; struct gmx_gpu_info_t; struct gmx_device_info_t; +namespace Nbnxm +{ + /** Initializes the data structures related to GPU nonbonded calculations. */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_init(gmx_nbnxn_gpu_t gmx_unused **p_nb, - const gmx_device_info_t gmx_unused *deviceInfo, - const interaction_const_t gmx_unused *ic, - const NbnxnListParameters gmx_unused *listParams, - const nbnxn_atomdata_t gmx_unused *nbat, - int gmx_unused rank, - /* true if both local and non-local are done on GPU */ - gmx_bool gmx_unused bLocalAndNonlocal) GPU_FUNC_TERM +void gpu_init(gmx_nbnxn_gpu_t gmx_unused **p_nb, + const gmx_device_info_t gmx_unused *deviceInfo, + const interaction_const_t gmx_unused *ic, + const NbnxnListParameters gmx_unused *listParams, + const nbnxn_atomdata_t gmx_unused *nbat, + int gmx_unused rank, + /* true if both local and non-local are done on GPU */ + gmx_bool gmx_unused bLocalAndNonlocal) GPU_FUNC_TERM /** Initializes pair-list data for GPU, called at every pair search step. 
*/ GPU_FUNC_QUALIFIER -void nbnxn_gpu_init_pairlist(gmx_nbnxn_gpu_t gmx_unused *nb, - const struct NbnxnPairlistGpu gmx_unused *h_nblist, - int gmx_unused iloc) GPU_FUNC_TERM +void gpu_init_pairlist(gmx_nbnxn_gpu_t gmx_unused *nb, + const struct NbnxnPairlistGpu gmx_unused *h_nblist, + InteractionLocality gmx_unused iloc) GPU_FUNC_TERM /** Initializes atom-data on the GPU, called at every pair search step. */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_init_atomdata(gmx_nbnxn_gpu_t gmx_unused *nb, - const nbnxn_atomdata_t gmx_unused *nbat) GPU_FUNC_TERM +void gpu_init_atomdata(gmx_nbnxn_gpu_t gmx_unused *nb, + const nbnxn_atomdata_t gmx_unused *nbat) GPU_FUNC_TERM /*! \brief Re-generate the GPU Ewald force table, resets rlist, and update the * electrostatic type switching to twin cut-off (or back) if needed. */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused *nbv, - const interaction_const_t gmx_unused *ic, - const NbnxnListParameters gmx_unused *listParams) GPU_FUNC_TERM +void gpu_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused *nbv, + const interaction_const_t gmx_unused *ic, + const NbnxnListParameters gmx_unused *listParams) GPU_FUNC_TERM /** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_gpu_t gmx_unused *nb, - const nbnxn_atomdata_t gmx_unused *nbatom) GPU_FUNC_TERM +void gpu_upload_shiftvec(gmx_nbnxn_gpu_t gmx_unused *nb, + const nbnxn_atomdata_t gmx_unused *nbatom) GPU_FUNC_TERM /** Clears GPU outputs: nonbonded force, shift force and energy. */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_clear_outputs(gmx_nbnxn_gpu_t gmx_unused *nb, - int gmx_unused flags) GPU_FUNC_TERM +void gpu_clear_outputs(gmx_nbnxn_gpu_t gmx_unused *nb, + int gmx_unused flags) GPU_FUNC_TERM /** Frees all GPU resources used for the nonbonded calculations. */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_free(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM +void gpu_free(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM /** Returns the GPU timings structure or NULL if GPU is not used or timing is off. */ GPU_FUNC_QUALIFIER -struct gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(nullptr) +struct gmx_wallclock_gpu_nbnxn_t *gpu_get_timings(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(nullptr) /** Resets nonbonded GPU timings. */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_reset_timings(struct nonbonded_verlet_t gmx_unused *nbv) GPU_FUNC_TERM +void gpu_reset_timings(struct nonbonded_verlet_t gmx_unused *nbv) GPU_FUNC_TERM /** Calculates the minimum size of proximity lists to improve SM load balance * with GPU non-bonded kernels. */ -GPU_FUNC_QUALIFIER -int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(-1) + GPU_FUNC_QUALIFIER +int gpu_min_ci_balanced(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(-1) /** Returns if analytical Ewald GPU kernels are used. */ GPU_FUNC_QUALIFIER -gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(FALSE) +gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(FALSE) /** Returns an opaque pointer to the GPU command stream * Note: CUDA only. 
 */
 CUDA_FUNC_QUALIFIER
-void *nbnxn_gpu_get_command_stream(gmx_nbnxn_gpu_t gmx_unused *nb,
-                                   int gmx_unused              iloc) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+void *gpu_get_command_stream(gmx_nbnxn_gpu_t gmx_unused     *nb,
+                             InteractionLocality gmx_unused  iloc) CUDA_FUNC_TERM_WITH_RETURN(nullptr)

 /** Returns an opaque pointer to the GPU coordinate+charge array
  *  Note: CUDA only.
  */
 CUDA_FUNC_QUALIFIER
-void *nbnxn_gpu_get_xq(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+void *gpu_get_xq(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)

 /** Returns an opaque pointer to the GPU force array
  *  Note: CUDA only.
  */
 CUDA_FUNC_QUALIFIER
-void *nbnxn_gpu_get_f(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+void *gpu_get_f(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)

 /** Returns an opaque pointer to the GPU shift force array
  *  Note: CUDA only.
  */
 CUDA_FUNC_QUALIFIER
-rvec *nbnxn_gpu_get_fshift(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+rvec *gpu_get_fshift(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+
+} // namespace Nbnxm

 #endif
diff --git a/src/gromacs/nbnxm/gpu_types_common.h b/src/gromacs/nbnxm/gpu_types_common.h
index 78893704bb..4c57f22bcd 100644
--- a/src/gromacs/nbnxm/gpu_types_common.h
+++ b/src/gromacs/nbnxm/gpu_types_common.h
@@ -45,6 +45,9 @@
 #include "config.h"

 #include "gromacs/nbnxm/pairlist.h"
+#include "gromacs/utility/enumerationhelpers.h"
+
+#include "locality.h"

 #if GMX_GPU == GMX_GPU_OPENCL
 #include "gromacs/gpu_utils/gpuregiontimer_ocl.h"
@@ -54,24 +57,46 @@
 #include "gromacs/gpu_utils/gpuregiontimer.cuh"
 #endif

+namespace Nbnxm
+{
+
 /*! \internal
  * \brief GPU region timers used for timing GPU kernels and H2D/D2H transfers.
  *
- * The two-sized arrays hold the local and non-local values and should always
- * be indexed with eintLocal/eintNonlocal.
+ * The local and non-local values are held in gmx::EnumerationArray members,
+ * indexed with AtomLocality or InteractionLocality.
  */
-struct nbnxn_gpu_timers_t
+struct gpu_timers_t
 {
-    GpuRegionTimer atdat;              /**< timer for atom data transfer (every PS step) */
-    GpuRegionTimer nb_h2d[2];          /**< timer for x/q H2D transfers (l/nl, every step) */
-    GpuRegionTimer nb_d2h[2];          /**< timer for f D2H transfer (l/nl, every step) */
-    GpuRegionTimer pl_h2d[2];          /**< timer for pair-list H2D transfers (l/nl, every PS step) */
-    bool           didPairlistH2D[2];  /**< true when a pair-list transfer has been done at this step */
-    GpuRegionTimer nb_k[2];            /**< timer for non-bonded kernels (l/nl, every step) */
-    GpuRegionTimer prune_k[2];         /**< timer for the 1st pass list pruning kernel (l/nl, every PS step) */
-    bool           didPrune[2];        /**< true when we timed pruning and the timings need to be accounted for */
-    GpuRegionTimer rollingPrune_k[2];  /**< timer for rolling pruning kernels (l/nl, frequency depends on chunk size) */
-    bool           didRollingPrune[2]; /**< true when we timed rolling pruning (at the previous step) and the timings need to be accounted for */
+    /*! \internal
+     * \brief Timers for local or non-local coordinate/force transfers
+     */
+    struct XFTransfers
+    {
+        GpuRegionTimer nb_h2d; /**< timer for x/q H2D transfers (l/nl, every step) */
+        GpuRegionTimer nb_d2h; /**< timer for f D2H transfer (l/nl, every step) */
+    };
+
+    /*! \internal
+     * \brief Timers for local or non-local interaction related operations
+     */
+    struct Interaction
+    {
+        GpuRegionTimer pl_h2d;                  /**< timer for pair-list H2D transfers (l/nl, every PS step) */
+        bool           didPairlistH2D = false;  /**< true when a pair-list transfer has been done at this step */
+        GpuRegionTimer nb_k;                    /**< timer for non-bonded kernels (l/nl, every step) */
+        GpuRegionTimer prune_k;                 /**< timer for the 1st pass list pruning kernel (l/nl, every PS step) */
+        bool           didPrune = false;        /**< true when we timed pruning and the timings need to be accounted for */
+        GpuRegionTimer rollingPrune_k;          /**< timer for rolling pruning kernels (l/nl, frequency depends on chunk size) */
+        bool           didRollingPrune = false; /**< true when we timed rolling pruning (at the previous step) and the timings need to be accounted for */
+    };
+
+    //! timer for atom data transfer (every PS step)
+    GpuRegionTimer atdat;
+    //! timers for coordinate/force transfers (every step)
+    gmx::EnumerationArray<AtomLocality, XFTransfers> xf;
+    //! timers for interaction related operations
+    gmx::EnumerationArray<InteractionLocality, Interaction> interaction;
 };

 struct gpu_plist
@@ -99,4 +124,6 @@ struct gpu_plist
     int rollingPruningPart; /**< the next part to which the rolling pruning needs to be applied */
 };

+} // namespace Nbnxm
+
 #endif
diff --git a/src/gromacs/nbnxm/kerneldispatch.cpp b/src/gromacs/nbnxm/kerneldispatch.cpp
index 276a2e69e3..14402aca91 100644
--- a/src/gromacs/nbnxm/kerneldispatch.cpp
+++ b/src/gromacs/nbnxm/kerneldispatch.cpp
@@ -413,13 +413,13 @@ nbnxn_kernel_cpu(const nonbonded_verlet_group_t *nbvg,
     }
 }

-static void accountFlops(t_nrnb                    *nrnb,
-                         const nonbonded_verlet_t  &nbv,
-                         const int                  ilocality,
-                         const interaction_const_t &ic,
-                         int                        forceFlags)
+static void accountFlops(t_nrnb                           *nrnb,
+                         const nonbonded_verlet_t         &nbv,
+                         const Nbnxm::InteractionLocality  iLocality,
+                         const interaction_const_t        &ic,
+                         const int                         forceFlags)
 {
-    const nonbonded_verlet_group_t &nbvg = nbv.grp[ilocality];
+    const nonbonded_verlet_group_t &nbvg = nbv.grp[iLocality];
     const bool usingGpuKernels = (nbvg.kernel_type == nbnxnk8x8x8_GPU);

     int enr_nbnxn_kernel_ljc;
@@ -428,7 +428,7 @@ static void accountFlops(t_nrnb *nrnb,
         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
     }
     else if ((!usingGpuKernels && nbvg.ewald_excl == ewaldexclAnalytical) ||
-             (usingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv.gpu_nbv)))
+             (usingGpuKernels && Nbnxm::gpu_is_kernel_ewald_analytical(nbv.gpu_nbv)))
     {
         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
     }
@@ -474,7 +474,7 @@ static void accountFlops(t_nrnb *nrnb,
 }

 void NbnxnDispatchKernel(nonbonded_verlet_t        *nbv,
-                         const int                  ilocality,
+                         Nbnxm::InteractionLocality iLocality,
                          const interaction_const_t &ic,
                          int                        forceFlags,
                          int                        clearF,
@@ -482,7 +482,7 @@ void NbnxnDispatchKernel(nonbonded_verlet_t *nbv,
                          gmx_enerdata_t            *enerd,
                          t_nrnb                    *nrnb)
 {
-    const nonbonded_verlet_group_t &nbvg = nbv->grp[ilocality];
+    const nonbonded_verlet_group_t &nbvg = nbv->grp[iLocality];

     switch (nbvg.kernel_type)
     {
@@ -503,7 +503,7 @@ void NbnxnDispatchKernel(nonbonded_verlet_t *nbv,
             break;

         case nbnxnk8x8x8_GPU:
-            nbnxn_gpu_launch_kernel(nbv->gpu_nbv, forceFlags, ilocality);
+            Nbnxm::gpu_launch_kernel(nbv->gpu_nbv, forceFlags, iLocality);
             break;

         case nbnxnk8x8x8_PlainC:
@@ -525,5 +525,5 @@ void NbnxnDispatchKernel(nonbonded_verlet_t *nbv,
     }

-    accountFlops(nrnb, *nbv, ilocality, ic, forceFlags);
+    accountFlops(nrnb, *nbv, iLocality, ic, forceFlags);
 }
diff --git a/src/gromacs/nbnxm/locality.h b/src/gromacs/nbnxm/locality.h
new file mode 100644
index 0000000000..6ebd4fb409
--- /dev/null
+++
b/src/gromacs/nbnxm/locality.h @@ -0,0 +1,76 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \libinternal \file + * \brief Defines nbnxn locality enums + * + * \author Berk Hess + * \ingroup module_nbnxm + */ + +#ifndef GMX_NBNXM_LOCALITY_H +#define GMX_NBNXM_LOCALITY_H + +namespace Nbnxm +{ + +/*! \brief Atom locality indicator: local, non-local, all. + * + * Used for calls to: + * gridding, force calculation, x/f buffer operations + */ +enum class AtomLocality : int +{ + Local = 0, //!< Local atoms + NonLocal = 1, //!< Non-local atoms + All = 2, //!< Both local and non-local atoms + Count = 3 //!< The number of atom locality types +}; + +/*! \brief Interaction locality indicator: local, non-local, all. + * + * Used for calls to: + * pair-search, force calculation, x/f buffer operations + */ +enum class InteractionLocality : int +{ + Local = 0, //!< Interactions between local atoms only + NonLocal = 1, //!< Interactions between non-local and (non-)local atoms + Count = 2 //!< The number of interaction locality types +}; + +} // namespace Nbnxm + +#endif // GMX_NBNXM_LOCALITY_H diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index 59886a5f8b..9e0138e162 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -104,8 +104,11 @@ #include "gromacs/nbnxm/pairlist.h" #include "gromacs/nbnxm/pairlistset.h" #include "gromacs/utility/arrayref.h" +#include "gromacs/utility/enumerationhelpers.h" #include "gromacs/utility/real.h" +#include "locality.h" + // TODO: Remove this include and the two nbnxm includes above #include "nbnxm_gpu.h" @@ -148,50 +151,23 @@ typedef enum nbnxnkNR } nbnxn_kernel_type; +namespace Nbnxm +{ + /*! \brief Return a string identifying the kernel type. 
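The explicit Count members exist because gmx::EnumerationArray (gromacs/utility/enumerationhelpers.h) takes its size from them, which is what turns the [2]-sized arrays elsewhere in this change into type-checked containers. A minimal sketch of the idiom, assuming only the two headers named here:

    #include "gromacs/nbnxm/locality.h"
    #include "gromacs/utility/enumerationhelpers.h"

    // One slot per InteractionLocality value; the size is taken from
    // InteractionLocality::Count at compile time.
    gmx::EnumerationArray<Nbnxm::InteractionLocality, int> numPairs;

    numPairs[Nbnxm::InteractionLocality::Local]    = 0;
    numPairs[Nbnxm::InteractionLocality::NonLocal] = 0;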
 *
 * \param [in] kernel_type nonbonded kernel types, takes values from the nbnxn_kernel_type enum
 * \returns a string identifying the kernel corresponding to the type passed as argument
 */
-const char *lookup_nbnxn_kernel_name(int kernel_type);
+const char *lookup_kernel_name(int kernel_type);
+
+} // namespace Nbnxm

 /*! \brief Ewald exclusion types */
 enum {
     ewaldexclTable, ewaldexclAnalytical
 };

-/*! \brief Atom locality indicator: local, non-local, all.
- *
- * Used for calls to:
- * gridding, pair-search, force calculation, x/f buffer operations
- * */
-enum {
-    eatLocal = 0, eatNonlocal = 1, eatAll
-};
-
-/*! \brief Tests for local atom range */
-#define LOCAL_A(x)               ((x) == eatLocal)
-/*! \brief Tests for non-local atom range */
-#define NONLOCAL_A(x)            ((x) == eatNonlocal)
-/*! \brief Tests for either local or non-local atom range */
-#define LOCAL_OR_NONLOCAL_A(x)   (LOCAL_A(x) || NONLOCAL_A(x))
-
-/*! \brief Interaction locality indicator
- *
- * Used in pair-list search/calculations in the following manner:
- * - local interactions require local atom data and affect local output only;
- * - non-local interactions require both local and non-local atom data and
- *   affect both local- and non-local output.
- */
-enum {
-    eintLocal = 0, eintNonlocal = 1
-};
-
-/*! \brief Tests for local interaction indicator */
-#define LOCAL_I(x)               ((x) == eintLocal)
-/*! \brief Tests for non-local interaction indicator */
-#define NONLOCAL_I(x)            ((x) == eintNonlocal)
-
 /*! \brief Flag to tell the nonbonded kernels whether to clear the force output buffers */
 enum {
     enbvClearFNo, enbvClearFYes
@@ -207,19 +183,25 @@ typedef struct nonbonded_verlet_group_t {
 /*! \libinternal
  * \brief Top-level non-bonded data structure for the Verlet-type cut-off scheme. */
-typedef struct nonbonded_verlet_t {
-    std::unique_ptr<NbnxnListParameters> listParams; /**< Parameters for the search and list pruning setup */
-    std::unique_ptr<nbnxn_search>        nbs;        /**< n vs n atom pair searching data       */
-    int                                  ngrp;       /**< number of interaction groups          */
-    nonbonded_verlet_group_t             grp[2];     /**< local and non-local interaction group */
-    nbnxn_atomdata_t                    *nbat;       /**< atom data                             */
-
-    gmx_bool             bUseGPU;         /**< TRUE when non-bonded interactions are computed on a physical GPU */
-    EmulateGpuNonbonded  emulateGpu;      /**< true when non-bonded interactions are computed on the CPU using GPU-style pair lists */
-    gmx_nbnxn_gpu_t     *gpu_nbv;         /**< pointer to GPU nb verlet data */
-    int                  min_ci_balanced; /**< pair list balancing parameter
-                                               used for the 8x8x8 GPU kernels */
-} nonbonded_verlet_t;
+struct nonbonded_verlet_t
+{
+    std::unique_ptr<NbnxnListParameters> listParams; /**< Parameters for the search and list pruning setup */
+    std::unique_ptr<nbnxn_search>        nbs;        /**< n vs n atom pair searching data */
+    int                                  ngrp;       /**< number of interaction groups */
+    //! Local and non-local interaction group
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, nonbonded_verlet_group_t> grp;
+    //! Atom data
+    nbnxn_atomdata_t                    *nbat;
+
+    gmx_bool             bUseGPU;         /**< TRUE when non-bonded interactions are computed on a physical GPU */
+    EmulateGpuNonbonded  emulateGpu;      /**< true when non-bonded interactions are computed on the CPU using GPU-style pair lists */
+    gmx_nbnxn_gpu_t     *gpu_nbv;         /**< pointer to GPU nb verlet data */
+    int                  min_ci_balanced; /**< pair list balancing parameter
+                                               used for the 8x8x8 GPU kernels */
+};
+
+namespace Nbnxm
+{

 /*! \brief Initializes the nbnxn module */
 void init_nb_verlet(const gmx::MDLogger     &mdlog,
@@ -233,6 +215,8 @@ void init_nb_verlet(const gmx::MDLogger &mdlog,
                     const gmx_mtop_t        *mtop,
                     matrix                   box);

+} // namespace Nbnxm
+
 /*! \brief Put the atoms on the pair search grid.
* * Only atoms atomStart to atomEnd in x are put on the grid. @@ -290,18 +274,18 @@ gmx::ArrayRef nbnxn_get_gridindices(const nbnxn_search* nbs); * pairs beyond the pairlist inner radius and writes the result to a list that is * to be consumed by the non-bonded kernel. */ -void NbnxnDispatchPruneKernel(nonbonded_verlet_t *nbv, - int ilocality, - const rvec *shift_vec); +void NbnxnDispatchPruneKernel(nonbonded_verlet_t *nbv, + Nbnxm::InteractionLocality iLocality, + const rvec *shift_vec); /*! \brief Executes the non-bonded kernel of the GPU or launches it on the GPU */ -void NbnxnDispatchKernel(nonbonded_verlet_t *nbv, - int ilocality, - const interaction_const_t &ic, - int forceFlags, - int clearF, - t_forcerec *fr, - gmx_enerdata_t *enerd, - t_nrnb *nrnb); +void NbnxnDispatchKernel(nonbonded_verlet_t *nbv, + Nbnxm::InteractionLocality iLocality, + const interaction_const_t &ic, + int forceFlags, + int clearF, + t_forcerec *fr, + gmx_enerdata_t *enerd, + t_nrnb *nrnb); #endif // GMX_NBNXN_NBNXN_H diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index 6aa7c21752..8e5f77a9af 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -49,10 +49,14 @@ #include "gromacs/utility/real.h" #include "gpu_types.h" +#include "locality.h" struct nbnxn_atomdata_t; enum class GpuTaskCompletion; +namespace Nbnxm +{ + /*! \brief * Launch asynchronously the xq buffer host to device copy. * @@ -61,14 +65,14 @@ enum class GpuTaskCompletion; * * \param [in] nb GPU nonbonded data. * \param [in] nbdata Host-side atom data structure. - * \param [in] iloc Interaction locality flag. + * \param [in] aloc Atom locality flag. * \param [in] haveOtherWork True if there are other tasks that require the nbnxn coordinate input. */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_gpu_t gmx_unused *nb, - const struct nbnxn_atomdata_t gmx_unused *nbdata, - int gmx_unused iloc, - bool gmx_unused haveOtherWork) GPU_FUNC_TERM +void gpu_copy_xq_to_gpu(gmx_nbnxn_gpu_t gmx_unused *nb, + const struct nbnxn_atomdata_t gmx_unused *nbdata, + AtomLocality gmx_unused aloc, + bool gmx_unused haveOtherWork) GPU_FUNC_TERM /*! \brief * Launch asynchronously the nonbonded force calculations. @@ -81,9 +85,9 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_gpu_t gmx_unused *nb, * */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused *nb, - int gmx_unused flags, - int gmx_unused iloc) GPU_FUNC_TERM +void gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused *nb, + int gmx_unused flags, + InteractionLocality gmx_unused iloc) GPU_FUNC_TERM /*! \brief * Launch asynchronously the nonbonded prune-only kernel. @@ -121,9 +125,9 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused *nb, * \param [in] numParts Number of parts the pair list is split into in the rolling kernel. */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused *nb, - int gmx_unused iloc, - int gmx_unused numParts) GPU_FUNC_TERM +void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused *nb, + InteractionLocality gmx_unused iloc, + int gmx_unused numParts) GPU_FUNC_TERM /*! \brief * Launch asynchronously the download of nonbonded forces from the GPU @@ -132,11 +136,11 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused *nb, * no non-bonded work. 
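Several of the entry points above take an AtomLocality although the work runs in a stream selected per interaction locality; the implementations translate one into the other with gpuAtomToInteractionLocality(). A sketch of the mapping that helper has to perform; the real one lives in gpu_common.h and may differ in how it rejects AtomLocality::All, which has no single stream:

    using Nbnxm::AtomLocality;
    using Nbnxm::InteractionLocality;

    static InteractionLocality atomToInteractionLocality(const AtomLocality aloc)
    {
        // Local atoms feed the local stream, non-local atoms the non-local one.
        return (aloc == AtomLocality::Local) ? InteractionLocality::Local
                                             : InteractionLocality::NonLocal;
    }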
*/ GPU_FUNC_QUALIFIER -void nbnxn_gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb, - struct nbnxn_atomdata_t gmx_unused *nbatom, - int gmx_unused flags, - int gmx_unused aloc, - bool gmx_unused haveOtherWork) GPU_FUNC_TERM +void gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb, + nbnxn_atomdata_t gmx_unused *nbatom, + int gmx_unused flags, + AtomLocality gmx_unused aloc, + bool gmx_unused haveOtherWork) GPU_FUNC_TERM /*! \brief Attempts to complete nonbonded GPU task. * @@ -172,14 +176,14 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb, * \returns True if the nonbonded tasks associated with \p aloc locality have completed */ GPU_FUNC_QUALIFIER -bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, - int gmx_unused flags, - int gmx_unused aloc, - bool gmx_unused haveOtherWork, - real gmx_unused *e_lj, - real gmx_unused *e_el, - rvec gmx_unused *fshift, - GpuTaskCompletion gmx_unused completionKind) GPU_FUNC_TERM_WITH_RETURN(false) +bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, + int gmx_unused flags, + AtomLocality gmx_unused aloc, + bool gmx_unused haveOtherWork, + real gmx_unused *e_lj, + real gmx_unused *e_el, + rvec gmx_unused *fshift, + GpuTaskCompletion gmx_unused completionKind) GPU_FUNC_TERM_WITH_RETURN(false) /*! \brief Completes the nonbonded GPU task blocking until GPU tasks and data * transfers to finish. @@ -197,16 +201,18 @@ bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, * \param[out] fshift Pointer to the shift force buffer to accumulate into */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, - int gmx_unused flags, - int gmx_unused aloc, - bool gmx_unused haveOtherWork, - real gmx_unused *e_lj, - real gmx_unused *e_el, - rvec gmx_unused *fshift) GPU_FUNC_TERM +void gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, + int gmx_unused flags, + AtomLocality gmx_unused aloc, + bool gmx_unused haveOtherWork, + real gmx_unused *e_lj, + real gmx_unused *e_el, + rvec gmx_unused *fshift) GPU_FUNC_TERM /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */ GPU_FUNC_QUALIFIER -int nbnxn_gpu_pick_ewald_kernel_type(bool gmx_unused bTwinCut) GPU_FUNC_TERM_WITH_RETURN(-1) +int gpu_pick_ewald_kernel_type(bool gmx_unused bTwinCut) GPU_FUNC_TERM_WITH_RETURN(-1) + +} // namespace Nbnxm #endif diff --git a/src/gromacs/nbnxm/nbnxm_setup.cpp b/src/gromacs/nbnxm/nbnxm_setup.cpp index d4dafe11c7..0565d3faf0 100644 --- a/src/gromacs/nbnxm/nbnxm_setup.cpp +++ b/src/gromacs/nbnxm/nbnxm_setup.cpp @@ -62,6 +62,9 @@ #include "grid.h" #include "internal.h" +namespace Nbnxm +{ + /*! 
\brief Returns whether CPU SIMD support exists for the given inputrec * * If the return value is FALSE and fplog/cr != NULL, prints a fallback @@ -134,7 +137,7 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir, /* One 256-bit FMA per cycle makes 2xNN faster */ *kernel_type = nbnxnk4xN_SIMD_2xNN; } -#endif /* GMX_NBNXN_SIMD_2XNN && GMX_NBNXN_SIMD_4XN */ +#endif /* GMX_NBNXN_SIMD_2XNN && GMX_NBNXN_SIMD_4XN */ if (getenv("GMX_NBNXN_SIMD_4XN") != nullptr) @@ -181,10 +184,10 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir, } } -#endif // GMX_SIMD +#endif // GMX_SIMD } -const char *lookup_nbnxn_kernel_name(int kernel_type) +const char *lookup_kernel_name(int kernel_type) { const char *returnvalue = nullptr; switch (kernel_type) @@ -260,7 +263,7 @@ static void pick_nbnxn_kernel(const gmx::MDLogger &mdlog, { GMX_LOG(mdlog.info).asParagraph().appendTextFormatted( "Using %s %dx%d nonbonded short-range kernels", - lookup_nbnxn_kernel_name(*kernel_type), + lookup_kernel_name(*kernel_type), nbnxn_kernel_to_cluster_i_size(*kernel_type), nbnxn_kernel_to_cluster_j_size(*kernel_type)); @@ -270,7 +273,7 @@ static void pick_nbnxn_kernel(const gmx::MDLogger &mdlog, GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted( "WARNING: Using the slow %s kernels. This should\n" "not happen during routine usage on supported platforms.", - lookup_nbnxn_kernel_name(*kernel_type)); + lookup_kernel_name(*kernel_type)); } } } @@ -388,13 +391,13 @@ void init_nb_verlet(const gmx::MDLogger &mdlog, { /* init the NxN GPU data; the last argument tells whether we'll have * both local and non-local NB calculation on GPU */ - nbnxn_gpu_init(&nbv->gpu_nbv, - deviceInfo, - fr->ic, - nbv->listParams.get(), - nbv->nbat, - cr->nodeid, - (nbv->ngrp > 1)); + gpu_init(&nbv->gpu_nbv, + deviceInfo, + fr->ic, + nbv->listParams.get(), + nbv->nbat, + cr->nodeid, + (nbv->ngrp > 1)); if ((env = getenv("GMX_NB_MIN_CI")) != nullptr) { @@ -414,7 +417,7 @@ void init_nb_verlet(const gmx::MDLogger &mdlog, } else { - nbv->min_ci_balanced = nbnxn_gpu_min_ci_balanced(nbv->gpu_nbv); + nbv->min_ci_balanced = gpu_min_ci_balanced(nbv->gpu_nbv); if (debug) { fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n", @@ -426,3 +429,5 @@ void init_nb_verlet(const gmx::MDLogger &mdlog, *nb_verlet = nbv; } + +} // namespace Nbnxm diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index 8aac82d3d3..34132dbbd7 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -88,6 +88,9 @@ #include "nbnxm_ocl_internal.h" #include "nbnxm_ocl_types.h" +namespace Nbnxm +{ + /*! \brief Convenience constants */ //@{ static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster; @@ -359,19 +362,22 @@ static void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event) } /*! \brief Launch asynchronously the xq buffer host to device copy. 
*/ -void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb, - const nbnxn_atomdata_t *nbatom, - int iloc, - bool haveOtherWork) +void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb, + const nbnxn_atomdata_t *nbatom, + const AtomLocality atomLocality, + const bool haveOtherWork) { - int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ + const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); + + /* local/nonlocal offset and length used for xq and f */ + int adat_begin, adat_len; cl_atomdata_t *adat = nb->atdat; cl_plist_t *plist = nb->plist[iloc]; cl_timers_t *t = nb->timers; cl_command_queue stream = nb->stream[iloc]; - bool bDoTime = (nb->bDoTime) != 0; + bool bDoTime = (nb->bDoTime) != 0; /* Don't launch the non-local H2D copy if there is no dependent work to do: neither non-local nor other (e.g. bonded) work @@ -382,7 +388,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb, we always call the local local x+q copy (and the rest of the local work in nbnxn_gpu_launch_kernel(). */ - if (!haveOtherWork && canSkipWork(nb, iloc)) + if (!haveOtherWork && canSkipWork(*nb, iloc)) { plist->haveFreshList = false; @@ -390,7 +396,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb, } /* calculate the atom data index range based on locality */ - if (LOCAL_I(iloc)) + if (atomLocality == AtomLocality::Local) { adat_begin = 0; adat_len = adat->natoms_local; @@ -404,16 +410,16 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb, /* beginning of timed HtoD section */ if (bDoTime) { - t->nb_h2d[iloc].openTimingRegion(stream); + t->xf[atomLocality].nb_h2d.openTimingRegion(stream); } /* HtoD x, q */ ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin*sizeof(float)*4, - adat_len * sizeof(float) * 4, stream, bDoTime ? t->nb_h2d[iloc].fetchNextEvent() : nullptr); + adat_len * sizeof(float) * 4, stream, bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr); if (bDoTime) { - t->nb_h2d[iloc].closeTimingRegion(stream); + t->xf[atomLocality].nb_h2d.closeTimingRegion(stream); } /* When we get here all misc operations issues in the local stream as well as @@ -421,7 +427,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb, so we record that in the local stream and wait for it in the nonlocal one. */ if (nb->bUseTwoStreams) { - if (iloc == eintLocal) + if (iloc == InteractionLocality::Local) { cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->misc_ops_and_local_H2D_done)); assert(CL_SUCCESS == cl_error); @@ -459,9 +465,9 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb, misc_ops_done event to record the point in time when the above operations are finished and synchronize with this event in the non-local stream. */ -void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, - int flags, - int iloc) +void gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, + const int flags, + const Nbnxm::InteractionLocality iloc) { /* OpenCL kernel launch-related stuff */ cl_kernel nb_kernel = nullptr; /* fn pointer to the nonbonded kernel */ @@ -487,7 +493,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, clearing. All these operations, except for the local interaction kernel, are needed for the non-local interactions. The skip of the local kernel call is taken care of later in this function. 
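The locality test above selects the slice of the packed atom arrays to transfer: local atoms occupy the front of the xq/f buffers, non-local atoms the tail. Restated as a small helper for clarity; the name and struct are illustrative, the patch computes adat_begin/adat_len inline, and the non-local branch falls outside the quoted hunk:

    struct AtomRange
    {
        int begin; // first atom index of the slice
        int len;   // number of atoms in the slice
    };

    static AtomRange atomRange(const cl_atomdata_t       &adat,
                               const Nbnxm::AtomLocality  aloc)
    {
        if (aloc == Nbnxm::AtomLocality::Local)
        {
            return { 0, adat.natoms_local };               // [0, natoms_local)
        }
        return { adat.natoms_local,
                 adat.natoms - adat.natoms_local };        // the non-local tail
    }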
*/ - if (canSkipWork(nb, iloc)) + if (canSkipWork(*nb, iloc)) { plist->haveFreshList = false; @@ -500,7 +506,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, (that's the way the timing accounting can distinguish between separate prune kernel and combined force+prune). */ - nbnxn_gpu_launch_kernel_pruneonly(nb, iloc, 1); + Nbnxm::gpu_launch_kernel_pruneonly(nb, iloc, 1); } if (plist->nsci == 0) @@ -513,7 +519,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, /* beginning of timed nonbonded calculation section */ if (bDoTime) { - t->nb_k[iloc].openTimingRegion(stream); + t->interaction[iloc].nb_k.openTimingRegion(stream); } /* get the pointer to the kernel flavor we need to use */ @@ -521,7 +527,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, nbp->eeltype, nbp->vdwtype, bCalcEner, - (plist->haveFreshList && !nb->timers->didPrune[iloc])); + (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune)); /* kernel launch config */ @@ -545,7 +551,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, fillin_ocl_structures(nbp, &nbparams_params); - auto *timingEvent = bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr; + auto *timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr; constexpr char kernelName[] = "k_calc_nb"; if (useLjCombRule(nb->nbparam->vdwtype)) { @@ -570,7 +576,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, if (bDoTime) { - t->nb_k[iloc].closeTimingRegion(stream); + t->interaction[iloc].nb_k.closeTimingRegion(stream); } } @@ -600,9 +606,9 @@ static inline int calc_shmem_required_prune(const int num_threads_z) return shmem; } -void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, - int iloc, - int numParts) +void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, + const InteractionLocality iloc, + const int numParts) { cl_atomdata_t *adat = nb->atdat; cl_nbparam_t *nbp = nb->nbparam; @@ -656,7 +662,7 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, GpuRegionTimer *timer = nullptr; if (bDoTime) { - timer = &(plist->haveFreshList ? t->prune_k[iloc] : t->rollingPrune_k[iloc]); + timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k); } /* beginning of timed prune calculation section */ @@ -708,12 +714,12 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, { plist->haveFreshList = false; /* Mark that pruning has been done */ - nb->timers->didPrune[iloc] = true; + nb->timers->interaction[iloc].didPrune = true; } else { /* Mark that rolling pruning has been done */ - nb->timers->didRollingPrune[iloc] = true; + nb->timers->interaction[iloc].didRollingPrune = true; } if (bDoTime) @@ -726,29 +732,29 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, * Launch asynchronously the download of nonbonded forces from the GPU * (and energies/shift forces if required). 
*/ -void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, - struct nbnxn_atomdata_t *nbatom, - int flags, - int aloc, - bool haveOtherWork) +void gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, + struct nbnxn_atomdata_t *nbatom, + const int flags, + const AtomLocality aloc, + const bool haveOtherWork) { cl_int gmx_unused cl_error; int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ /* determine interaction locality from atom locality */ - int iloc = gpuAtomToInteractionLocality(aloc); + const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc); - cl_atomdata_t *adat = nb->atdat; - cl_timers_t *t = nb->timers; - bool bDoTime = nb->bDoTime == CL_TRUE; - cl_command_queue stream = nb->stream[iloc]; + cl_atomdata_t *adat = nb->atdat; + cl_timers_t *t = nb->timers; + bool bDoTime = nb->bDoTime == CL_TRUE; + cl_command_queue stream = nb->stream[iloc]; - bool bCalcEner = (flags & GMX_FORCE_ENERGY) != 0; - int bCalcFshift = flags & GMX_FORCE_VIRIAL; + bool bCalcEner = (flags & GMX_FORCE_ENERGY) != 0; + int bCalcFshift = flags & GMX_FORCE_VIRIAL; /* don't launch non-local copy-back if there was no non-local work to do */ - if (!haveOtherWork && canSkipWork(nb, iloc)) + if (!haveOtherWork && canSkipWork(*nb, iloc)) { /* TODO An alternative way to signal that non-local work is complete is to use a clEnqueueMarker+clEnqueueBarrier @@ -767,19 +773,19 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, /* beginning of timed D2H section */ if (bDoTime) { - t->nb_d2h[iloc].openTimingRegion(stream); + t->xf[aloc].nb_d2h.openTimingRegion(stream); } /* With DD the local D2H transfer can only start after the non-local has been launched. */ - if (iloc == eintLocal && nb->bNonLocalStreamActive) + if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive) { sync_ocl_event(stream, &(nb->nonlocal_done)); } /* DtoH f */ ocl_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f, adat_begin*3*sizeof(float), - (adat_len)* adat->f_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr); + (adat_len)* adat->f_elem_size, stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); /* kick off work */ cl_error = clFlush(stream); @@ -789,7 +795,7 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, recorded which signals that the local D2H can proceed. This event is not placed after the non-local kernel because we first need the non-local data back first. */ - if (iloc == eintNonlocal) + if (iloc == InteractionLocality::NonLocal) { cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->nonlocal_done)); assert(CL_SUCCESS == cl_error); @@ -797,35 +803,35 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, } /* only transfer energies in the local stream */ - if (LOCAL_I(iloc)) + if (iloc == InteractionLocality::Local) { /* DtoH fshift */ if (bCalcFshift) { ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0, - SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr); + SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); } /* DtoH energies */ if (bCalcEner) { ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, - sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr); + sizeof(float), stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0, - sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr); + sizeof(float), stream, bDoTime ? 
t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); } } if (bDoTime) { - t->nb_d2h[iloc].closeTimingRegion(stream); + t->xf[aloc].nb_d2h.closeTimingRegion(stream); } } /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */ -int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut) +int gpu_pick_ewald_kernel_type(const bool bTwinCut) { bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald; int kernel_type; @@ -879,3 +885,5 @@ int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut) return kernel_type; } + +} // namespace Nbnxm diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index 27a2cd806e..72be765441 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -50,6 +50,11 @@ #include +// TODO We would like to move this down, but the way gmx_nbnxn_gpu_t +// is currently declared means this has to be before gpu_types.h +#include "nbnxm_ocl_types.h" + +// TODO Remove this comment when the above order issue is resolved #include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/gpu_utils/oclutils.h" #include "gromacs/hardware/gpu_hw_info.h" @@ -70,7 +75,9 @@ #include "gromacs/utility/smalloc.h" #include "nbnxm_ocl_internal.h" -#include "nbnxm_ocl_types.h" + +namespace Nbnxm +{ /*! \brief This parameter should be determined heuristically from the * kernel execution times @@ -276,7 +283,7 @@ map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic, else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD)) { /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */ - *gpu_eeltype = nbnxn_gpu_pick_ewald_kernel_type(false); + *gpu_eeltype = gpu_pick_ewald_kernel_type(false); } else { @@ -398,11 +405,11 @@ static void init_nbparam(cl_nbparam_t *nbp, } //! This function is documented in the header file -void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv, - const interaction_const_t *ic, - const NbnxnListParameters *listParams) +void gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv, + const interaction_const_t *ic, + const NbnxnListParameters *listParams) { - if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU) + if (!nbv || nbv->grp[InteractionLocality::Local].kernel_type != nbnxnk8x8x8_GPU) { return; } @@ -411,7 +418,7 @@ void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv, set_cutoff_parameters(nbp, ic, listParams); - nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw); + nbp->eeltype = gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw); init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_rundata); } @@ -440,19 +447,6 @@ static void init_plist(cl_plist_t *pl) pl->haveFreshList = false; } -/*! \brief Initializes the timer data structure. - */ -static void init_timers(cl_timers_t *t, - bool bUseTwoStreams) -{ - for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++) - { - t->didPairlistH2D[i] = false; - t->didPrune[i] = false; - t->didRollingPrune[i] = false; - } -} - /*! \brief Initializes the timings data structure. 
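Deleting init_timers() (just above) is safe because the flags in the new Interaction sub-struct of gpu_timers_t carry in-class initializers, so a freshly constructed timer object already has them cleared. The idiom in isolation, trimmed to the flags only:

    struct Interaction
    {
        bool didPairlistH2D  = false; // cleared on construction, so no
        bool didPrune        = false; // separate init_timers()-style loop
        bool didRollingPrune = false; // is needed any more
    };

    Interaction t; // t.didPrune == false without any explicit initialization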
 */
 static void init_timings(gmx_wallclock_gpu_nbnxn_t *t)
@@ -547,7 +541,7 @@ nbnxn_ocl_clear_e_fshift(gmx_nbnxn_ocl_t *nb)
     cl_int           cl_error;
     cl_atomdata_t   *adat = nb->atdat;
-    cl_command_queue ls   = nb->stream[eintLocal];
+    cl_command_queue ls   = nb->stream[InteractionLocality::Local];

     size_t           local_work_size[3]  = {1, 1, 1};
     size_t           global_work_size[3] = {1, 1, 1};
@@ -613,13 +607,13 @@ static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t *nb,

 //! This function is documented in the header file
-void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
-                    const gmx_device_info_t   *deviceInfo,
-                    const interaction_const_t *ic,
-                    const NbnxnListParameters *listParams,
-                    const nbnxn_atomdata_t    *nbat,
-                    int                        rank,
-                    gmx_bool                   bLocalAndNonlocal)
+void gpu_init(gmx_nbnxn_ocl_t          **p_nb,
+              const gmx_device_info_t   *deviceInfo,
+              const interaction_const_t *ic,
+              const NbnxnListParameters *listParams,
+              const nbnxn_atomdata_t    *nbat,
+              const int                  rank,
+              const gmx_bool             bLocalAndNonlocal)
 {
     gmx_nbnxn_ocl_t *nb;
     cl_int           cl_error;
@@ -635,10 +629,10 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb,
     snew(nb, 1);
     snew(nb->atdat, 1);
     snew(nb->nbparam, 1);
-    snew(nb->plist[eintLocal], 1);
+    snew(nb->plist[InteractionLocality::Local], 1);
     if (bLocalAndNonlocal)
     {
-        snew(nb->plist[eintNonlocal], 1);
+        snew(nb->plist[InteractionLocality::NonLocal], 1);
     }

     nb->bUseTwoStreams = static_cast<cl_bool>(bLocalAndNonlocal);
@@ -655,7 +649,7 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb,
     pmalloc(reinterpret_cast<void**>(&nb->nbst.e_el), sizeof(*nb->nbst.e_el));
     pmalloc(reinterpret_cast<void**>(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift));

-    init_plist(nb->plist[eintLocal]);
+    init_plist(nb->plist[InteractionLocality::Local]);

     /* OpenCL timing disabled if GMX_DISABLE_GPU_TIMING is defined. */
     nb->bDoTime = static_cast<cl_bool>(getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
@@ -673,7 +667,8 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb,
     nbnxn_gpu_create_context(nb->dev_rundata, nb->dev_info, rank);

     /* local/non-local GPU streams */
-    nb->stream[eintLocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+    nb->stream[InteractionLocality::Local] =
+        clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
     if (CL_SUCCESS != cl_error)
     {
         gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
@@ -684,9 +679,10 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb,

     if (nb->bUseTwoStreams)
     {
-        init_plist(nb->plist[eintNonlocal]);
+        init_plist(nb->plist[InteractionLocality::NonLocal]);

-        nb->stream[eintNonlocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+        nb->stream[InteractionLocality::NonLocal] =
+            clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
         if (CL_SUCCESS != cl_error)
         {
             gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
@@ -698,7 +694,6 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb,

     if (nb->bDoTime)
     {
-        init_timers(nb->timers, nb->bUseTwoStreams == CL_TRUE);
         init_timings(nb->timings);
     }

@@ -742,7 +737,7 @@ static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t *nb, int natoms_clear)
     cl_int gmx_used_in_debug cl_error;

     cl_atomdata_t           *atomData = nb->atdat;
-    cl_command_queue         ls       = nb->stream[eintLocal];
+    cl_command_queue         ls       = nb->stream[InteractionLocality::Local];
     cl_float                 value    = 0.0f;

     cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float),
@@
-753,8 +748,8 @@ static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t *nb, int natoms_clear) //! This function is documented in the header file void -nbnxn_gpu_clear_outputs(gmx_nbnxn_ocl_t *nb, - int flags) +gpu_clear_outputs(gmx_nbnxn_ocl_t *nb, + const int flags) { nbnxn_ocl_clear_f(nb, nb->atdat->natoms); /* clear shift force array and energies if the outputs were @@ -766,14 +761,14 @@ nbnxn_gpu_clear_outputs(gmx_nbnxn_ocl_t *nb, /* kick off buffer clearing kernel to ensure concurrency with constraints/update */ cl_int gmx_unused cl_error; - cl_error = clFlush(nb->stream[eintLocal]); + cl_error = clFlush(nb->stream[InteractionLocality::Local]); assert(CL_SUCCESS == cl_error); } //! This function is documented in the header file -void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, - const NbnxnPairlistGpu *h_plist, - int iloc) +void gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, + const NbnxnPairlistGpu *h_plist, + const InteractionLocality iloc) { char sbuf[STRLEN]; // Timing accumulation should happen only if there was work to do @@ -797,10 +792,12 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, } } + gpu_timers_t::Interaction &iTimers = nb->timers->interaction[iloc]; + if (bDoTime) { - nb->timers->pl_h2d[iloc].openTimingRegion(stream); - nb->timers->didPairlistH2D[iloc] = true; + iTimers.pl_h2d.openTimingRegion(stream); + iTimers.didPairlistH2D = true; } // TODO most of this function is same in CUDA and OpenCL, move into the header @@ -810,13 +807,13 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, &d_plist->nsci, &d_plist->sci_nalloc, context); copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream, GpuApiCallBehavior::Async, - bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr); + bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, context); copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream, GpuApiCallBehavior::Async, - bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr); + bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size()*c_nbnxnGpuClusterpairSplit, &d_plist->nimask, &d_plist->imask_nalloc, context); @@ -825,11 +822,11 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, &d_plist->nexcl, &d_plist->excl_nalloc, context); copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream, GpuApiCallBehavior::Async, - bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr); + bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); if (bDoTime) { - nb->timers->pl_h2d[iloc].closeTimingRegion(stream); + iTimers.pl_h2d.closeTimingRegion(stream); } /* need to prune the pair list during the next step */ @@ -837,11 +834,11 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, } //! This function is documented in the header file -void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t *nb, - const nbnxn_atomdata_t *nbatom) +void gpu_upload_shiftvec(gmx_nbnxn_ocl_t *nb, + const nbnxn_atomdata_t *nbatom) { cl_atomdata_t *adat = nb->atdat; - cl_command_queue ls = nb->stream[eintLocal]; + cl_command_queue ls = nb->stream[InteractionLocality::Local]; /* only if we have a dynamic box */ if (nbatom->bDynamicBox || !adat->bShiftVecUploaded) @@ -853,8 +850,8 @@ void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t *nb, } //! 
This function is documented in the header file -void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t *nb, - const nbnxn_atomdata_t *nbat) +void gpu_init_atomdata(gmx_nbnxn_ocl_t *nb, + const nbnxn_atomdata_t *nbat) { cl_int cl_error; int nalloc, natoms; @@ -862,7 +859,7 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t *nb, bool bDoTime = nb->bDoTime == CL_TRUE; cl_timers_t *timers = nb->timers; cl_atomdata_t *d_atdat = nb->atdat; - cl_command_queue ls = nb->stream[eintLocal]; + cl_command_queue ls = nb->stream[InteractionLocality::Local]; natoms = nbat->numAtoms(); realloced = false; @@ -1011,7 +1008,7 @@ static void free_gpu_device_runtime_data(gmx_device_runtime_data_t *runData) } //! This function is documented in the header file -void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb) +void gpu_free(gmx_nbnxn_ocl_t *nb) { if (nb == nullptr) { @@ -1051,7 +1048,7 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb) sfree(nb->nbparam); /* Free plist */ - auto *plist = nb->plist[eintLocal]; + auto *plist = nb->plist[InteractionLocality::Local]; freeDeviceBuffer(&plist->sci); freeDeviceBuffer(&plist->cj4); freeDeviceBuffer(&plist->imask); @@ -1059,7 +1056,7 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb) sfree(plist); if (nb->bUseTwoStreams) { - auto *plist_nl = nb->plist[eintNonlocal]; + auto *plist_nl = nb->plist[InteractionLocality::NonLocal]; freeDeviceBuffer(&plist_nl->sci); freeDeviceBuffer(&plist_nl->cj4); freeDeviceBuffer(&plist_nl->imask); @@ -1078,12 +1075,12 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb) nb->nbst.fshift = nullptr; /* Free command queues */ - clReleaseCommandQueue(nb->stream[eintLocal]); - nb->stream[eintLocal] = nullptr; + clReleaseCommandQueue(nb->stream[InteractionLocality::Local]); + nb->stream[InteractionLocality::Local] = nullptr; if (nb->bUseTwoStreams) { - clReleaseCommandQueue(nb->stream[eintNonlocal]); - nb->stream[eintNonlocal] = nullptr; + clReleaseCommandQueue(nb->stream[InteractionLocality::NonLocal]); + nb->stream[InteractionLocality::NonLocal] = nullptr; } /* Free other events */ if (nb->nonlocal_done) @@ -1112,13 +1109,13 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb) } //! This function is documented in the header file -gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_ocl_t *nb) +gmx_wallclock_gpu_nbnxn_t *gpu_get_timings(gmx_nbnxn_ocl_t *nb) { return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr; } //! This function is documented in the header file -void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv) +void gpu_reset_timings(nonbonded_verlet_t* nbv) { if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime) { @@ -1127,15 +1124,17 @@ void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv) } //! This function is documented in the header file -int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_ocl_t *nb) +int gpu_min_ci_balanced(gmx_nbnxn_ocl_t *nb) { return nb != nullptr ? gpu_min_ci_balanced_factor * nb->dev_info->compute_units : 0; } //! This function is documented in the header file -gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_ocl_t *nb) +gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxn_ocl_t *nb) { return ((nb->nbparam->eeltype == eelOclEWALD_ANA) || (nb->nbparam->eeltype == eelOclEWALD_ANA_TWIN)); } + +} // namespace Nbnxm diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_internal.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_internal.h index 318e72623d..c735ab5d48 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_internal.h +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_internal.h @@ -46,6 +46,9 @@ #ifndef NBNXN_OCL_INTERNAL_H #define NBNXN_OCL_INTERNAL_H +namespace Nbnxm +{ + /*! 
\brief Returns true if LJ combination rules are used in the non-bonded kernels.
 *
 * \param[in] vdwType  The VdW interaction/implementation type as defined by evdwOcl in nbnxn_ocl_types.h.
 *
 */
 bool useLjCombRule(int vdwType);

+} // namespace Nbnxm
+
 #endif /* NBNXN_OCL_INTERNAL_H */
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
index 75279fb3cd..57d945bdcb 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
@@ -51,7 +51,9 @@
 #include "gromacs/gpu_utils/oclutils.h"
 #include "gromacs/mdtypes/interaction_const.h"
 #include "gromacs/nbnxm/gpu_types_common.h"
+#include "gromacs/nbnxm/nbnxm.h"
 #include "gromacs/nbnxm/pairlist.h"
+#include "gromacs/utility/enumerationhelpers.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/real.h"

@@ -243,12 +245,12 @@ typedef struct cl_nbparam_params
 /*! \internal
  * \brief Pair list data.
  */
-using cl_plist_t = gpu_plist;
+using cl_plist_t = Nbnxm::gpu_plist;

 /** \internal
  * \brief Typedef of actual timer type.
  */
-typedef struct nbnxn_gpu_timers_t cl_timers_t;
+typedef struct Nbnxm::gpu_timers_t cl_timers_t;

 /*! \internal
  * \brief Main data structure for OpenCL nonbonded force calculations.
@@ -278,15 +280,15 @@ struct gmx_nbnxn_ocl_t
     cl_kernel kernel_zero_e_fshift;
     ///@}

-    cl_bool bUseTwoStreams;        /**< true if doing both local/non-local NB work on GPU        */
-    cl_bool bNonLocalStreamActive; /**< true indicates that the nonlocal_done event was enqueued */
+    cl_bool bUseTwoStreams;        /**< true if doing both local/non-local NB work on GPU        */
+    cl_bool bNonLocalStreamActive; /**< true indicates that the nonlocal_done event was enqueued */

-    cl_atomdata_t   *atdat;    /**< atom data */
-    cl_nbparam_t    *nbparam;  /**< parameters required for the non-bonded calc. */
-    cl_plist_t      *plist[2]; /**< pair-list data structures (local and non-local) */
-    cl_nb_staging_t  nbst;     /**< staging area where fshift/energies get downloaded */
+    cl_atomdata_t   *atdat;   /**< atom data */
+    cl_nbparam_t    *nbparam; /**< parameters required for the non-bonded calc. */
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, cl_plist_t *> plist; /**< pair-list data structures (local and non-local) */
+    cl_nb_staging_t  nbst;    /**< staging area where fshift/energies get downloaded */

-    cl_command_queue stream[2]; /**< local and non-local GPU queues */
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, cl_command_queue> stream; /**< local and non-local GPU queues */

     /** events used for synchronization */
     cl_event nonlocal_done;    /**< event triggered when the non-local non-bonded kernel
diff --git a/src/gromacs/nbnxm/pairlist.cpp b/src/gromacs/nbnxm/pairlist.cpp
index 17dd8e09fd..16f940a71a 100644
--- a/src/gromacs/nbnxm/pairlist.cpp
+++ b/src/gromacs/nbnxm/pairlist.cpp
@@ -75,6 +75,8 @@

 using namespace gmx;        // TODO: Remove when this file is moved into gmx namespace

+// Convenience alias for partial Nbnxm namespace usage
+using InteractionLocality = Nbnxm::InteractionLocality;

 /* We shift the i-particles backward for PBC.
  * This leads to more conditionals than shifting forward.
@@ -2650,12 +2652,12 @@ static real nonlocal_vol2(const struct gmx_domdec_zones_t *zones, const rvec ls, } /* Estimates the average size of a full j-list for super/sub setup */ -static void get_nsubpair_target(const nbnxn_search *nbs, - int iloc, - real rlist, - int min_ci_balanced, - int *nsubpair_target, - float *nsubpair_tot_est) +static void get_nsubpair_target(const nbnxn_search *nbs, + const InteractionLocality iloc, + const real rlist, + const int min_ci_balanced, + int *nsubpair_target, + float *nsubpair_tot_est) { /* The target value of 36 seems to be the optimum for Kepler. * Maxwell is less sensitive to the exact value. @@ -2702,7 +2704,7 @@ static void get_nsubpair_target(const nbnxn_search *nbs, nonlocal_vol2(nbs->zones, ls, r_eff_sup); } - if (LOCAL_I(iloc)) + if (iloc == InteractionLocality::Local) { /* Sub-cell interacts with itself */ vol_est = ls[XX]*ls[YY]*ls[ZZ]; @@ -4077,15 +4079,15 @@ static void sort_sci(NbnxnPairlistGpu *nbl) } /* Make a local or non-local pair-list, depending on iloc */ -void nbnxn_make_pairlist(nbnxn_search *nbs, - nbnxn_atomdata_t *nbat, - const t_blocka *excl, - real rlist, - int min_ci_balanced, - nbnxn_pairlist_set_t *nbl_list, - int iloc, - int nb_kernel_type, - t_nrnb *nrnb) +void nbnxn_make_pairlist(nbnxn_search *nbs, + nbnxn_atomdata_t *nbat, + const t_blocka *excl, + const real rlist, + const int min_ci_balanced, + nbnxn_pairlist_set_t *nbl_list, + const InteractionLocality iloc, + const int nb_kernel_type, + t_nrnb *nrnb) { int nsubpair_target; float nsubpair_tot_est; @@ -4105,13 +4107,13 @@ void nbnxn_make_pairlist(nbnxn_search *nbs, nbat->bUseBufferFlags = (nbat->out.size() > 1); /* We should re-init the flags before making the first list */ - if (nbat->bUseBufferFlags && LOCAL_I(iloc)) + if (nbat->bUseBufferFlags && iloc == InteractionLocality::Local) { init_buffer_flags(&nbat->buffer_flags, nbat->numAtoms()); } int nzi; - if (LOCAL_I(iloc)) + if (iloc == InteractionLocality::Local) { /* Only zone (grid) 0 vs 0 */ nzi = 1; @@ -4156,7 +4158,7 @@ void nbnxn_make_pairlist(nbnxn_search *nbs, int zj0; int zj1; - if (LOCAL_I(iloc)) + if (iloc == InteractionLocality::Local) { zj0 = 0; zj1 = 1; @@ -4186,7 +4188,7 @@ void nbnxn_make_pairlist(nbnxn_search *nbs, /* With GPU: generate progressively smaller lists for * load balancing for local only or non-local with 2 zones. */ - progBal = (LOCAL_I(iloc) || nbs->zones->n <= 2); + progBal = (iloc == InteractionLocality::Local || nbs->zones->n <= 2); #pragma omp parallel for num_threads(nnbl) schedule(static) for (int th = 0; th < nnbl; th++) @@ -4342,12 +4344,12 @@ void nbnxn_make_pairlist(nbnxn_search *nbs, } /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */ - if (LOCAL_I(iloc)) + if (iloc == InteractionLocality::Local) { nbs->search_count++; } if (nbs->print_cycles && - (!nbs->DomDec || !LOCAL_I(iloc)) && + (!nbs->DomDec || iloc == InteractionLocality::NonLocal) && nbs->search_count % 100 == 0) { nbs_cycle_print(stderr, nbs); diff --git a/src/gromacs/nbnxm/pairlistset.h b/src/gromacs/nbnxm/pairlistset.h index 0e92eb3cde..885e8f07be 100644 --- a/src/gromacs/nbnxm/pairlistset.h +++ b/src/gromacs/nbnxm/pairlistset.h @@ -40,6 +40,8 @@ #include "gromacs/utility/basedefinitions.h" #include "gromacs/utility/real.h" +#include "locality.h" + struct gmx_domdec_zones_t; struct gmx_groups_t; struct nbnxn_atomdata_t; @@ -86,15 +88,15 @@ void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list, * for the number of equally sized lists is below min_ci_balanced. 
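With the macros gone, every locality test in the search code is an ordinary comparison against the enum class, as in iloc == InteractionLocality::Local above; unlike LOCAL_I(), such a comparison cannot be handed an AtomLocality or a bare int. A before/after sketch wrapped in a hypothetical helper to make it self-contained:

    using Nbnxm::InteractionLocality;

    static bool isLocalSearch(const InteractionLocality iloc)
    {
        // Old: return LOCAL_I(iloc);   // iloc was a plain int
        // New: a typed comparison; an AtomLocality argument no longer compiles.
        return iloc == InteractionLocality::Local;
    }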
* With perturbed particles, also a group scheme style nbl_fep list is made. */ -void nbnxn_make_pairlist(nbnxn_search *nbs, - nbnxn_atomdata_t *nbat, - const t_blocka *excl, - real rlist, - int min_ci_balanced, - nbnxn_pairlist_set_t *nbl_list, - int iloc, - int nb_kernel_type, - t_nrnb *nrnb); +void nbnxn_make_pairlist(nbnxn_search *nbs, + nbnxn_atomdata_t *nbat, + const t_blocka *excl, + real rlist, + int min_ci_balanced, + nbnxn_pairlist_set_t *nbl_list, + Nbnxm::InteractionLocality iloc, + int nb_kernel_type, + t_nrnb *nrnb); /*! \brief Prepare the list-set produced by the search for dynamic pruning * diff --git a/src/gromacs/nbnxm/prunekerneldispatch.cpp b/src/gromacs/nbnxm/prunekerneldispatch.cpp index 96a1883701..76938e744d 100644 --- a/src/gromacs/nbnxm/prunekerneldispatch.cpp +++ b/src/gromacs/nbnxm/prunekerneldispatch.cpp @@ -45,9 +45,9 @@ #include "kernels_simd_4xm/kernel_prune.h" -void NbnxnDispatchPruneKernel(nonbonded_verlet_t *nbv, - int ilocality, - const rvec *shift_vec) +void NbnxnDispatchPruneKernel(nonbonded_verlet_t *nbv, + const Nbnxm::InteractionLocality ilocality, + const rvec *shift_vec) { nonbonded_verlet_group_t &nbvg = nbv->grp[ilocality]; nbnxn_pairlist_set_t *nbl_lists = &nbvg.nbl_lists;
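Taken together, the renames mean a caller drives one locality of GPU work entirely through namespace-qualified entry points with typed locality arguments. A representative local-only sequence, assuming nbv, flags, haveOtherWork, e_lj, e_el and fshift are set up as in the mdlib callers above:

    Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat,
                              Nbnxm::AtomLocality::Local, haveOtherWork);
    Nbnxm::gpu_launch_kernel(nbv->gpu_nbv, flags,
                             Nbnxm::InteractionLocality::Local);
    Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat, flags,
                              Nbnxm::AtomLocality::Local, haveOtherWork);
    Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv, flags,
                                Nbnxm::AtomLocality::Local, haveOtherWork,
                                e_lj, e_el, fshift);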